Merge pull request #1 from facebook/master

merge from facebook
11 years ago · d1cafc0892
parent 7e9f28cb23 6d6305dd7d
commit d1cafc0892
539 changed files with 62527 additions and 18653 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,5 @@
 TARGETS
-build_config.mk
+make_config.mk

 *.a
 *.arc
@ -20,6 +20,7 @@ build_config.mk
 *.d-e
 *.o-*
 *.swp
+*~

 ldb
 manifest_dump
@ -28,8 +29,22 @@ util/build_version.cc
 build_tools/VALGRIND_LOGS/
 coverage/COVERAGE_REPORT
 .gdbhistory
+package/
 .phutil_module_cache
+unity
 tags
+
+java/out
+java/target
+java/test-libs
 java/*.log
 java/include/org_rocksdb_*.h
+
+.idea/
+*.iml
+
 unity.cc
+java/crossbuild/.vagrant
+.vagrant/
+java/**.asc
+java/javadoc
--- a/.travis.yml
+++ b/.travis.yml
@ -14,7 +14,6 @@ before_install:
 - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
 # Lousy hack to disable use and testing of fallocate, which doesn't behave quite
 # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
- - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
-script: make check -j8
+script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check
 notifications:
    email: false
--- a/11
+++ b/11
@ -0,0 +1,11 @@
+Facebook Inc.
+Facebook Engineering Team
+
+Google Inc.
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
+
+# Partial list of contributors:
+Kevin Regan <kevin.d.regan@gmail.com>
+Johan Bilien <jobi@litl.com>
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,6 +1,78 @@
 # Rocksdb Change Log

-### Unreleased
+### Unreleased Features
+* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted
+* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command.
+* We now allow level-compaction to place files in different paths by
+  specifying them in db_paths along with the target_size.
+  Lower numbered levels will be placed earlier in the db_paths and higher
+  numbered levels will be placed later in the db_paths vector.
+* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000)
+* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB.
+* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions.
+* GetThreadStatus() is now able to report compaction activity.
+* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv().
+* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash.
+* Block based table now makes use of prefix bloom filter if it is a full fulter.
+
+### Public API changes
+* Deprecated skip_log_error_on_recovery option
+* Logger method logv with log level parameter is now virtual
+
+### 3.9.0 (12/8/2014)
+
+### New Features
+* Add rocksdb::GetThreadList(), which in the future will return the current status of all
+  rocksdb-related threads.  We will have more code instruments in the following RocksDB
+  releases.
+* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean.
+  Also add support for nested options in convert function
+
+### Public API changes
+* New API to create a checkpoint added. Given a directory name, creates a new
+  database which is an image of the existing database.
+* New API LinkFile added to Env. If you implement your own Env class, an
+  implementation of the API LinkFile will have to be provided.
+* MemTableRep takes MemTableAllocator instead of Arena
+
+### Improvements
+* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag.
+
+## 3.8.0 (11/14/2014)
+
+### Public API changes
+* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on.
+* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups.
+* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups.
+
+### Cleanup
+* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor)
+
+### New features
+* CompactFiles and EventListener, although they are still in experimental state
+* Full ColumnFamily support in RocksJava.
+
+## 3.7.0 (11/6/2014)
+### Public API changes
+* Introduce SetOptions() API to allow adjusting a subset of options dynamically online
+* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString()
+* Remove WriteBatchWithIndex.Delete() overloads using SliceParts
+* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it.
+
+## 3.6.0 (10/7/2014)
+### Disk format changes
+* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy
+
+### Behavior changes
+* We have refactored our system of stalling writes.  Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
+* When disableDataSync=true, we no longer sync the MANIFEST file.
+* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
+
+### Public API changes
+* Change target_file_size_base type to uint64_t from int.
+* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on.
+
+## 3.5.0 (9/3/2014)
 ### New Features
 * Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it.
 * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
@ -11,15 +83,12 @@
 ### Public API changes
 * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.

-
----- Past Releases -----
-
-
 ## 3.4.0 (8/18/2014)
 ### New Features
 * Support Multiple DB paths in universal style compactions
 * Add feature of storing plain table index and bloom filter in SST file.
 * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
+* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries.

 ### Public API changes
 * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size
--- a/INSTALL.md
+++ b/INSTALL.md
@ -2,7 +2,7 @@

 RocksDB's library should be able to compile without any dependency installed,
 although we recommend installing some compression libraries (see below).
-We do depend on newer gcc with C++11 support.
+We do depend on newer gcc/clang with C++11 support.

 There are few options when compiling RocksDB:

@ -15,6 +15,10 @@ There are few options when compiling RocksDB:
 * `make all` will compile our static library, and all our tools and unit tests. Our tools
 depend on gflags. You will need to have gflags installed to run `make all`.

+* By default the binary we produce is optimized for the platform you're compiling on
+(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before
+your make commands, like this: `PORTABLE=1 make static_lib`
+
 ## Dependencies

 * You can link RocksDB with following compression libraries:
@ -72,13 +76,7 @@ depend on gflags. You will need to have gflags installed to run `make all`.
        * Install via [homebrew](http://brew.sh/).
            * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
            * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
-    * Install zlib, bzip2 and snappy libraries for compression.
-    * Install gflags. We have included a script
-    `build_tools/mac-install-gflags.sh`, which should automatically install it (execute this file instead of runing using "source" command).
-    If you installed gflags by other means (for example, `brew install gflags`),
-    please set `LIBRARY_PATH` and `CPATH` accordingly.
-    * Please note that some of the optimizations/features are disabled in OSX.
-    We did not run any production workloads on it.
+    * run `brew install rocksdb`

 * **iOS**:
-  * Run: `TARGET_OS=IOS make static_lib`
+  * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
--- a/296
+++ b/296
@ -3,12 +3,19 @@
 # found in the LICENSE file. See the AUTHORS file for names of contributors.

 # Inherit some settings from environment variables, if available
-INSTALL_PATH ?= $(CURDIR)

 #-----------------------------------------------

+CFLAGS += ${EXTRA_CFLAGS}
+CXXFLAGS += ${EXTRA_CXXFLAGS}
+LDFLAGS += $(EXTRA_LDFLAGS)
+MACHINE ?= $(shell uname -m)
+
 ifneq ($(MAKECMDGOALS),dbg)
-OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+OPT += -O2 -fno-omit-frame-pointer
+ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer
+OPT += -momit-leaf-frame-pointer
+endif
 else
 # intentionally left blank
 endif
@ -24,9 +31,9 @@ endif
 #-----------------------------------------------

 # detect what platform we're building on
-$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk"))
+$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
 # this file is generated by the previous line to set build flags and sources
-include build_config.mk
+include make_config.mk

 ifneq ($(PLATFORM), IOS)
 CFLAGS += -g
@ -36,35 +43,71 @@ else
 OPT += -DNDEBUG
 endif

+ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
+	# found
+	CFLAGS += -fno-exceptions
+	CXXFLAGS += -fno-exceptions
+endif
+
 # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
 ifdef COMPILE_WITH_ASAN
-	# ASAN compile flags
+	DISABLE_JEMALLOC=1
 	EXEC_LDFLAGS += -fsanitize=address
 	PLATFORM_CCFLAGS += -fsanitize=address
 	PLATFORM_CXXFLAGS += -fsanitize=address
-else
-	# if we're not compiling with ASAN, use jemalloc
+endif
+
+# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc.
+ifdef COMPILE_WITH_TSAN
+	DISABLE_JEMALLOC=1
+	EXEC_LDFLAGS += -fsanitize=thread -pie
+	PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
+	PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
+endif
+
+ifndef DISABLE_JEMALLOC
 	EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
 	PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
 	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
 endif

-WARNING_FLAGS = -Wall -Werror -Wsign-compare
+#-------------------------------------------------
+# make install related stuff
+INSTALL_PATH ?= /usr/local
+
+uninstall:
+	@rm -rf $(INSTALL_PATH)/include/rocksdb
+	@rm -rf $(INSTALL_PATH)/lib/$(LIBRARY)
+	@rm -rf $(INSTALL_PATH)/lib/$(SHARED)
+
+install:
+	@install -d $(INSTALL_PATH)/lib
+	@for header_dir in `find "include/rocksdb" -type d`; do \
+		install -d $(INSTALL_PATH)/$$header_dir; \
+	done
+	@for header in `find "include/rocksdb" -type f -name *.h`; do \
+		install -C -m 644 $$header $(INSTALL_PATH)/$$header; \
+	done
+	@[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib
+	@[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib
+#-------------------------------------------------
+
+WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow
 CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
-CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
+CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor

 LDFLAGS += $(PLATFORM_LDFLAGS)

 LIBOBJECTS = $(SOURCES:.cc=.o)
-LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
-MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o)

 TESTUTIL = ./util/testutil.o
-TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS)
 BENCHHARNESS = ./util/benchharness.o
 VALGRIND_ERROR = 2
 VALGRIND_DIR = build_tools/VALGRIND_LOGS
 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+
 VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full

 TESTS = \
@ -85,22 +128,26 @@ TESTS = \
 	coding_test \
 	corruption_test \
 	crc32c_test \
+	slice_transform_test \
 	dbformat_test \
 	env_test \
-	blob_store_test \
+	fault_injection_test \
 	filelock_test \
 	filename_test \
-	filter_block_test \
+	block_based_filter_block_test \
+	full_filter_block_test \
 	histogram_test \
 	log_test \
 	manual_compaction_test \
 	memenv_test \
+	mock_env_test \
 	merge_test \
+	merger_test \
 	redis_test \
 	reduce_levels_test \
 	plain_table_db_test \
+	comparator_db_test \
 	prefix_test \
-	simple_table_db_test \
 	skiplist_test \
 	stringappend_test \
 	ttl_test \
@ -110,8 +157,11 @@ TESTS = \
 	spatial_db_test \
 	version_edit_test \
 	version_set_test \
+	compaction_picker_test \
+	version_builder_test \
 	file_indexer_test \
-	write_batch_test\
+	write_batch_test \
+	write_controller_test\
 	deletefile_test \
 	table_test \
 	thread_local_test \
@ -121,7 +171,14 @@ TESTS = \
 	cuckoo_table_builder_test \
 	cuckoo_table_reader_test \
 	cuckoo_table_db_test \
-	write_batch_with_index_test
+	flush_job_test \
+	wal_manager_test \
+	listener_test \
+	compaction_job_test \
+	thread_list_test \
+	sst_dump_test
+
+SUBSET :=  $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/)

 TOOLS = \
        sst_dump \
@ -130,9 +187,8 @@ TOOLS = \
        ldb \
 	db_repl_stress \
 	options_test \
-	blob_store_bench

-PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
+PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test memtablerep_bench $(TOOLS)

 # The library name is configurable since we are maintaining libraries of both
 # debug/release mode.
@ -140,7 +196,10 @@ ifeq ($(LIBNAME),)
        LIBNAME=librocksdb
 endif
 LIBRARY = ${LIBNAME}.a
-MEMENVLIBRARY = libmemenv.a
+
+ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)

 default: all

@ -153,29 +212,33 @@ ifneq ($(PLATFORM_SHARED_VERSIONED),true)
 SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
 SHARED2 = $(SHARED1)
 SHARED3 = $(SHARED1)
+SHARED4 = $(SHARED1)
 SHARED = $(SHARED1)
 else
-# Update db.h if you change these.
-SHARED_MAJOR = 3
-SHARED_MINOR = 4
+SHARED_MAJOR = $(ROCKSDB_MAJOR)
+SHARED_MINOR = $(ROCKSDB_MINOR)
+SHARED_PATCH = $(ROCKSDB_PATCH)
 SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
 SHARED2 = $(SHARED1).$(SHARED_MAJOR)
 SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
-SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
-$(SHARED1): $(SHARED3)
-	ln -fs $(SHARED3) $(SHARED1)
-$(SHARED2): $(SHARED3)
-	ln -fs $(SHARED3) $(SHARED2)
+SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
+$(SHARED1): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED1)
+$(SHARED2): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED2)
+$(SHARED3): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED3)
 endif

-$(SHARED3):
+$(SHARED4):
 	$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@

 endif  # PLATFORM_SHARED_EXT

-.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \
 	release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
-	dbg
+	dbg rocksdbjavastatic rocksdbjava install uninstall analyze

 all: $(LIBRARY) $(PROGRAMS) $(TESTS)

@ -201,6 +264,10 @@ check: $(TESTS) ldb
 	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
 	python tools/ldb_test.py

+check_some: $(SUBSET) ldb
+	for t in $(SUBSET); do echo "***** Running $$t"; ./$$t || exit 1; done
+	python tools/ldb_test.py
+
 ldb_tests: ldb
 	python tools/ldb_test.py

@ -236,6 +303,10 @@ valgrind_check: all $(PROGRAMS) $(TESTS)
 		echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
 	done

+analyze:
+	$(MAKE) clean
+	$(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) -o $(CURDIR)/scan_build_report $(MAKE) all -j32
+
 unity.cc:
 	$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/unity" "$(CURDIR)/unity.cc"))

@ -243,10 +314,11 @@ unity: unity.cc unity.o
 	$(CXX) unity.o $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 clean:
-	-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk unity.cc
+	-rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) make_config.mk unity.cc
 	-rm -rf ios-x86/* ios-arm/*
-	-find . -name "*.[od]" -exec rm {} \;
+	-find . -name "*.[oda]" -exec rm {} \;
 	-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+	-rm -rf bzip2* snappy* zlib*
 tags:
 	ctags * -R
 	cscope -b `find . -name '*.cc'` `find . -name '*.h'`
@ -254,6 +326,9 @@ tags:
 format:
 	build_tools/format-diff.sh

+package:
+	bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
+
 # ---------------------------------------------------------------------------
 # 	Unit tests and tools
 # ---------------------------------------------------------------------------
@ -264,6 +339,12 @@ $(LIBRARY): $(LIBOBJECTS)
 db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
 	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)

+cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	  $(CXX) db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
 block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	 $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -276,9 +357,6 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
 db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
 	$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)

-blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
-	$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
-
 signal_test: util/signal_test.o $(LIBOBJECTS)
 	$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -309,9 +387,6 @@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
 coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
-	$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
-
 stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -333,6 +408,10 @@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
 crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+
 db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -345,8 +424,8 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
 plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg
@ -378,20 +457,35 @@ ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
 write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)

+flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
 dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -411,6 +505,12 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
 version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -420,9 +520,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
 write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

@ -438,15 +544,26 @@ cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTH
 cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
+listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
-	rm -f $@
-	$(AR) -rs $@ $(MEMENVOBJECTS)
+sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -475,24 +592,83 @@ ldb: tools/ldb.o $(LIBOBJECTS)

 JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc
 JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
-ROCKSDBJNILIB = librocksdbjni.so
-ROCKSDB_JAR = rocksdbjni.jar
+ARCH := $(shell getconf LONG_BIT)
+ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar
+ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
+ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar
+ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar

 ifeq ($(PLATFORM), OS_MACOSX)
-ROCKSDBJNILIB = librocksdbjni.jnilib
-JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+ROCKSDBJNILIB = librocksdbjni-osx.jnilib
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar
+ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","")
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin
+else
+	JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
 endif

+libz.a:
+	-rm -rf zlib-1.2.8
+	curl -O http://zlib.net/zlib-1.2.8.tar.gz
+	tar xvzf zlib-1.2.8.tar.gz
+	cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make
+	cp zlib-1.2.8/libz.a .
+
+libbz2.a:
+	-rm -rf bzip2-1.0.6
+	curl -O  http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
+	tar xvzf bzip2-1.0.6.tar.gz
+	cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64'
+	cp bzip2-1.0.6/libbz2.a .
+
+libsnappy.a:
+	-rm -rf snappy-1.1.1
+	curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
+	tar xvzf snappy-1.1.1.tar.gz
+	cd snappy-1.1.1 && ./configure --with-pic --enable-static
+	cd snappy-1.1.1 && make
+	cp snappy-1.1.1/.libs/libsnappy.a .
+
+
+rocksdbjavastatic: libz.a libbz2.a libsnappy.a
+	OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j
+	cd java;$(MAKE) javalib;
+	rm -f ./java/target/$(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a
+	cd java/target;strip -S -x $(ROCKSDBJNILIB)
+	cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) *
+	cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
+
+rocksdbjavastaticrelease: rocksdbjavastatic
+	cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64
+	cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
+	cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
+	cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
+
+rocksdbjavastaticpublish: rocksdbjavastaticrelease
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
+
 rocksdbjava:
 	OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32
-	cd java;$(MAKE) java;
-	rm -f ./java/$(ROCKSDBJNILIB)
-	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
-	cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB)
+	cd java;$(MAKE) javalib;
+	rm -f ./java/target/$(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
+	cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class

 jclean:
 	cd java;$(MAKE) clean;
-	rm -f $(ROCKSDBJNILIB)

 jtest:
 	cd java;$(MAKE) sample;$(MAKE) test;
@ -565,8 +741,12 @@ ifneq ($(MAKECMDGOALS),clean)
 ifneq ($(MAKECMDGOALS),format)
 ifneq ($(MAKECMDGOALS),jclean)
 ifneq ($(MAKECMDGOALS),jtest)
+ifneq ($(MAKECMDGOALS),package)
+ifneq ($(MAKECMDGOALS),analyze)
 -include $(DEPFILES)
 endif
 endif
 endif
 endif
+endif
+endif
--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)

 RocksDB is developed and maintained by Facebook Database Engineering Team.
-It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
+It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
 and Jeff Dean (jeff@google.com)

 This code is a library that forms the core building block for a fast
--- a/16
+++ b/16
@ -0,0 +1,16 @@
+Vagrant.configure("2") do |config|
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 4096
+    v.cpus = 2
+  end
+
+  config.vm.define "ubuntu14" do |box|
+    box.vm.box = "ubuntu/trusty64"
+  end
+
+  config.vm.define "centos65" do |box|
+    box.vm.box = "chef/centos-6.5"
+  end
+
+end
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@ -32,7 +32,7 @@
 #   2. Once install, add the include path/lib path for gflags to CPATH and
 #      LIBRARY_PATH respectively. If installed with default mode, the
 #      lib and include path will be /usr/local/lib and /usr/local/include
-# Mac user can do this by running build_tools/mac-install-gflags.sh
+# Mac user can do this by having brew installed and running brew install gflags

 OUTPUT=$1
 if test -z "$OUTPUT"; then
@ -46,18 +46,15 @@ PLATFORM_CXXFLAGS="-std=c++11"
 COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"

 # Default to fbcode gcc on internal fb machines
-if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
    FBCODE_BUILD="true"
-    if [ -z "$USE_CLANG" ]; then
-        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
-          $(rpm -q --whatprovides redhat-release)`
-        if [ "$CENTOS_VERSION" = "6" ]; then
-          source "$PWD/build_tools/fbcode.gcc481.sh"
-        else
-          source "$PWD/build_tools/fbcode.gcc471.sh"
-        fi
+    # If we're compiling with TSAN we need pic build
+    PIC_BUILD=$COMPILE_WITH_TSAN
+    if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
+      source "$PWD/build_tools/fbcode_config.sh"
    else
-        source "$PWD/build_tools/fbcode.clang31.sh"
+      # we need this to build with MySQL. Don't use for other purposes.
+      source "$PWD/build_tools/fbcode_config4.8.1.sh"
    fi
 fi

@ -78,6 +75,14 @@ if test -z "$TARGET_OS"; then
    TARGET_OS=`uname -s`
 fi

+if test -z "$CLANG_SCAN_BUILD"; then
+    CLANG_SCAN_BUILD=scan-build
+fi
+
+if test -z "$CLANG_ANALYZER"; then  
+    CLANG_ANALYZER=$(which clang++)
+fi
+
 COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
 CROSS_COMPILE=
 PLATFORM_CCFLAGS=
@ -164,7 +169,7 @@ if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then
  "$PWD/build_tools/build_detect_version"
 fi

-# We want to make a list of all cc files within util, db, table, and helpers
+# We want to make a list of all cc files within util, db and table
 # except for the test and benchmark files. By default, find will output a list
 # of all files matching either rule, so we need to append -print to make the
 # prune take effect.
@ -173,36 +178,27 @@ DIRS="util db table utilities"
 set -f # temporarily disable globbing so that our patterns arent expanded
 PRUNE_TEST="-name *test*.cc -prune"
 PRUNE_BENCH="-name *bench*.cc -prune"
-PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
-PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
+PRUNE_MOCK="-name *mock*.cc -prune"
+PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "`
+MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | grep -v "test" | sort | tr "\n" " "`
 set +f # re-enable globbing

 # The sources consist of the portable files, plus the platform-specific port
 # file.
 echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT"
-echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT"
-echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT"
+echo "MOCK_SOURCES=$MOCK_SOURCES" >> "$OUTPUT"

 if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
    # Cross-compiling; do not try any compilation tests.
    # Also don't need any compilation tests if compiling on fbcode
    true
 else
-    # If -std=c++0x works, use <atomic>.  Otherwise use port_posix.h.
-    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
-      #include <atomic>
-      int main() {}
-EOF
-    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT"
-    fi
-
    # Test whether fallocate is available
    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
      #include <fcntl.h>
      int main() {
 	int fd = open("/dev/null", 0);
-	fallocate(fd, 0, 0, 1024);
+  fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024);
      }
 EOF
    if [ "$?" = 0 ]; then
@ -302,6 +298,14 @@ EOF
    fi
 fi

+# Test whether -Wshorten-64-to-32 is available
+$CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null  <<EOF
+  int main() {}
+EOF
+if [ "$?" = 0 ]; then
+    COMMON_FLAGS="$COMMON_FLAGS -Wshorten-64-to-32"
+fi
+
 # shall we use HDFS?

 if test "$USE_HDFS"; then
@ -318,14 +322,22 @@ if test "$USE_HDFS"; then
  JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS"
 fi

-# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
-COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
+if test "$USE_SSE"; then
+  # if Intel SSE instruction set is supported, set USE_SSE=1
+  COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 "
+elif test -z "$PORTABLE"; then
+  COMMON_FLAGS="$COMMON_FLAGS -march=native "
+fi

 PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
 PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"

 VALGRIND_VER="$VALGRIND_VER"

+ROCKSDB_MAJOR=`build_tools/version.sh major`
+ROCKSDB_MINOR=`build_tools/version.sh minor`
+ROCKSDB_PATCH=`build_tools/version.sh patch`
+
 echo "CC=$CC" >> "$OUTPUT"
 echo "CXX=$CXX" >> "$OUTPUT"
 echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
@ -341,3 +353,8 @@ echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
 echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
 echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
 echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
+echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT"
+echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT"
+echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT"
+echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT"
+echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT"
--- a/build_tools/fbcode.clang31.sh
+++ b/build_tools/fbcode.clang31.sh
@ -1,74 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile leveldb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
-TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
-TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
-GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES"
-CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib "
-CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux "
-CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include "
-CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CXXFLAGS="$CFLAGS -nostdinc++"
-
-CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
-
-EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
-EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED 
--- a/build_tools/fbcode.gcc471.sh
+++ b/build_tools/fbcode.gcc471.sh
@ -1,70 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile leveldb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
-TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
-TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
-
-# location of libhdfs libraries
-if test "$USE_HDFS"; then
-  JAVA_HOME="/usr/local/jdk-6u22-64"
-  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
-  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
-  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
-  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
-  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
-fi
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
-CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
-CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2"
-
-EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"
-
-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER
--- a/build_tools/fbcode.gcc481.sh
+++ b/build_tools/fbcode.gcc481.sh
@ -1,86 +0,0 @@
-#!/bin/sh
-#
-# Set environment variables so that we can compile rocksdb using
-# fbcode settings.  It uses the latest g++ compiler and also
-# uses jemalloc
-
-TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
-CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
-if [ "$CENTOS_VERSION" = "6" ]; then
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
-else
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-fi
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
-
-# location of libhdfs libraries
-if test "$USE_HDFS"; then
-  JAVA_HOME="/usr/local/jdk-6u22-64"
-  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
-  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
-  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
-  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
-  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
-fi
-
-# location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
-
-# location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
-
-# location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
-
-# location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
-
-# location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
-
-LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
-LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
-LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
-
-# location of jemalloc
-JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
-JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
-
-# location of numa
-NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
-NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
-NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
-
-# use Intel SSE support for checksum calculations
-export USE_SSE=" -msse -msse4.2 "
-
-CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
-
-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
-CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
-CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
-
-EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
-
-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
-
-export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@ -0,0 +1,125 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ and clang compilers and also
+# uses jemalloc
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
+
+CFLAGS=""
+
+# location of libgcc
+LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e"
+LIBGCC_INCLUDE="$LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
+
+# location of glibc
+GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720
+GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include"
+GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/"
+if test -z $PIC_BUILD; then
+  SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a"
+else
+  SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a"
+fi
+CFLAGS+=" -DSNAPPY"
+
+if test -z $PIC_BUILD; then
+  # location of zlib headers and libraries
+  ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/"
+  ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a"
+  CFLAGS+=" -DZLIB"
+
+  # location of bzip headers and libraries
+  BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/"
+  BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a"
+  CFLAGS+=" -DBZIP2"
+
+  LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/"
+  LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a"
+  CFLAGS+=" -DLZ4"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/"
+if test -z $PIC_BUILD; then
+  GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a"
+else
+  GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a"
+fi
+CFLAGS+=" -DGFLAGS=google"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/"
+JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a"
+
+if test -z $PIC_BUILD; then
+  # location of numa
+  NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/"
+  NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a"
+  CFLAGS+=" -DNUMA"
+
+  # location of libunwind
+  LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a"
+fi
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=1
+
+BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
+
+GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4"
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev"
+CLANG_ANALYZER="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++"
+CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  
+  CFLAGS+=" -B$BINUTILS/gold"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+else
+  # clang 
+  CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/"
+  CC="$CLANG_BASE/centos6-native/af4b1a0/bin/clang"
+  CXX="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++"
+
+  KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include"
+
+  CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/"
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
--- a/build_tools/fbcode_config4.8.1.sh
+++ b/build_tools/fbcode_config4.8.1.sh
@ -0,0 +1,105 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+# location of libgcc
+LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc"
+LIBGCC_INCLUDE="$LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
+
+# location of glibc
+GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa
+GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include"
+GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include"
+SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include"
+ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a"
+
+LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
+LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
+LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include"
+JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a"
+
+# location of numa
+NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
+NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
+
+# location of libunwind
+LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc
+LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=1
+
+BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
+
+GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc"
+STDLIBS="-L $GCC_BASE/lib64"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  
+  CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+else
+  # clang 
+  CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4"
+  CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include"
+  CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang"
+  CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++"
+
+  KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/"
+
+  CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT"
+CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+
+VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0
+VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/"
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
--- a/build_tools/mac-install-gflags.sh
+++ b/build_tools/mac-install-gflags.sh
@ -1,25 +0,0 @@
-#!/bin/sh
-# Install gflags for mac developers.
-
-set -e
-
-DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX`
-
-cd $DIR
-wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz
-tar xvfz gflags-2.0.tar.gz
-cd gflags-2.0
-
-./configure
-make
-make install
-
-# Add include/lib path for g++
-echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile
-echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile
-
-echo ""
-echo "-----------------------------------------------------------------------------"
-echo "|                         Installation Completed                            |"
-echo "-----------------------------------------------------------------------------"
-echo "Please run \`. ~/.bash_profile\` to be able to compile with gflags"
--- a/build_tools/make_package.sh
+++ b/build_tools/make_package.sh
@ -0,0 +1,116 @@
+#/usr/bin/env bash
+
+set -e
+
+function log() {
+  echo "[+] $1"
+}
+
+function fatal() {
+  echo "[!] $1"
+  exit 1
+}
+
+function platform() {
+  local  __resultvar=$1
+  if [[ -f "/etc/yum.conf" ]]; then
+    eval $__resultvar="centos"
+  elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then
+    eval $__resultvar="ubuntu"
+  else
+    fatal "Unknwon operating system"
+  fi
+}
+platform OS
+
+function package() {
+  if [[ $OS = "ubuntu" ]]; then
+    if dpkg --get-selections | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      apt-get install $@ -y
+    fi
+  elif [[ $OS = "centos" ]]; then
+    if rpm -qa | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      yum install $@ -y
+    fi
+  fi
+}
+
+function detect_fpm_output() {
+  if [[ $OS = "ubuntu" ]]; then
+    export FPM_OUTPUT=deb
+  elif [[ $OS = "centos" ]]; then
+    export FPM_OUTPUT=rpm
+  fi
+}
+detect_fpm_output
+
+function gem_install() {
+  if gem list | grep --quiet $1; then
+    log "$1 is already installed. skipping."
+  else
+    gem install $@
+  fi
+}
+
+function main() {
+  if [[ $# -ne 1 ]]; then
+    fatal "Usage: $0 <rocksdb_version>"
+  else
+    log "using rocksdb version: $1"
+  fi
+
+  if [[ -d /vagrant ]]; then
+    if [[ $OS = "ubuntu" ]]; then
+      package g++-4.7
+      export CXX=g++-4.7
+
+      # the deb would depend on libgflags2, but the static lib is the only thing
+      # installed by make install
+      package libgflags-dev
+
+      package ruby-all-dev
+    elif [[ $OS = "centos" ]]; then
+      pushd /etc/yum.repos.d
+      if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then
+        wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo
+      fi
+      package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6
+      package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6
+      export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc
+      export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp
+      export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++
+      export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin
+      popd
+      if ! rpm -qa | grep --quiet gflags; then
+        rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm
+      fi
+
+      package ruby
+      package ruby-devel
+      package rubygems
+      package rpm-build
+    fi
+  fi
+  gem_install fpm
+
+  make static_lib
+  make install INSTALL_PATH=package
+  fpm \
+    -s dir \
+    -t $FPM_OUTPUT \
+    -n rocksdb \
+    -v $1 \
+    --prefix /usr \
+    --url http://rocksdb.org/ \
+    -m rocksdb@fb.com \
+    --license BSD \
+    --vendor Facebook \
+    --description "RocksDB is an embeddable persistent key-value store for fast storage." \
+    package
+}
+
+main $@
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \
    --threads=32 \
    --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram

+# measure fillseq with bunch of column families
+./db_bench \
+    --benchmarks=fillseq \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$NUM \
+    --writes=$NUM \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq_lots_column_families
+
+# measure overwrite performance with bunch of column families
+./db_bench \
+    --benchmarks=overwrite \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite_lots_column_families

 # send data to ods
 function send_to_ods {
@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr
 send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
 send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
 send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
+send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
+send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families
--- a/build_tools/unity
+++ b/build_tools/unity
@ -54,7 +54,7 @@ case "$TARGET_OS" in
        exit 1
 esac

-# We want to make a list of all cc files within util, db, table, and helpers
+# We want to make a list of all cc files within util, db and table
 # except for the test and benchmark files. By default, find will output a list
 # of all files matching either rule, so we need to append -print to make the
 # prune take effect.
--- a/build_tools/version.sh
+++ b/build_tools/version.sh
@ -0,0 +1,14 @@
+#!/bin/sh
+if [ "$#" = "0" ]; then
+  echo "Usage: $0 major|minor|patch"
+  exit 1
+fi
+if [ "$1" = "major" ]; then
+  cat include/rocksdb/version.h  | grep MAJOR | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "minor" ]; then
+  cat include/rocksdb/version.h  | grep MINOR | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "patch" ]; then
+  cat include/rocksdb/version.h  | grep PATCH | head -n1 | awk '{print $3}'
+fi
--- a/coverage/coverage_test.sh
+++ b/coverage/coverage_test.sh
@ -11,8 +11,8 @@ fi
 ROOT=".."
 # Fetch right version of gcov
 if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
-  source $ROOT/build_tools/fbcode.gcc471.sh
-  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
+  source $ROOT/build_tools/fbcode_config.sh
+  GCOV=$GCC_BASE/bin/gcov
 else
  GCOV=$(which gcov)
 fi
--- a/db/builder.cc
+++ b/db/builder.cc
@ -26,21 +26,24 @@ namespace rocksdb {

 class TableFactory;

-TableBuilder* NewTableBuilder(const Options& options,
+TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions,
                              const InternalKeyComparator& internal_comparator,
                              WritableFile* file,
-                              CompressionType compression_type) {
-  return options.table_factory->NewTableBuilder(options, internal_comparator,
-                                                file, compression_type);
+                              const CompressionType compression_type,
+                              const CompressionOptions& compression_opts) {
+  return ioptions.table_factory->NewTableBuilder(
+      ioptions, internal_comparator, file, compression_type, compression_opts);
 }

-Status BuildTable(const std::string& dbname, Env* env, const Options& options,
-                  const EnvOptions& soptions, TableCache* table_cache,
+Status BuildTable(const std::string& dbname, Env* env,
+                  const ImmutableCFOptions& ioptions,
+                  const EnvOptions& env_options, TableCache* table_cache,
                  Iterator* iter, FileMetaData* meta,
                  const InternalKeyComparator& internal_comparator,
                  const SequenceNumber newest_snapshot,
                  const SequenceNumber earliest_seqno_in_memtable,
                  const CompressionType compression,
+                  const CompressionOptions& compression_opts,
                  const Env::IOPriority io_priority) {
  Status s;
  meta->fd.file_size = 0;
@ -50,33 +53,36 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
  // If the sequence number of the smallest entry in the memtable is
  // smaller than the most recent snapshot, then we do not trigger
  // removal of duplicate/deleted keys as part of this builder.
-  bool purge = options.purge_redundant_kvs_while_flush;
+  bool purge = ioptions.purge_redundant_kvs_while_flush;
  if (earliest_seqno_in_memtable <= newest_snapshot) {
    purge = false;
  }

-  std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(),
+  std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
                                    meta->fd.GetPathId());
  if (iter->Valid()) {
    unique_ptr<WritableFile> file;
-    s = env->NewWritableFile(fname, &file, soptions);
+    s = env->NewWritableFile(fname, &file, env_options);
    if (!s.ok()) {
      return s;
    }
    file->SetIOPriority(io_priority);

-    TableBuilder* builder =
-        NewTableBuilder(options, internal_comparator, file.get(), compression);
+    TableBuilder* builder = NewTableBuilder(
+        ioptions, internal_comparator, file.get(),
+        compression, compression_opts);

+    {
      // the first key is the smallest key
      Slice key = iter->key();
      meta->smallest.DecodeFrom(key);
      meta->smallest_seqno = GetInternalKeySeqno(key);
      meta->largest_seqno = meta->smallest_seqno;
+    }

    MergeHelper merge(internal_comparator.user_comparator(),
-                      options.merge_operator.get(), options.info_log.get(),
-                      options.min_partial_merge_operands,
+                      ioptions.merge_operator, ioptions.info_log,
+                      ioptions.min_partial_merge_operands,
                      true /* internal key corruption is not ok */);

    if (purge) {
@ -196,12 +202,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
    delete builder;

    // Finish and check for file errors
-    if (s.ok() && !options.disableDataSync) {
-      if (options.use_fsync) {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+    if (s.ok() && !ioptions.disable_data_sync) {
+      if (ioptions.use_fsync) {
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
        s = file->Fsync();
      } else {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
        s = file->Sync();
      }
    }
@ -211,7 +217,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,

    if (s.ok()) {
      // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
+      Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
                                              internal_comparator, meta->fd);
      s = it->status();
      delete it;
--- a/db/builder.h
+++ b/db/builder.h
@ -11,6 +11,7 @@
 #include "rocksdb/status.h"
 #include "rocksdb/types.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"

 namespace rocksdb {

@ -26,8 +27,10 @@ class TableBuilder;
 class WritableFile;

 extern TableBuilder* NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type);
+    const ImmutableCFOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts);

 // Build a Table file from the contents of *iter.  The generated file
 // will be named according to number specified in meta. On success, the rest of
@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder(
 // If no data is present in *iter, meta->file_size will be set to
 // zero, and no Table file will be produced.
 extern Status BuildTable(const std::string& dbname, Env* env,
-                         const Options& options, const EnvOptions& soptions,
+                         const ImmutableCFOptions& options,
+                         const EnvOptions& env_options,
                         TableCache* table_cache, Iterator* iter,
                         FileMetaData* meta,
                         const InternalKeyComparator& internal_comparator,
                         const SequenceNumber newest_snapshot,
                         const SequenceNumber earliest_seqno_in_memtable,
                         const CompressionType compression,
+                         const CompressionOptions& compression_opts,
                         const Env::IOPriority io_priority = Env::IO_HIGH);

 }  // namespace rocksdb
--- a/db/c.cc
+++ b/db/c.cc
@ -29,6 +29,7 @@
 #include "rocksdb/statistics.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
+#include "rocksdb/utilities/backupable_db.h"

 using rocksdb::Cache;
 using rocksdb::ColumnFamilyDescriptor;
@ -56,6 +57,7 @@ using rocksdb::NewBloomFilterPolicy;
 using rocksdb::NewLRUCache;
 using rocksdb::Options;
 using rocksdb::BlockBasedTableOptions;
+using rocksdb::CuckooTableOptions;
 using rocksdb::RandomAccessFile;
 using rocksdb::Range;
 using rocksdb::ReadOptions;
@ -68,21 +70,32 @@ using rocksdb::WritableFile;
 using rocksdb::WriteBatch;
 using rocksdb::WriteOptions;
 using rocksdb::LiveFileMetaData;
+using rocksdb::BackupEngine;
+using rocksdb::BackupableDBOptions;
+using rocksdb::BackupInfo;
+using rocksdb::RestoreOptions;

 using std::shared_ptr;

 extern "C" {

 struct rocksdb_t                 { DB*               rep; };
+struct rocksdb_backup_engine_t   { BackupEngine*     rep; };
+struct rocksdb_backup_engine_info_t { std::vector<BackupInfo> rep; };
+struct rocksdb_restore_options_t { RestoreOptions rep; };
 struct rocksdb_iterator_t        { Iterator*         rep; };
 struct rocksdb_writebatch_t      { WriteBatch        rep; };
 struct rocksdb_snapshot_t        { const Snapshot*   rep; };
 struct rocksdb_flushoptions_t    { FlushOptions      rep; };
 struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
-struct rocksdb_readoptions_t     { ReadOptions       rep; };
+struct rocksdb_readoptions_t {
+   ReadOptions rep;
+   Slice upper_bound; // stack variable to set pointer to in ReadOptions
+};
 struct rocksdb_writeoptions_t    { WriteOptions      rep; };
 struct rocksdb_options_t         { Options           rep; };
 struct rocksdb_block_based_table_options_t  { BlockBasedTableOptions rep; };
+struct rocksdb_cuckoo_table_options_t  { CuckooTableOptions rep; };
 struct rocksdb_seqfile_t         { SequentialFile*   rep; };
 struct rocksdb_randomfile_t      { RandomAccessFile* rep; };
 struct rocksdb_writablefile_t    { WritableFile*     rep; };
@ -118,7 +131,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter {
      const Slice& existing_value,
      std::string* new_value,
      bool* value_changed) const {
-    char* c_new_value = NULL;
+    char* c_new_value = nullptr;
    size_t new_value_length = 0;
    unsigned char c_value_changed = 0;
    unsigned char result = (*filter_)(
@ -385,11 +398,9 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
    unsigned char success;
    size_t new_value_len;
    char* tmp_new_value = (*full_merge_)(
-        state_,
-        key.data(), key.size(),
-        existing_value_data, existing_value_len,
-        &operand_pointers[0], &operand_sizes[0], n,
-        &success, &new_value_len);
+        state_, key.data(), key.size(), existing_value_data, existing_value_len,
+        &operand_pointers[0], &operand_sizes[0], static_cast<int>(n), &success,
+        &new_value_len);
    new_value->assign(tmp_new_value, new_value_len);

    if (delete_value_ != nullptr) {
@ -417,7 +428,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator {
    size_t new_value_len;
    char* tmp_new_value = (*partial_merge_)(
        state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
-        operand_count, &success, &new_value_len);
+        static_cast<int>(operand_count), &success, &new_value_len);
    new_value->assign(tmp_new_value, new_value_len);

    if (delete_value_ != nullptr) {
@ -524,6 +535,85 @@ rocksdb_t* rocksdb_open_for_read_only(
  return result;
 }

+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr) {
+  BackupEngine* be;
+  if (SaveError(errptr, BackupEngine::Open(options->rep.env,
+                                           BackupableDBOptions(path), &be))) {
+    return nullptr;
+  }
+  rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+  result->rep = be;
+  return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+                                             rocksdb_t* db, char** errptr) {
+  SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+  return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+                                                int v) {
+  opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr) {
+  SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+                                                       std::string(wal_dir),
+                                                       restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+    rocksdb_backup_engine_t* be) {
+  rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+  be->rep->GetBackupInfo(&result->rep);
+  return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+  return static_cast<int>(info->rep.size());
+}
+
+const int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].timestamp;
+}
+
+const uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].backup_id;
+}
+
+const uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].size;
+}
+
+const uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index) {
+  return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info) {
+  delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+  delete be->rep;
+  delete be;
+}
+
 void rocksdb_close(rocksdb_t* db) {
  delete db->rep;
  delete db;
@ -1123,6 +1213,51 @@ void rocksdb_options_set_block_based_table_factory(
 }


+rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create() {
+  return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(
+    rocksdb_cuckoo_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+    rocksdb_cuckoo_table_options_t* options, double v) {
+  options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+    rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+  options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+    rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+  options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t *opt,
+    rocksdb_cuckoo_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        rocksdb::NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+
 rocksdb_options_t* rocksdb_options_create() {
  return new rocksdb_options_t;
 }
@ -1216,6 +1351,11 @@ void rocksdb_options_set_info_log_level(
  opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
 }

+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+                                              size_t s) {
+  opt->rep.db_write_buffer_size = s;
+}
+
 void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
  opt->rep.write_buffer_size = s;
 }
@ -1224,6 +1364,10 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
  opt->rep.max_open_files = n;
 }

+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) {
+  opt->rep.max_total_wal_size = n;
+}
+
 void rocksdb_options_set_target_file_size_base(
    rocksdb_options_t* opt, uint64_t n) {
  opt->rep.target_file_size_base = n;
@ -1355,8 +1499,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush(
  opt->rep.purge_redundant_kvs_while_flush = v;
 }

-void rocksdb_options_set_allow_os_buffer(
-    rocksdb_options_t* opt, unsigned char v) {
+void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt,
+                                         unsigned char v) {
  opt->rep.allow_os_buffer = v;
 }

@ -1581,11 +1725,6 @@ void rocksdb_options_set_bloom_locality(
  opt->rep.bloom_locality = v;
 }

-void rocksdb_options_set_allow_thread_local(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.allow_thread_local = v;
-}
-
 void rocksdb_options_set_inplace_update_support(
    rocksdb_options_t* opt, unsigned char v) {
  opt->rep.inplace_update_support = v;
@ -1844,6 +1983,19 @@ void rocksdb_readoptions_set_snapshot(
  opt->rep.snapshot = (snap ? snap->rep : nullptr);
 }

+void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t* opt,
+    const char* key, size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+
+  } else {
+    opt->upper_bound = Slice(key, keylen);
+    opt->rep.iterate_upper_bound = &opt->upper_bound;
+  }
+}
+
 void rocksdb_readoptions_set_read_tier(
    rocksdb_readoptions_t* opt, int v) {
  opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);
@ -2039,7 +2191,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level

 int rocksdb_livefiles_count(
  const rocksdb_livefiles_t* lf) {
-  return lf->rep.size();
+  return static_cast<int>(lf->rep.size());
 }

 const char* rocksdb_livefiles_name(
--- a/db/c_test.c
+++ b/db/c_test.c
@ -10,9 +10,11 @@
 #include <string.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <inttypes.h>

 const char* phase = "";
 static char dbname[200];
+static char dbbackupname[200];

 static void StartPhase(const char* name) {
  fprintf(stderr, "=== Test %s\n", name);
@ -132,7 +134,7 @@ static void CmpDestroy(void* arg) { }

 static int CmpCompare(void* arg, const char* a, size_t alen,
                      const char* b, size_t blen) {
-  int n = (alen < blen) ? alen : blen;
+  size_t n = (alen < blen) ? alen : blen;
  int r = memcmp(a, b, n);
  if (r == 0) {
    if (alen < blen) r = -1;
@ -346,6 +348,11 @@ int main(int argc, char** argv) {
           GetTempDir(),
           ((int) geteuid()));

+  snprintf(dbbackupname, sizeof(dbbackupname),
+           "%s/rocksdb_c_test-%d-backup",
+           GetTempDir(),
+           ((int) geteuid()));
+
  StartPhase("create_objects");
  cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
  env = rocksdb_create_default_env();
@ -396,6 +403,41 @@ int main(int argc, char** argv) {
  CheckNoError(err);
  CheckGet(db, roptions, "foo", "hello");

+  StartPhase("backup_and_restore");
+  {
+    rocksdb_destroy_db(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err);
+    CheckNoError(err);
+
+    rocksdb_backup_engine_create_new_backup(be, db, &err);
+    CheckNoError(err);
+
+    rocksdb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+
+    rocksdb_close(db);
+
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create();
+    rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+    rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err);
+    CheckNoError(err);
+    rocksdb_restore_options_destroy(restore_options);
+
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_set_error_if_exists(options, 1);
+
+    CheckGet(db, roptions, "foo", "hello");
+
+    rocksdb_backup_engine_close(be);
+  }
+
  StartPhase("compactall");
  rocksdb_compact_range(db, NULL, 0, NULL, 0);
  CheckGet(db, roptions, "foo", "hello");
@ -576,37 +618,39 @@ int main(int argc, char** argv) {

  StartPhase("compaction_filter");
  {
-    rocksdb_options_t* options = rocksdb_options_create();
-    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_t* options_with_filter = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter, 1);
    rocksdb_compactionfilter_t* cfilter;
    cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
                                              CFilterFilter, CFilterName);
    // Create new database
    rocksdb_close(db);
-    rocksdb_destroy_db(options, dbname, &err);
-    rocksdb_options_set_compaction_filter(options, cfilter);
-    db = CheckCompaction(db, options, roptions, woptions);
+    rocksdb_destroy_db(options_with_filter, dbname, &err);
+    rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+    db = CheckCompaction(db, options_with_filter, roptions, woptions);

-    rocksdb_options_set_compaction_filter(options, NULL);
+    rocksdb_options_set_compaction_filter(options_with_filter, NULL);
    rocksdb_compactionfilter_destroy(cfilter);
-    rocksdb_options_destroy(options);
+    rocksdb_options_destroy(options_with_filter);
  }

  StartPhase("compaction_filter_factory");
  {
-    rocksdb_options_t* options = rocksdb_options_create();
-    rocksdb_options_set_create_if_missing(options, 1);
+    rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
    rocksdb_compactionfilterfactory_t* factory;
    factory = rocksdb_compactionfilterfactory_create(
        NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
    // Create new database
    rocksdb_close(db);
-    rocksdb_destroy_db(options, dbname, &err);
-    rocksdb_options_set_compaction_filter_factory(options, factory);
-    db = CheckCompaction(db, options, roptions, woptions);
-
-    rocksdb_options_set_compaction_filter_factory(options, NULL);
-    rocksdb_options_destroy(options);
+    rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+    rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+                                                  factory);
+    db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+    rocksdb_options_set_compaction_filter_factory(
+        options_with_filter_factory, NULL);
+    rocksdb_options_destroy(options_with_filter_factory);
  }

  StartPhase("compaction_filter_v2");
@ -799,8 +843,87 @@ int main(int argc, char** argv) {
    rocksdb_iter_get_error(iter, &err);
    CheckNoError(err);
    rocksdb_iter_destroy(iter);
+
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+  }
+
+  StartPhase("cuckoo_options");
+  {
+    rocksdb_cuckoo_table_options_t* cuckoo_options;
+    cuckoo_options = rocksdb_cuckoo_options_create();
+    rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+    rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+    rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+    rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+    rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+    rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_cuckoo_options_destroy(cuckoo_options);
+  }
+
+  StartPhase("iterate_upper_bound");
+  {
+    // Create new empty database
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_set_prefix_extractor(options, NULL);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_put(db, woptions, "a",    1, "0",    1, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "foo",  3, "bar",  3, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err);
+    rocksdb_put(db, woptions, "g1",   2, "0",    1, &err); CheckNoError(err);
+
+    // testing basic case with no iterate_upper_bound and no prefix_extractor
+    {
+       rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+       rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+       rocksdb_iter_seek(iter, "foo", 3);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo", "bar");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo1", "bar1");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "g1", "0");
+
+       rocksdb_iter_destroy(iter);
    }

+    // testing iterate_upper_bound and forward iterator
+    // to make sure it stops at bound
+    {
+       // iterate_upper_bound points beyond the last expected entry
+       rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+       rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+       rocksdb_iter_seek(iter, "foo", 3);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo", "bar");
+
+       rocksdb_iter_next(iter);
+       CheckCondition(rocksdb_iter_valid(iter));
+       CheckIter(iter, "foo1", "bar1");
+
+       rocksdb_iter_next(iter);
+       // should stop here...
+       CheckCondition(!rocksdb_iter_valid(iter));
+
+       rocksdb_iter_destroy(iter);
+    }
+  }

  StartPhase("cleanup");
  rocksdb_close(db);
--- a/db/column_family.cc
+++ b/db/column_family.cc
@ -9,24 +9,65 @@

 #include "db/column_family.h"

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <vector>
 #include <string>
 #include <algorithm>
 #include <limits>

+#include "db/compaction_picker.h"
 #include "db/db_impl.h"
+#include "db/job_context.h"
 #include "db/version_set.h"
+#include "db/writebuffer.h"
 #include "db/internal_stats.h"
-#include "db/compaction_picker.h"
+#include "db/job_context.h"
 #include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
 #include "util/autovector.h"
 #include "util/hash_skiplist_rep.h"
+#include "util/options_helper.h"

 namespace rocksdb {

-ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
-                                               DBImpl* db, port::Mutex* mutex)
-    : cfd_(cfd), db_(db), mutex_(mutex) {
+namespace {
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n < bottom, return 0;
+// if n >= top, return 1000;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+uint64_t SlowdownAmount(int n, double bottom, double top) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  } else if (n < bottom) {
+    delay = 0;
+  } else {
+    // If we are here, we know that:
+    //   level0_start_slowdown <= n < level0_slowdown
+    // since the previous two conditions are false.
+    double how_much = static_cast<double>(n - bottom) / (top - bottom);
+    delay = std::max(how_much * how_much * 1000, 100.0);
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+}  // namespace
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+    : cfd_(column_family_data), db_(db), mutex_(mutex) {
  if (cfd_ != nullptr) {
    cfd_->Ref();
  }
@ -34,21 +75,29 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,

 ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
-    DBImpl::DeletionState deletion_state;
+    JobContext job_context;
    mutex_->Lock();
    if (cfd_->Unref()) {
      delete cfd_;
    }
-    db_->FindObsoleteFiles(deletion_state, false, true);
+    db_->FindObsoleteFiles(&job_context, false, true);
    mutex_->Unlock();
-    if (deletion_state.HaveSomethingToDelete()) {
-      db_->PurgeObsoleteFiles(deletion_state);
+    if (job_context.HaveSomethingToDelete()) {
+      db_->PurgeObsoleteFiles(job_context);
    }
  }
 }

 uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }

+const std::string& ColumnFamilyHandleImpl::GetName() const {
+  return cfd()->GetName();
+}
+
+const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
+  return cfd()->user_comparator();
+}
+
 ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                    const ColumnFamilyOptions& src) {
  ColumnFamilyOptions result = src;
@ -132,7 +181,7 @@ SuperVersion* SuperVersion::Ref() {

 bool SuperVersion::Unref() {
  // fetch_sub returns the previous value of ref
-  uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed);
+  uint32_t previous_refs = refs.fetch_sub(1);
  assert(previous_refs > 0);
  return previous_refs == 1;
 }
@ -174,20 +223,22 @@ void SuperVersionUnrefHandle(void* ptr) {
 }
 }  // anonymous namespace

-ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
-                                   Version* dummy_versions, Cache* table_cache,
-                                   const ColumnFamilyOptions& options,
-                                   const DBOptions* db_options,
-                                   const EnvOptions& storage_options,
-                                   ColumnFamilySet* column_family_set)
+ColumnFamilyData::ColumnFamilyData(
+    uint32_t id, const std::string& name, Version* _dummy_versions,
+    Cache* _table_cache, WriteBuffer* write_buffer,
+    const ColumnFamilyOptions& cf_options, const DBOptions* db_options,
+    const EnvOptions& env_options, ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
-      dummy_versions_(dummy_versions),
+      dummy_versions_(_dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
-      internal_comparator_(options.comparator),
-      options_(*db_options, SanitizeOptions(&internal_comparator_, options)),
+      internal_comparator_(cf_options.comparator),
+      options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)),
+      ioptions_(options_),
+      mutable_cf_options_(options_, ioptions_),
+      write_buffer_(write_buffer),
      mem_(nullptr),
      imm_(options_.min_write_buffer_number_to_merge),
      super_version_(nullptr),
@ -196,48 +247,69 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
-      need_slowdown_for_num_level0_files_(false),
-      column_family_set_(column_family_set) {
+      column_family_set_(column_family_set),
+      pending_flush_(false),
+      pending_compaction_(false) {
  Ref();

-  // if dummy_versions is nullptr, then this is a dummy column family.
-  if (dummy_versions != nullptr) {
+  // if _dummy_versions is nullptr, then this is a dummy column family.
+  if (_dummy_versions != nullptr) {
    internal_stats_.reset(
-        new InternalStats(options_.num_levels, db_options->env, this));
-    table_cache_.reset(new TableCache(&options_, storage_options, table_cache));
-    if (options_.compaction_style == kCompactionStyleUniversal) {
+        new InternalStats(ioptions_.num_levels, db_options->env, this));
+    table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache));
+    if (ioptions_.compaction_style == kCompactionStyleLevel) {
+      compaction_picker_.reset(
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
-          new UniversalCompactionPicker(&options_, &internal_comparator_));
-    } else if (options_.compaction_style == kCompactionStyleLevel) {
+          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
      compaction_picker_.reset(
-          new LevelCompactionPicker(&options_, &internal_comparator_));
+          new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+    } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+      compaction_picker_.reset(new NullCompactionPicker(
+          ioptions_, &internal_comparator_));
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "Column family %s does not use any background compaction. "
+          "Compactions can only be done via CompactFiles\n",
+          GetName().c_str());
+#endif  // !ROCKSDB_LITE
    } else {
-      assert(options_.compaction_style == kCompactionStyleFIFO);
+      Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log,
+          "Unable to recognize the specified compaction style %d. "
+          "Column family %s will use kCompactionStyleLevel.\n",
+          ioptions_.compaction_style, GetName().c_str());
      compaction_picker_.reset(
-          new FIFOCompactionPicker(&options_, &internal_comparator_));
+          new LevelCompactionPicker(ioptions_, &internal_comparator_));
    }

-    Log(options_.info_log, "Options for column family \"%s\":\n",
-        name.c_str());
-    const ColumnFamilyOptions* cf_options = &options_;
-    cf_options->Dump(options_.info_log.get());
+    if (column_family_set_->NumberOfColumnFamilies() < 10) {
+      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+          "--------------- Options for column family [%s]:\n", name.c_str());
+      options_.Dump(ioptions_.info_log);
+    } else {
+      Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
+          "\t(skipping printing options)\n");
+    }
  }

-  RecalculateWriteStallConditions();
+  RecalculateWriteStallConditions(mutable_cf_options_);
 }

 // DB mutex held
 ColumnFamilyData::~ColumnFamilyData() {
-  assert(refs_ == 0);
+  assert(refs_.load(std::memory_order_relaxed) == 0);
  // remove from linked list
  auto prev = prev_;
  auto next = next_;
  prev->next_ = next;
  next->prev_ = prev;

-  // it's nullptr for dummy CFD
-  if (column_family_set_ != nullptr) {
-    // remove from column_family_set
+  if (!dropped_ && column_family_set_ != nullptr) {
+    // If it's dropped, it's already removed from column family set
+    // If column_family_set_ == nullptr, this is dummy CFD and not in
+    // ColumnFamilySet
    column_family_set_->RemoveColumnFamily(this);
  }

@ -245,6 +317,11 @@ ColumnFamilyData::~ColumnFamilyData() {
    current_->Unref();
  }

+  // It would be wrong if this ColumnFamilyData is in flush_queue_ or
+  // compaction_queue_ and we destroyed it
+  assert(!pending_flush_);
+  assert(!pending_compaction_);
+
  if (super_version_ != nullptr) {
    // Release SuperVersion reference kept in ThreadLocalPtr.
    // This must be done outside of mutex_ since unref handler can lock mutex.
@ -262,8 +339,9 @@ ColumnFamilyData::~ColumnFamilyData() {

  if (dummy_versions_ != nullptr) {
    // List must be empty
-    assert(dummy_versions_->next_ == dummy_versions_);
-    delete dummy_versions_;
+    assert(dummy_versions_->TEST_Next() == dummy_versions_);
+    bool deleted __attribute__((unused)) = dummy_versions_->Unref();
+    assert(deleted);
  }

  if (mem_ != nullptr) {
@ -276,90 +354,146 @@ ColumnFamilyData::~ColumnFamilyData() {
  }
 }

-void ColumnFamilyData::RecalculateWriteStallConditions() {
-  need_wait_for_num_memtables_ =
-    (imm()->size() == options()->max_write_buffer_number - 1);
+void ColumnFamilyData::SetDropped() {
+  // can't drop default CF
+  assert(id_ != 0);
+  dropped_ = true;
+  write_controller_token_.reset();

-  if (current_ != nullptr) {
-    need_wait_for_num_level0_files_ =
-      (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger);
-  } else {
-    need_wait_for_num_level0_files_ = false;
-  }
-
-  RecalculateWriteStallRateLimitsConditions();
+  // remove from column_family_set
+  column_family_set_->RemoveColumnFamily(this);
 }

-void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() {
+void ColumnFamilyData::RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options) {
  if (current_ != nullptr) {
-    exceeds_hard_rate_limit_ =
-        (options()->hard_rate_limit > 1.0 &&
-         current_->MaxCompactionScore() > options()->hard_rate_limit);
-
-    exceeds_soft_rate_limit_ =
-        (options()->soft_rate_limit > 0.0 &&
-         current_->MaxCompactionScore() > options()->soft_rate_limit);
+    auto* vstorage = current_->storage_info();
+    const double score = vstorage->max_compaction_score();
+    const int max_level = vstorage->max_compaction_score_level();
+
+    auto write_controller = column_family_set_->write_controller_;
+
+    if (imm()->size() >= mutable_cf_options.max_write_buffer_number) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush), max_write_buffer_number is set to %d",
+          name_.c_str(), imm()->size(),
+          mutable_cf_options.max_write_buffer_number);
+    } else if (vstorage->NumLevelFiles(0) >=
+               mutable_cf_options.level0_stop_writes_trigger) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stopping writes because we have %d level-0 files",
+          name_.c_str(), vstorage->NumLevelFiles(0));
+    } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+               vstorage->NumLevelFiles(0) >=
+                   mutable_cf_options.level0_slowdown_writes_trigger) {
+      uint64_t slowdown =
+          SlowdownAmount(vstorage->NumLevelFiles(0),
+                         mutable_cf_options.level0_slowdown_writes_trigger,
+                         mutable_cf_options.level0_stop_writes_trigger);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we have %d level-0 files (%" PRIu64
+          "us)",
+          name_.c_str(), vstorage->NumLevelFiles(0), slowdown);
+    } else if (mutable_cf_options.hard_rate_limit > 1.0 &&
+               score > mutable_cf_options.hard_rate_limit) {
+      uint64_t kHardLimitSlowdown = 1000;
+      write_controller_token_ =
+          write_controller->GetDelayToken(kHardLimitSlowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown,
+                                            false);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we hit hard limit on level %d. "
+          "(%" PRIu64 "us)",
+          name_.c_str(), max_level, kHardLimitSlowdown);
+    } else if (mutable_cf_options.soft_rate_limit > 0.0 &&
+               score > mutable_cf_options.soft_rate_limit) {
+      uint64_t slowdown = SlowdownAmount(score,
+          mutable_cf_options.soft_rate_limit,
+          mutable_cf_options.hard_rate_limit);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
+          "us)",
+          name_.c_str(), max_level, slowdown);
    } else {
-    exceeds_hard_rate_limit_ = false;
-    exceeds_soft_rate_limit_ = false;
+      write_controller_token_.reset();
+    }
  }
 }

 const EnvOptions* ColumnFamilyData::soptions() const {
-  return &(column_family_set_->storage_options_);
+  return &(column_family_set_->env_options_);
 }

-void ColumnFamilyData::SetCurrent(Version* current) {
-  current_ = current;
-  need_slowdown_for_num_level0_files_ =
-      (options_.level0_slowdown_writes_trigger >= 0 &&
-       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+  current_ = current_version;
 }

-void ColumnFamilyData::CreateNewMemtable() {
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+    const MutableCFOptions& mutable_cf_options) {
  assert(current_ != nullptr);
+  return new MemTable(internal_comparator_, ioptions_,
+                      mutable_cf_options, write_buffer_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+    const MutableCFOptions& mutable_cf_options) {
  if (mem_ != nullptr) {
    delete mem_->Unref();
  }
-  mem_ = new MemTable(internal_comparator_, options_);
+  SetMemtable(ConstructNewMemtable(mutable_cf_options));
  mem_->Ref();
 }

-Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
-  auto result = compaction_picker_->PickCompaction(current_, log_buffer);
-  RecalculateWriteStallRateLimitsConditions();
+bool ColumnFamilyData::NeedsCompaction() const {
+  return compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+    const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
+  auto* result = compaction_picker_->PickCompaction(
+      GetName(), mutable_options, current_->storage_info(), log_buffer);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
  return result;
 }

-Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
-                                           uint32_t output_path_id,
-                                           const InternalKey* begin,
-                                           const InternalKey* end,
+Compaction* ColumnFamilyData::CompactRange(
+    const MutableCFOptions& mutable_cf_options,
+    int input_level, int output_level, uint32_t output_path_id,
+    const InternalKey* begin, const InternalKey* end,
    InternalKey** compaction_end) {
-  return compaction_picker_->CompactRange(current_, input_level, output_level,
-                                          output_path_id, begin, end,
-                                          compaction_end);
+  auto* result = compaction_picker_->CompactRange(
+      GetName(), mutable_cf_options, current_->storage_info(), input_level,
+      output_level, output_path_id, begin, end, compaction_end);
+  if (result != nullptr) {
+    result->SetInputVersion(current_);
+  }
+  return result;
 }

 SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
-    port::Mutex* db_mutex) {
+    InstrumentedMutex* db_mutex) {
  SuperVersion* sv = nullptr;
-  if (LIKELY(column_family_set_->db_options_->allow_thread_local)) {
  sv = GetThreadLocalSuperVersion(db_mutex);
  sv->Ref();
  if (!ReturnThreadLocalSuperVersion(sv)) {
    sv->Unref();
  }
-  } else {
-    db_mutex->Lock();
-    sv = super_version_->Ref();
-    db_mutex->Unlock();
-  }
  return sv;
 }

 SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
-    port::Mutex* db_mutex) {
+    InstrumentedMutex* db_mutex) {
  SuperVersion* sv = nullptr;
  // The SuperVersion is cached in thread local storage to avoid acquiring
  // mutex when SuperVersion does not change since the last use. When a new
@ -382,11 +516,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
  sv = static_cast<SuperVersion*>(ptr);
  if (sv == SuperVersion::kSVObsolete ||
      sv->version_number != super_version_number_.load()) {
-    RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES);
+    RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
    SuperVersion* sv_to_delete = nullptr;

    if (sv && sv->Unref()) {
-      RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS);
+      RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
      db_mutex->Lock();
      // NOTE: underlying resources held by superversion (sst files) might
      // not be released until the next background job.
@ -422,20 +556,68 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
  return false;
 }

+void ColumnFamilyData::NotifyOnCompactionCompleted(
+    DB* db, Compaction* c, const Status& status) {
+#ifndef ROCKSDB_LITE
+  auto listeners = ioptions()->listeners;
+  CompactionJobInfo info;
+  info.cf_name = c->column_family_data()->GetName();
+  info.status = status;
+  info.output_level = c->output_level();
+  for (const auto fmd : *c->inputs(c->level())) {
+    info.input_files.push_back(
+        TableFileName(options_.db_paths,
+                      fmd->fd.GetNumber(),
+                      fmd->fd.GetPathId()));
+  }
+  for (const auto newf : c->edit()->GetNewFiles()) {
+    info.input_files.push_back(
+        TableFileName(options_.db_paths,
+                      newf.second.fd.GetNumber(),
+                      newf.second.fd.GetPathId()));
+  }
+  for (auto listener : listeners) {
+    listener->OnCompactionCompleted(db, info);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+void ColumnFamilyData::NotifyOnFlushCompleted(
+    DB* db, const std::string& file_path,
+    bool triggered_flush_slowdown,
+    bool triggered_flush_stop) {
+
+#ifndef ROCKSDB_LITE
+  auto listeners = ioptions()->listeners;
+  for (auto listener : listeners) {
+    listener->OnFlushCompleted(
+        db, GetName(), file_path,
+        // Use path 0 as fulled memtables are first flushed into path 0.
+        triggered_flush_slowdown, triggered_flush_stop);
+  }
+#endif  // ROCKSDB_LITE
+}
+
 SuperVersion* ColumnFamilyData::InstallSuperVersion(
-    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+    SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
+}
+
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, InstrumentedMutex* db_mutex,
+    const MutableCFOptions& mutable_cf_options) {
  new_superversion->db_mutex = db_mutex;
+  new_superversion->mutable_cf_options = mutable_cf_options;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;
  // Reset SuperVersions cached in thread local storage
-  if (column_family_set_->db_options_->allow_thread_local) {
  ResetThreadLocalSuperVersions();
-  }

-  RecalculateWriteStallConditions();
+  RecalculateWriteStallConditions(mutable_cf_options);

  if (old_superversion != nullptr && old_superversion->Unref()) {
    old_superversion->Cleanup();
@ -460,20 +642,37 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
  }
 }

+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+      const std::unordered_map<std::string, std::string>& options_map) {
+  MutableCFOptions new_mutable_cf_options;
+  Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
+                                          &new_mutable_cf_options);
+  if (s.ok()) {
+    mutable_cf_options_ = new_mutable_cf_options;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+  }
+  return s;
+}
+#endif  // ROCKSDB_LITE
+
 ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
-                                 const EnvOptions& storage_options,
-                                 Cache* table_cache)
+                                 const EnvOptions& env_options,
+                                 Cache* table_cache,
+                                 WriteBuffer* write_buffer,
+                                 WriteController* write_controller)
    : max_column_family_(0),
-      dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr,
+      dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
-                                      storage_options_, nullptr)),
+                                      env_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
-      storage_options_(storage_options),
+      env_options_(env_options),
      table_cache_(table_cache),
-      spin_lock_(ATOMIC_FLAG_INIT) {
+      write_buffer_(write_buffer),
+      write_controller_(write_controller) {
  // initialize linked list
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
@ -530,18 +729,17 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const {
  return column_families_.size();
 }

-// under a DB mutex
+// under a DB mutex AND write thread
 ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
    const std::string& name, uint32_t id, Version* dummy_versions,
    const ColumnFamilyOptions& options) {
  assert(column_families_.find(name) == column_families_.end());
  ColumnFamilyData* new_cfd =
-      new ColumnFamilyData(id, name, dummy_versions, table_cache_, options,
-                           db_options_, storage_options_, this);
-  Lock();
+      new ColumnFamilyData(id, name, dummy_versions, table_cache_,
+                           write_buffer_, options, db_options_,
+                           env_options_, this);
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
-  Unlock();
  max_column_family_ = std::max(max_column_family_, id);
  // add to linked list
  new_cfd->next_ = dummy_cfd_;
@ -555,19 +753,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
  return new_cfd;
 }

-void ColumnFamilySet::Lock() {
-  // spin lock
-  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
-  }
-}
-
-void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
-
 // REQUIRES: DB mutex held
 void ColumnFamilySet::FreeDeadColumnFamilies() {
  autovector<ColumnFamilyData*> to_delete;
  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
-    if (cfd->refs_ == 0) {
+    if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
      to_delete.push_back(cfd);
    }
  }
@ -577,25 +767,21 @@ void ColumnFamilySet::FreeDeadColumnFamilies() {
  }
 }

-// under a DB mutex
+// under a DB mutex AND from a write thread
 void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
  auto cfd_iter = column_family_data_.find(cfd->GetID());
  assert(cfd_iter != column_family_data_.end());
-  Lock();
  column_family_data_.erase(cfd_iter);
  column_families_.erase(cfd->GetName());
-  Unlock();
 }

+// under a DB mutex OR from a write thread
 bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
  if (column_family_id == 0) {
    // optimization for common case
    current_ = column_family_set_->GetDefault();
  } else {
-    // maybe outside of db mutex, should lock
-    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
-    column_family_set_->Unlock();
  }
  handle_.SetCFD(current_);
  return current_ != nullptr;
@ -611,16 +797,18 @@ MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  return current_->mem();
 }

-const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
-  assert(current_ != nullptr);
-  return current_->options();
-}
-
 ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  assert(current_ != nullptr);
  return &handle_;
 }

+void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
+  if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) {
+    flush_scheduler_->ScheduleFlush(current_);
+    current_->mem()->MarkFlushScheduled();
+  }
+}
+
 uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
  uint32_t column_family_id = 0;
  if (column_family != nullptr) {
@ -630,4 +818,13 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
  return column_family_id;
 }

+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family != nullptr) {
+    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+    return cfh->user_comparator();
+  }
+  return nullptr;
+}
+
 }  // namespace rocksdb
--- a/db/column_family.h
+++ b/db/column_family.h
@ -19,7 +19,11 @@
 #include "rocksdb/env.h"
 #include "db/memtable_list.h"
 #include "db/write_batch_internal.h"
+#include "db/write_controller.h"
 #include "db/table_cache.h"
+#include "db/flush_scheduler.h"
+#include "util/instrumented_mutex.h"
+#include "util/mutable_cf_options.h"
 #include "util/thread_local.h"

 namespace rocksdb {
@ -35,6 +39,8 @@ class InternalStats;
 class ColumnFamilyData;
 class DBImpl;
 class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;

 // ColumnFamilyHandleImpl is the class that clients use to access different
 // column families. It has non-trivial destructor, which gets called when client
@ -42,17 +48,20 @@ class LogBuffer;
 class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
 public:
  // create while holding the mutex
-  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
+  ColumnFamilyHandleImpl(
+      ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex);
  // destroy without mutex
  virtual ~ColumnFamilyHandleImpl();
  virtual ColumnFamilyData* cfd() const { return cfd_; }
+  virtual const Comparator* user_comparator() const;

  virtual uint32_t GetID() const;
+  virtual const std::string& GetName() const override;

 private:
  ColumnFamilyData* cfd_;
  DBImpl* db_;
-  port::Mutex* mutex_;
+  InstrumentedMutex* mutex_;
 };

 // Does not ref-count ColumnFamilyData
@ -66,7 +75,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
  ColumnFamilyHandleInternal()
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}

-  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
+  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }

 private:
@ -78,6 +87,7 @@ struct SuperVersion {
  MemTable* mem;
  MemTableListVersion* imm;
  Version* current;
+  MutableCFOptions mutable_cf_options;
  std::atomic<uint32_t> refs;
  // We need to_delete because during Cleanup(), imm->Unref() returns
  // all memtables that we need to free through this vector. We then
@ -85,7 +95,7 @@ struct SuperVersion {
  autovector<MemTable*> to_delete;
  // Version number of the current SuperVersion
  uint64_t version_number;
-  port::Mutex* db_mutex;
+  InstrumentedMutex* db_mutex;

  // should be called outside the mutex
  SuperVersion() = default;
@ -117,8 +127,7 @@ extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,

 class ColumnFamilySet;

-// This class keeps all the data that a column family needs. It's mosly dumb and
-// used just to provide access to metadata.
+// This class keeps all the data that a column family needs.
 // Most methods require DB mutex held, unless otherwise noted
 class ColumnFamilyData {
 public:
@ -129,17 +138,24 @@ class ColumnFamilyData {
  // thread-safe
  const std::string& GetName() const { return name_; }

-  void Ref() { ++refs_; }
+  // Ref() can only be called whily holding a DB mutex or during a
+  // single-threaded write.
+  void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); }
  // will just decrease reference count to 0, but will not delete it. returns
  // true if the ref count was decreased to zero. in that case, it can be
-  // deleted by the caller immediatelly, or later, by calling
+  // deleted by the caller immediately, or later, by calling
  // FreeDeadColumnFamilies()
+  // Unref() can only be called while holding a DB mutex
  bool Unref() {
-    assert(refs_ > 0);
-    return --refs_ == 0;
+    int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed);
+    assert(old_refs > 0);
+    return old_refs == 1;
  }

-  // This can only be called from single-threaded VersionSet::LogAndApply()
+  // SetDropped() can only be called under following conditions:
+  // 1) Holding a DB mutex,
+  // 2) from single-threaded write thread, AND
+  // 3) from single-threaded VersionSet::LogAndApply()
  // After dropping column family no other operation on that column family
  // will be executed. All the files and memory will be, however, kept around
  // until client drops the column family handle. That way, client can still
@ -147,27 +163,42 @@ class ColumnFamilyData {
  // Column family can be dropped and still alive. In that state:
  // *) Column family is not included in the iteration.
  // *) Compaction and flush is not executed on the dropped column family.
-  // *) Client can continue writing and reading from column family. However, all
-  // writes stay in the current memtable.
+  // *) Client can continue reading from column family. Writes will fail unless
+  // WriteOptions::ignore_missing_column_families is true
  // When the dropped column family is unreferenced, then we:
  // *) delete all memory associated with that column family
  // *) delete all the files associated with that column family
-  void SetDropped() {
-    // can't drop default CF
-    assert(id_ != 0);
-    dropped_ = true;
-  }
+  void SetDropped();
  bool IsDropped() const { return dropped_; }

  // thread-safe
-  int NumberLevels() const { return options_.num_levels; }
+  int NumberLevels() const { return ioptions_.num_levels; }

  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
  uint64_t GetLogNumber() const { return log_number_; }

-  // thread-safe
+  // !!! To be deprecated! Please don't not use this function anymore!
  const Options* options() const { return &options_; }
+
+  // thread-safe
  const EnvOptions* soptions() const;
+  const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by current SuperVersion
+  // You shoul use this API to reference MutableCFOptions most of the time.
+  const MutableCFOptions* GetCurrentMutableCFOptions() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may be not in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+#ifndef ROCKSDB_LITE
+  // REQUIRES: DB mutex held
+  Status SetOptions(
+      const std::unordered_map<std::string, std::string>& options_map);
+#endif  // ROCKSDB_LITE

  InternalStats* internal_stats() { return internal_stats_.get(); }

@ -175,17 +206,24 @@ class ColumnFamilyData {
  MemTable* mem() { return mem_; }
  Version* current() { return current_; }
  Version* dummy_versions() { return dummy_versions_; }
-  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
  void SetCurrent(Version* current);
-  void CreateNewMemtable();
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options);
+  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options);

  TableCache* table_cache() const { return table_cache_.get(); }

  // See documentation in compaction_picker.h
-  Compaction* PickCompaction(LogBuffer* log_buffer);
-  Compaction* CompactRange(int input_level, int output_level,
-                           uint32_t output_path_id, const InternalKey* begin,
-                           const InternalKey* end,
+  // REQUIRES: DB mutex held
+  bool NeedsCompaction() const;
+  // REQUIRES: DB mutex held
+  Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+                             LogBuffer* log_buffer);
+  // REQUIRES: DB mutex held
+  Compaction* CompactRange(
+      const MutableCFOptions& mutable_cf_options,
+      int input_level, int output_level, uint32_t output_path_id,
+      const InternalKey* begin, const InternalKey* end,
      InternalKey** compaction_end);

  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
@ -201,11 +239,11 @@ class ColumnFamilyData {
  SuperVersion* GetSuperVersion() { return super_version_; }
  // thread-safe
  // Return a already referenced SuperVersion to be used safely.
-  SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex);
+  SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex);
  // thread-safe
  // Get SuperVersion stored in thread local storage. If it does not exist,
  // get a reference from a current SuperVersion.
-  SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex);
+  SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex);
  // Try to return SuperVersion back to thread local storage. Retrun true on
  // success and false on failure. It fails when the thread local storage
  // contains anything other than SuperVersion::kSVInUse flag.
@ -218,40 +256,35 @@ class ColumnFamilyData {
  // if its reference count is zero and needs deletion or nullptr if not
  // As argument takes a pointer to allocated SuperVersion to enable
  // the clients to allocate SuperVersion outside of mutex.
+  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
-                                    port::Mutex* db_mutex);
+                                    InstrumentedMutex* db_mutex,
+                                    const MutableCFOptions& mutable_cf_options);
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    InstrumentedMutex* db_mutex);

  void ResetThreadLocalSuperVersions();

-  // A Flag indicating whether write needs to slowdown because of there are
-  // too many number of level0 files.
-  bool NeedSlowdownForNumLevel0Files() const {
-    return need_slowdown_for_num_level0_files_;
-  }
-
-  bool NeedWaitForNumLevel0Files() const {
-    return need_wait_for_num_level0_files_;
-  }
-
-  bool NeedWaitForNumMemtables() const {
-    return need_wait_for_num_memtables_;
-  }
+  void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status);

-  bool ExceedsSoftRateLimit() const {
-    return exceeds_soft_rate_limit_;
-  }
+  void NotifyOnFlushCompleted(
+      DB* db, const std::string& file_path,
+      bool triggered_flush_slowdown,
+      bool triggered_flush_stop);

-  bool ExceedsHardRateLimit() const {
-    return exceeds_hard_rate_limit_;
-  }
+  // Protected by DB mutex
+  void set_pending_flush(bool value) { pending_flush_ = value; }
+  void set_pending_compaction(bool value) { pending_compaction_ = value; }
+  bool pending_flush() { return pending_flush_; }
+  bool pending_compaction() { return pending_compaction_; }

 private:
  friend class ColumnFamilySet;
  ColumnFamilyData(uint32_t id, const std::string& name,
                   Version* dummy_versions, Cache* table_cache,
+                   WriteBuffer* write_buffer,
                   const ColumnFamilyOptions& options,
-                   const DBOptions* db_options,
-                   const EnvOptions& storage_options,
+                   const DBOptions* db_options, const EnvOptions& env_options,
                   ColumnFamilySet* column_family_set);

  // Recalculate some small conditions, which are changed only during
@ -259,25 +292,29 @@ class ColumnFamilyData {
  // recalculation of compaction score. These values are used in
  // DBImpl::MakeRoomForWrite function to decide, if it need to make
  // a write stall
-  void RecalculateWriteStallConditions();
-  void RecalculateWriteStallRateLimitsConditions();
+  void RecalculateWriteStallConditions(
+      const MutableCFOptions& mutable_cf_options);

  uint32_t id_;
  const std::string name_;
  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;         // == dummy_versions->prev_

-  int refs_;                   // outstanding references to ColumnFamilyData
+  std::atomic<int> refs_;      // outstanding references to ColumnFamilyData
  bool dropped_;               // true if client dropped it

  const InternalKeyComparator internal_comparator_;

-  Options const options_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;

  std::unique_ptr<TableCache> table_cache_;

  std::unique_ptr<InternalStats> internal_stats_;

+  WriteBuffer* write_buffer_;
+
  MemTable* mem_;
  MemTableList imm_;
  SuperVersion* super_version_;
@ -301,46 +338,38 @@ class ColumnFamilyData {
  // recovered from
  uint64_t log_number_;

-  // A flag indicating whether we should delay writes because
-  // we have too many level 0 files
-  bool need_slowdown_for_num_level0_files_;
-
-  // These 4 variables are updated only after compaction,
-  // adding new memtable, flushing memtables to files
-  // and/or add recalculation of compaction score.
-  // That's why theirs values are cached in ColumnFamilyData.
-  // Recalculation is made by RecalculateWriteStallConditions and
-  // RecalculateWriteStallRateLimitsConditions function. They are used
-  // in DBImpl::MakeRoomForWrite function to decide, if it need
-  // to sleep during write operation
-  bool need_wait_for_num_memtables_;
-
-  bool need_wait_for_num_level0_files_;
-
-  bool exceeds_hard_rate_limit_;
-
-  bool exceeds_soft_rate_limit_;
-
  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;

  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+  bool pending_flush_;
+
+  // If true --> this ColumnFamily is currently present in
+  // DBImpl::compaction_queue_
+  bool pending_compaction_;
 };

 // ColumnFamilySet has interesting thread-safety requirements
-// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
-// mutex. Inside, column_family_data_ and column_families_ will be protected
-// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
-// VersionSet::LogAndApply() in the normal runtime. It is also called
-// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
-// from ColumnFamilyData destructor
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
 // * Iteration -- hold DB mutex, but you can release it in the body of
 // iteration. If you release DB mutex in body, reference the column
 // family before the mutex and unreference after you unlock, since the column
 // family might get dropped when the DB mutex is released
 // * GetDefault() -- thread safe
-// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
 // NumberOfColumnFamilies -- inside of DB mutex
 class ColumnFamilySet {
@ -354,7 +383,8 @@ class ColumnFamilySet {
      // dummy is never dead or dropped, so this will never be infinite
      do {
        current_ = current_->next_;
-      } while (current_->refs_ == 0 || current_->IsDropped());
+      } while (current_->refs_.load(std::memory_order_relaxed) == 0 ||
+               current_->IsDropped());
      return *this;
    }
    bool operator!=(const iterator& other) {
@ -367,7 +397,8 @@ class ColumnFamilySet {
  };

  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
-                  const EnvOptions& storage_options, Cache* table_cache);
+                  const EnvOptions& env_options, Cache* table_cache,
+                  WriteBuffer* write_buffer, WriteController* write_controller);
  ~ColumnFamilySet();

  ColumnFamilyData* GetDefault() const;
@ -390,9 +421,6 @@ class ColumnFamilySet {
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

-  void Lock();
-  void Unlock();
-
  // REQUIRES: DB mutex held
  // Don't call while iterating over ColumnFamilySet
  void FreeDeadColumnFamilies();
@ -404,9 +432,12 @@ class ColumnFamilySet {
  void RemoveColumnFamily(ColumnFamilyData* cfd);

  // column_families_ and column_family_data_ need to be protected:
-  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
-  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
-  //  (if both, respect the ordering to avoid deadlock!)
+  // * when mutating both conditions have to be satisfied:
+  // 1. DB mutex locked
+  // 2. thread currently in single-threaded write thread
+  // * when reading, at least one condition needs to be satisfied:
+  // 1. DB mutex locked
+  // 2. accessed from a single-threaded write thread
  std::unordered_map<std::string, uint32_t> column_families_;
  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;

@ -420,41 +451,52 @@ class ColumnFamilySet {

  const std::string db_name_;
  const DBOptions* const db_options_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
  Cache* table_cache_;
-  std::atomic_flag spin_lock_;
+  WriteBuffer* write_buffer_;
+  WriteController* write_controller_;
 };

 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
 // memtables of different column families (specified by ID in the write batch)
 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
 public:
-  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
-      : column_family_set_(column_family_set), current_(nullptr) {}
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set,
+                                     FlushScheduler* flush_scheduler)
+      : column_family_set_(column_family_set),
+        current_(nullptr),
+        flush_scheduler_(flush_scheduler) {}

  // sets current_ to ColumnFamilyData with column_family_id
  // returns false if column family doesn't exist
+  // REQUIRES: under a DB mutex OR from a write thread
  bool Seek(uint32_t column_family_id) override;

  // Returns log number of the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
  uint64_t GetLogNumber() const override;

  // REQUIRES: Seek() called first
+  // REQUIRES: under a DB mutex OR from a write thread
  virtual MemTable* GetMemTable() const override;

-  // Returns options for selected column family
-  // REQUIRES: Seek() called first
-  virtual const Options* GetOptions() const override;
-
  // Returns column family handle for the selected column family
+  // REQUIRES: under a DB mutex OR from a write thread
  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;

+  // REQUIRES: under a DB mutex OR from a write thread
+  virtual void CheckMemtableFull() override;
+
 private:
  ColumnFamilySet* column_family_set_;
  ColumnFamilyData* current_;
+  FlushScheduler* flush_scheduler_;
  ColumnFamilyHandleInternal handle_;
 };

 extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);

+extern const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family);
+
 }  // namespace rocksdb
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@ -133,7 +133,7 @@ class ColumnFamilyTest {
  void CreateColumnFamilies(
      const std::vector<std::string>& cfs,
      const std::vector<ColumnFamilyOptions> options = {}) {
-    int cfi = handles_.size();
+    int cfi = static_cast<int>(handles_.size());
    handles_.resize(cfi + cfs.size());
    names_.resize(cfi + cfs.size());
    for (size_t i = 0; i < cfs.size(); ++i) {
@ -218,7 +218,7 @@ class ColumnFamilyTest {

  int NumTableFilesAtLevel(int level, int cf) {
    return GetProperty(cf,
-                       "rocksdb.num-files-at-level" + std::to_string(level));
+                       "rocksdb.num-files-at-level" + ToString(level));
  }

  // Return spread of files per level
@ -231,7 +231,7 @@ class ColumnFamilyTest {
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
-        last_non_zero_offset = result.size();
+        last_non_zero_offset = static_cast<int>(result.size());
      }
    }
    result.resize(last_non_zero_offset);
@ -287,8 +287,8 @@ class ColumnFamilyTest {
    assert(num_per_cf.size() == handles_.size());

    for (size_t i = 0; i < num_per_cf.size(); ++i) {
-      ASSERT_EQ(num_per_cf[i],
-                GetProperty(i, "rocksdb.num-immutable-mem-table"));
+      ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+                                           "rocksdb.num-immutable-mem-table"));
    }
  }

@ -387,7 +387,7 @@ TEST(ColumnFamilyTest, DropTest) {
    Open({"default"});
    CreateColumnFamiliesAndReopen({"pikachu"});
    for (int i = 0; i < 100; ++i) {
-      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+      ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i)));
    }
    ASSERT_OK(Flush(1));

@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  WriteBatch batch;
+  batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
  DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
  Status s = db_->Write(WriteOptions(), &batch);
  ASSERT_TRUE(s.IsInvalidArgument());
  Close();
@ -523,8 +529,28 @@ TEST(ColumnFamilyTest, FlushTest) {
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));
  ASSERT_OK(Put(2, "fodor", "v5"));
+
+  for (int j = 0; j < 2; j++) {
+    ReadOptions ro;
+    std::vector<Iterator*> iterators;
+    // Hold super version.
+    if (j == 0) {
+      ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+    }
+
    for (int i = 0; i < 3; ++i) {
+      uint64_t max_total_in_memory_state =
+          dbfull()->TEST_max_total_in_memory_state();
      Flush(i);
+      ASSERT_EQ(dbfull()->TEST_max_total_in_memory_state(),
+                max_total_in_memory_state);
+    }
+    ASSERT_OK(Put(1, "foofoo", "bar"));
+    ASSERT_OK(Put(0, "foofoo", "bar"));
+
+    for (auto* it : iterators) {
+      delete it;
+    }
  }
  Reopen();

@ -705,6 +731,27 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
  Close();
 }

+TEST(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+  Open();
+  auto* s1 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s1 != nullptr);
+  dbfull()->ReleaseSnapshot(s1);
+
+  // Add a column family that doesn't support snapshot
+  ColumnFamilyOptions first;
+  first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024));
+  CreateColumnFamilies({"first"}, {first});
+  auto* s2 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s2 == nullptr);
+
+  // Add a column family that supports snapshot. Snapshot stays not supported.
+  ColumnFamilyOptions second;
+  CreateColumnFamilies({"second"}, {second});
+  auto* s3 = dbfull()->GetSnapshot();
+  ASSERT_TRUE(s3 == nullptr);
+  Close();
+}
+
 TEST(ColumnFamilyTest, DifferentMergeOperators) {
  Open();
  CreateColumnFamilies({"first", "second"});
@ -768,14 +815,14 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
    PutRandomData(1, 11, 10000);
    WaitForFlush(1);
-    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
+    ASSERT_EQ(ToString(i + 1), FilesPerLevel(1));
  }

  // SETUP column family "two" -- level style with 4 levels
  for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
    PutRandomData(2, 15, 10000);
    WaitForFlush(2);
-    ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
+    ASSERT_EQ(ToString(i + 1), FilesPerLevel(2));
  }

  // TRIGGER compaction "one"
@ -910,11 +957,11 @@ TEST(ColumnFamilyTest, DontRollEmptyLogs) {
  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});

  for (size_t i = 0; i < handles_.size(); ++i) {
-    PutRandomData(i, 10, 100);
+    PutRandomData(static_cast<int>(i), 10, 100);
  }
  int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
  // this will trigger the flushes
-  for (size_t i = 0; i <= 4; ++i) {
+  for (int i = 0; i <= 4; ++i) {
    ASSERT_OK(Flush(i));
  }

--- a/db/compaction.cc
+++ b/db/compaction.cc
@ -9,7 +9,10 @@

 #include "db/compaction.h"

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <vector>

@ -26,7 +29,17 @@ uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  return sum;
 }

-Compaction::Compaction(Version* input_version, int start_level, int out_level,
+void Compaction::SetInputVersion(Version* _input_version) {
+  input_version_ = _input_version;
+  cfd_ = input_version_->cfd();
+
+  cfd_->Ref();
+  input_version_->Ref();
+  edit_ = new VersionEdit();
+  edit_->SetColumnFamily(cfd_->GetID());
+}
+
+Compaction::Compaction(int number_levels, int start_level, int out_level,
                       uint64_t target_file_size,
                       uint64_t max_grandparent_overlap_bytes,
                       uint32_t output_path_id,
@ -36,9 +49,10 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
      output_level_(out_level),
      max_output_file_size_(target_file_size),
      max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
-      input_version_(input_version),
-      number_levels_(input_version_->NumberLevels()),
-      cfd_(input_version_->cfd_),
+      input_version_(nullptr),
+      edit_(nullptr),
+      number_levels_(number_levels),
+      cfd_(nullptr),
      output_path_id_(output_path_id),
      output_compression_(output_compression),
      seek_compaction_(seek_compaction),
@ -53,11 +67,6 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
      is_full_compaction_(false),
      is_manual_compaction_(false),
      level_ptrs_(std::vector<size_t>(number_levels_)) {
-
-  cfd_->Ref();
-  input_version_->Ref();
-  edit_ = new VersionEdit();
-  edit_->SetColumnFamily(cfd_->GetID());
  for (int i = 0; i < number_levels_; i++) {
    level_ptrs_[i] = 0;
  }
@ -69,6 +78,38 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level,
  }
 }

+Compaction::Compaction(VersionStorageInfo* vstorage,
+    const autovector<CompactionInputFiles>& _inputs,
+    int _start_level, int _output_level,
+    uint64_t _max_grandparent_overlap_bytes,
+    const CompactionOptions& _options,
+    bool _deletion_compaction)
+    : start_level_(_start_level),
+      output_level_(_output_level),
+      max_output_file_size_(_options.output_file_size_limit),
+      max_grandparent_overlap_bytes_(_max_grandparent_overlap_bytes),
+      input_version_(nullptr),
+      number_levels_(vstorage->num_levels()),
+      cfd_(nullptr),
+      output_compression_(_options.compression),
+      seek_compaction_(false),
+      deletion_compaction_(_deletion_compaction),
+      inputs_(_inputs),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0),
+      base_index_(-1),
+      parent_index_(-1),
+      score_(0),
+      bottommost_level_(false),
+      is_full_compaction_(false),
+      is_manual_compaction_(false),
+      level_ptrs_(std::vector<size_t>(number_levels_)) {
+  for (int i = 0; i < number_levels_; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
 Compaction::~Compaction() {
  delete edit_;
  if (input_version_ != nullptr) {
@ -83,8 +124,9 @@ Compaction::~Compaction() {

 void Compaction::GenerateFileLevels() {
  input_levels_.resize(num_input_levels());
-  for (int which = 0; which < num_input_levels(); which++) {
-    DoGenerateFileLevel(&input_levels_[which], inputs_[which].files, &arena_);
+  for (size_t which = 0; which < num_input_levels(); which++) {
+    DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+                              &arena_);
  }
 }

@ -98,26 +140,29 @@ bool Compaction::IsTrivialMove() const {
          num_input_levels() == 2 &&
          num_input_files(0) == 1 &&
          num_input_files(1) == 0 &&
+          input(0, 0)->fd.GetPathId() == GetOutputPathId() &&
          TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
 }

-void Compaction::AddInputDeletions(VersionEdit* edit) {
-  for (int which = 0; which < num_input_levels(); which++) {
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+  for (size_t which = 0; which < num_input_levels(); which++) {
    for (size_t i = 0; i < inputs_[which].size(); i++) {
-      edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+      out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
    }
  }
 }

 bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
-  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+  assert(input_version_ != nullptr);
+  assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
    return bottommost_level_;
  }
  // Maybe use binary search to find right entry instead of linear search?
  const Comparator* user_cmp = cfd_->user_comparator();
  for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
-    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    const std::vector<FileMetaData*>& files =
+        input_version_->storage_info()->LevelFiles(lvl);
    for (; level_ptrs_[lvl] < files.size(); ) {
      FileMetaData* f = files[level_ptrs_[lvl]];
      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
@ -163,7 +208,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {

 // Mark (or clear) each file that is being compacted
 void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
-  for (int i = 0; i < num_input_levels(); i++) {
+  for (size_t i = 0; i < num_input_levels(); i++) {
    for (unsigned int j = 0; j < inputs_[i].size(); j++) {
      assert(mark_as_compacted ? !inputs_[i][j]->being_compacted :
                                  inputs_[i][j]->being_compacted);
@ -173,9 +218,9 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
 }

 // Is this compaction producing files at the bottommost level?
-void Compaction::SetupBottomMostLevel(bool is_manual) {
-  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+void Compaction::SetupBottomMostLevel(VersionStorageInfo* vstorage,
+                                      bool is_manual, bool level0_only) {
+  if (level0_only) {
    // If universal compaction style is used and manual
    // compaction is occuring, then we are guaranteed that
    // all files will be picked in a single compaction
@ -190,32 +235,20 @@ void Compaction::SetupBottomMostLevel(bool is_manual) {
  bottommost_level_ = true;
  // checks whether there are files living beyond the output_level.
  for (int i = output_level_ + 1; i < number_levels_; i++) {
-    if (input_version_->NumLevelFiles(i) > 0) {
+    if (vstorage->NumLevelFiles(i) > 0) {
      bottommost_level_ = false;
      break;
    }
  }
 }

-void Compaction::ReleaseInputs() {
-  if (input_version_ != nullptr) {
-    input_version_->Unref();
-    input_version_ = nullptr;
-  }
-  if (cfd_ != nullptr) {
-    if (cfd_->Unref()) {
-      delete cfd_;
-    }
-    cfd_ = nullptr;
-  }
-}
-
 void Compaction::ReleaseCompactionFiles(Status status) {
  cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
 }

 void Compaction::ResetNextCompactionIndex() {
-  input_version_->ResetNextCompactionIndex(start_level_);
+  assert(input_version_ != nullptr);
+  input_version_->storage_info()->ResetNextCompactionIndex(start_level_);
 }

 namespace {
@ -248,14 +281,15 @@ void Compaction::Summary(char* output, int len) {
    return;
  }

-  for (int level = 0; level < num_input_levels(); ++level) {
-    if (level > 0) {
+  for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+    if (level_iter > 0) {
      write += snprintf(output + write, len - write, "], [");
      if (write < 0 || write >= len) {
        return;
      }
    }
-    write += InputSummary(inputs_[level].files, output + write, len - write);
+    write +=
+        InputSummary(inputs_[level_iter].files, output + write, len - write);
    if (write < 0 || write >= len) {
      return;
    }
@ -264,15 +298,15 @@ void Compaction::Summary(char* output, int len) {
  snprintf(output + write, len - write, "]");
 }

-uint64_t Compaction::OutputFilePreallocationSize() {
+uint64_t Compaction::OutputFilePreallocationSize(
+    const MutableCFOptions& mutable_options) {
  uint64_t preallocation_size = 0;

-  if (cfd_->options()->compaction_style == kCompactionStyleLevel) {
-    preallocation_size =
-        cfd_->compaction_picker()->MaxFileSizeForLevel(output_level());
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    preallocation_size = mutable_options.MaxFileSizeForLevel(output_level());
  } else {
-    for (int level = 0; level < num_input_levels(); ++level) {
-      for (const auto& f : inputs_[level].files) {
+    for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+      for (const auto& f : inputs_[level_iter].files) {
        preallocation_size += f->fd.GetFileSize();
      }
    }
@ -282,4 +316,15 @@ uint64_t Compaction::OutputFilePreallocationSize() {
  return preallocation_size * 1.1;
 }

+Compaction* Compaction::TEST_NewCompaction(
+    int num_levels, int start_level, int out_level, uint64_t target_file_size,
+    uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id,
+    CompressionType output_compression, bool seek_compaction,
+    bool deletion_compaction) {
+  return new Compaction(num_levels, start_level, out_level, target_file_size,
+                        max_grandparent_overlap_bytes, output_path_id,
+                        output_compression, seek_compaction,
+                        deletion_compaction);
+}
+
 }  // namespace rocksdb
--- a/db/compaction.h
+++ b/db/compaction.h
@ -10,6 +10,7 @@
 #pragma once
 #include "util/arena.h"
 #include "util/autovector.h"
+#include "util/mutable_cf_options.h"
 #include "db/version_set.h"

 namespace rocksdb {
@ -22,15 +23,23 @@ struct CompactionInputFiles {
  inline bool empty() const { return files.empty(); }
  inline size_t size() const { return files.size(); }
  inline void clear() { files.clear(); }
-  inline FileMetaData* operator[](int i) const { return files[i]; }
+  inline FileMetaData* operator[](size_t i) const { return files[i]; }
 };

 class Version;
 class ColumnFamilyData;
+class VersionStorageInfo;

 // A Compaction encapsulates information about a compaction.
 class Compaction {
 public:
+  Compaction(VersionStorageInfo* input_version,
+    const autovector<CompactionInputFiles>& inputs,
+    int start_level, int output_level,
+    uint64_t max_grandparent_overlap_bytes,
+    const CompactionOptions& options,
+    bool deletion_compaction);
+
  // No copying allowed
  Compaction(const Compaction&) = delete;
  void operator=(const Compaction&) = delete;
@ -39,7 +48,7 @@ class Compaction {

  // Returns the level associated to the specified compaction input level.
  // If compaction_input_level is not specified, then input_level is set to 0.
-  int level(int compaction_input_level = 0) const {
+  int level(size_t compaction_input_level = 0) const {
    return inputs_[compaction_input_level].level;
  }

@ -47,7 +56,7 @@ class Compaction {
  int output_level() const { return output_level_; }

  // Returns the number of input levels in this compaction.
-  int num_input_levels() const { return inputs_.size(); }
+  size_t num_input_levels() const { return inputs_.size(); }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
@ -57,7 +66,7 @@ class Compaction {
  // compaction input level.
  // The function will return 0 if when "compaction_input_level" < 0
  // or "compaction_input_level" >= "num_input_levels()".
-  int num_input_files(size_t compaction_input_level) const {
+  size_t num_input_files(size_t compaction_input_level) const {
    if (compaction_input_level < inputs_.size()) {
      return inputs_[compaction_input_level].size();
    }
@ -74,7 +83,7 @@ class Compaction {
  // specified compaction input level.
  // REQUIREMENT: "compaction_input_level" must be >= 0 and
  //              < "input_levels()"
-  FileMetaData* input(size_t compaction_input_level, int i) const {
+  FileMetaData* input(size_t compaction_input_level, size_t i) const {
    assert(compaction_input_level < inputs_.size());
    return inputs_[compaction_input_level][i];
  }
@ -88,8 +97,8 @@ class Compaction {
    return &inputs_[compaction_input_level].files;
  }

-  // Returns the FileLevel of the specified compaction input level.
-  FileLevel* input_levels(int compaction_input_level) {
+  // Returns the LevelFilesBrief of the specified compaction input level.
+  LevelFilesBrief* input_levels(size_t compaction_input_level) {
    return &input_levels_[compaction_input_level];
  }

@ -110,7 +119,7 @@ class Compaction {
  // moving a single input file to the next level (no merging or splitting)
  bool IsTrivialMove() const;

-  // If true, then the comaction can be done by simply deleting input files.
+  // If true, then the compaction can be done by simply deleting input files.
  bool IsDeletionCompaction() const {
    return deletion_compaction_;
  }
@ -126,10 +135,6 @@ class Compaction {
  // before processing "internal_key".
  bool ShouldStopBefore(const Slice& internal_key);

-  // Release the input version for the compaction, once the compaction
-  // is successful.
-  void ReleaseInputs();
-
  // Clear all files to indicate that they are not being compacted
  // Delete this compaction from the list of running compactions.
  void ReleaseCompactionFiles(Status status);
@ -151,10 +156,38 @@ class Compaction {
  // Was this compaction triggered manually by the client?
  bool IsManualCompaction() { return is_manual_compaction_; }

+  void SetOutputPathId(uint32_t path_id) { output_path_id_ = path_id; }
+
+  // Return the MutableCFOptions that should be used throughout the compaction
+  // procedure
+  const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; }
+
  // Returns the size in bytes that the output file should be preallocated to.
  // In level compaction, that is max_file_size_. In universal compaction, that
  // is the sum of all input file sizes.
-  uint64_t OutputFilePreallocationSize();
+  uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options);
+
+  void SetInputVersion(Version* input_version);
+
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+  // Initialize whether the compaction is producing files at the
+  // bottommost level.
+  //
+  // @see BottomMostLevel()
+  void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual,
+                            bool level0_only);
+
+  static Compaction* TEST_NewCompaction(
+      int num_levels, int start_level, int out_level, uint64_t target_file_size,
+      uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id,
+      CompressionType output_compression, bool seek_compaction = false,
+      bool deletion_compaction = false);
+
+  CompactionInputFiles* TEST_GetInputFiles(int l) {
+    return &inputs_[l];
+  }

 private:
  friend class CompactionPicker;
@ -162,7 +195,7 @@ class Compaction {
  friend class FIFOCompactionPicker;
  friend class LevelCompactionPicker;

-  Compaction(Version* input_version, int start_level, int out_level,
+  Compaction(int num_levels, int start_level, int out_level,
             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
             uint32_t output_path_id, CompressionType output_compression,
             bool seek_compaction = false, bool deletion_compaction = false);
@ -171,6 +204,7 @@ class Compaction {
  const int output_level_;  // levels to which output files are stored
  uint64_t max_output_file_size_;
  uint64_t max_grandparent_overlap_bytes_;
+  MutableCFOptions mutable_cf_options_;
  Version* input_version_;
  VersionEdit* edit_;
  int number_levels_;
@ -187,7 +221,7 @@ class Compaction {
  autovector<CompactionInputFiles> inputs_;

  // A copy of inputs_, organized more closely in memory
-  autovector<FileLevel, 2> input_levels_;
+  autovector<LevelFilesBrief, 2> input_levels_;

  // State used to check for number of of overlapping grandparent files
  // (grandparent == "output_level_ + 1")
@ -217,15 +251,6 @@ class Compaction {
  // records indices for all levels beyond "output_level_".
  std::vector<size_t> level_ptrs_;

-  // mark (or clear) all files that are being compacted
-  void MarkFilesBeingCompacted(bool mark_as_compacted);
-
-  // Initialize whether the compaction is producing files at the
-  // bottommost level.
-  //
-  // @see BottomMostLevel()
-  void SetupBottomMostLevel(bool is_manual);
-
  // In case of compaction error, reset the nextIndex that is used
  // to pick up the next file to be compacted from files_by_size_
  void ResetNextCompactionIndex();
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
--- a/db/compaction_job.h
+++ b/db/compaction_job.h
@ -0,0 +1,127 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+#include <functional>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "db/memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"
+#include "db/job_context.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class CompactionJob {
+ public:
+  // TODO(icanadi) make effort to reduce number of parameters here
+  // IMPORTANT: mutable_cf_options needs to be alive while CompactionJob is
+  // alive
+  CompactionJob(Compaction* compaction, const DBOptions& db_options,
+                const MutableCFOptions& mutable_cf_options,
+                const EnvOptions& env_options, VersionSet* versions,
+                std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+                Directory* db_directory, Directory* output_directory,
+                Statistics* stats, SnapshotList* snapshot_list,
+                bool is_snapshot_supported, std::shared_ptr<Cache> table_cache,
+                std::function<uint64_t()> yield_callback);
+
+  ~CompactionJob() { assert(compact_ == nullptr); }
+
+  // no copy/move
+  CompactionJob(CompactionJob&& job) = delete;
+  CompactionJob(const CompactionJob& job) = delete;
+  CompactionJob& operator=(const CompactionJob& job) = delete;
+
+  // REQUIRED: mutex held
+  void Prepare();
+  // REQUIRED mutex not held
+  Status Run();
+  // REQUIRED: mutex held
+  // status is the return of Run()
+  void Install(Status* status, InstrumentedMutex* db_mutex);
+
+ private:
+  void AllocateCompactionOutputFileNumbers();
+  // Call compaction filter if is_compaction_v2 is not true. Then iterate
+  // through input and compact the kv-pairs
+  Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input,
+                                   bool is_compaction_v2);
+  // Call compaction_filter_v2->Filter() on kv-pairs in compact
+  void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2);
+  Status FinishCompactionOutputFile(Iterator* input);
+  Status InstallCompactionResults(InstrumentedMutex* db_mutex);
+  SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
+      SequenceNumber* prev_snapshot);
+  void RecordCompactionIOStats();
+  Status OpenCompactionOutputFile();
+  void CleanupCompaction(const Status& status);
+
+  // CompactionJob state
+  struct CompactionState;
+  CompactionState* compact_;
+
+  bool bottommost_level_;
+  SequenceNumber earliest_snapshot_;
+  SequenceNumber visible_at_tip_;
+  SequenceNumber latest_snapshot_;
+
+  InternalStats::CompactionStats compaction_stats_;
+
+  // DBImpl state
+  const DBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  const EnvOptions& env_options_;
+  Env* env_;
+  VersionSet* versions_;
+  std::atomic<bool>* shutting_down_;
+  LogBuffer* log_buffer_;
+  Directory* db_directory_;
+  Directory* output_directory_;
+  Statistics* stats_;
+  SnapshotList* snapshots_;
+  bool is_snapshot_supported_;
+  std::shared_ptr<Cache> table_cache_;
+
+  // yield callback
+  std::function<uint64_t()> yield_callback_;
+};
+
+}  // namespace rocksdb
--- a/db/compaction_job_test.cc
+++ b/db/compaction_job_test.cc
@ -0,0 +1,183 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+
+#include "db/compaction_job.h"
+#include "db/column_family.h"
+#include "db/version_set.h"
+#include "db/writebuffer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/options.h"
+#include "rocksdb/db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "table/mock_table.h"
+
+namespace rocksdb {
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest {
+ public:
+  CompactionJobTest()
+      : env_(Env::Default()),
+        dbname_(test::TmpDir() + "/compaction_job_test"),
+        mutable_cf_options_(Options(), ImmutableCFOptions(Options())),
+        table_cache_(NewLRUCache(50000, 16, 8)),
+        write_buffer_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_,
+                                 &write_controller_)),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {
+    ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    NewDB();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+
+    ASSERT_OK(versions_->Recover(column_families, false));
+  }
+
+  std::string GenerateFileName(uint64_t file_number) {
+    FileMetaData meta;
+    std::vector<DbPath> db_paths;
+    db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+    meta.fd = FileDescriptor(file_number, 0, 0);
+    return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+  }
+
+  // returns expected result after compaction
+  mock::MockFileContents CreateTwoFiles() {
+    mock::MockFileContents expected_results;
+    const int kKeysPerFile = 10000;
+    SequenceNumber sequence_number = 0;
+    for (int i = 0; i < 2; ++i) {
+      mock::MockFileContents contents;
+      SequenceNumber smallest_seqno = 0, largest_seqno = 0;
+      InternalKey smallest, largest;
+      for (int k = 0; k < kKeysPerFile; ++k) {
+        auto key = ToString(i * (kKeysPerFile / 2) + k);
+        auto value = ToString(i * kKeysPerFile + k);
+        InternalKey internal_key(key, ++sequence_number, kTypeValue);
+        if (k == 0) {
+          smallest = internal_key;
+          smallest_seqno = sequence_number;
+        } else if (k == kKeysPerFile - 1) {
+          largest = internal_key;
+          largest_seqno = sequence_number;
+        }
+        std::pair<std::string, std::string> key_value(
+            {internal_key.Encode().ToString(), value});
+        contents.insert(key_value);
+        if (i == 1 || k < kKeysPerFile / 2) {
+          expected_results.insert(key_value);
+        }
+      }
+
+      uint64_t file_number = versions_->NewFileNumber();
+      ASSERT_OK(mock_table_factory_->CreateMockTable(
+          env_, GenerateFileName(file_number), std::move(contents)));
+
+      VersionEdit edit;
+      edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno,
+                   largest_seqno);
+
+      mutex_.Lock();
+      versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                             mutable_cf_options_, &edit, &mutex_);
+      mutex_.Unlock();
+    }
+    versions_->SetLastSequence(sequence_number);
+    return expected_results;
+  }
+
+  void NewDB() {
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+  }
+
+  Env* env_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  MutableCFOptions mutable_cf_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  DBOptions db_options_;
+  ColumnFamilyOptions cf_options_;
+  WriteBuffer write_buffer_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST(CompactionJobTest, Simple) {
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+  auto expected_results = CreateTwoFiles();
+
+  auto files = cfd->current()->storage_info()->LevelFiles(0);
+  ASSERT_EQ(2U, files.size());
+
+  std::unique_ptr<Compaction> compaction(Compaction::TEST_NewCompaction(
+      7, 0, 1, 1024 * 1024, 10, 0, kNoCompression));
+  compaction->SetInputVersion(cfd->current());
+
+  auto compaction_input_files = compaction->TEST_GetInputFiles(0);
+  compaction_input_files->level = 0;
+  compaction_input_files->files.push_back(files[0]);
+  compaction_input_files->files.push_back(files[1]);
+
+  SnapshotList snapshots;
+  int yield_callback_called = 0;
+  std::function<uint64_t()> yield_callback = [&]() {
+    yield_callback_called++;
+    return 0;
+  };
+  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+  mutex_.Lock();
+  CompactionJob compaction_job(compaction.get(), db_options_,
+                               *cfd->GetLatestMutableCFOptions(), env_options_,
+                               versions_.get(), &shutting_down_, &log_buffer,
+                               nullptr, nullptr, nullptr, &snapshots, true,
+                               table_cache_, std::move(yield_callback));
+  compaction_job.Prepare();
+  mutex_.Unlock();
+  ASSERT_OK(compaction_job.Run());
+  mutex_.Lock();
+  Status s;
+  compaction_job.Install(&s, &mutex_);
+  ASSERT_OK(s);
+  mutex_.Unlock();
+
+  mock_table_factory_->AssertLatestFile(expected_results);
+  ASSERT_EQ(yield_callback_called, 20000);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
--- a/db/compaction_picker.cc
+++ b/db/compaction_picker.cc
--- a/db/compaction_picker.h
+++ b/db/compaction_picker.h
@ -8,32 +8,43 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #pragma once
+#include <vector>
+#include <memory>
+#include <set>
+#include <unordered_set>
+
 #include "db/version_set.h"
 #include "db/compaction.h"
 #include "rocksdb/status.h"
 #include "rocksdb/options.h"
 #include "rocksdb/env.h"
+#include "util/mutable_cf_options.h"

 #include <vector>
 #include <memory>
 #include <set>
+#include <string>

 namespace rocksdb {

 class LogBuffer;
 class Compaction;
-class Version;
+class VersionStorageInfo;
+struct CompactionInputFiles;

 class CompactionPicker {
 public:
-  CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
+  CompactionPicker(const ImmutableCFOptions& ioptions,
+                   const InternalKeyComparator* icmp);
  virtual ~CompactionPicker();

  // Pick level and inputs for a new compaction.
  // Returns nullptr if there is no compaction to be done.
  // Otherwise returns a pointer to a heap-allocated object that
  // describes the compaction.  Caller should delete the result.
-  virtual Compaction* PickCompaction(Version* version,
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
                                     LogBuffer* log_buffer) = 0;

  // Return a compaction object for compacting the range [begin,end] in
@ -47,36 +58,59 @@ class CompactionPicker {
  // compaction_end will be set to nullptr.
  // Client is responsible for compaction_end storage -- when called,
  // *compaction_end should point to valid InternalKey!
-  virtual Compaction* CompactRange(Version* version, int input_level,
-                                   int output_level, uint32_t output_path_id,
-                                   const InternalKey* begin,
-                                   const InternalKey* end,
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
      InternalKey** compaction_end);

  // Given the current number of levels, returns the lowest allowed level
  // for compaction input.
  virtual int MaxInputLevel(int current_num_levels) const = 0;

-  // Free up the files that participated in a compaction
-  void ReleaseCompactionFiles(Compaction* c, Status status);
-
-  // Return the total amount of data that is undergoing
-  // compactions per level
-  void SizeBeingCompacted(std::vector<uint64_t>& sizes);
+  // The maximum allowed output level.  Default value is NumberLevels() - 1.
+  virtual int MaxOutputLevel() const {
+    return NumberLevels() - 1;
+  }

-  // Returns maximum total overlap bytes with grandparent
-  // level (i.e., level+2) before we stop building a single
-  // file in level->level+1 compaction.
-  uint64_t MaxGrandParentOverlapBytes(int level);
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+  // Sanitize the input set of compaction input files.
+  // When the input parameters do not describe a valid compaction, the
+  // function will try to fix the input_files by adding necessary
+  // files.  If it's not possible to conver an invalid input_files
+  // into a valid one by adding more files, the function will return a
+  // non-ok status with specific reason.
+#ifndef ROCKSDB_LITE
+  Status SanitizeCompactionInputFiles(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta,
+      const int output_level) const;
+#endif  // ROCKSDB_LITE

-  // Returns maximum total bytes of data on a given level.
-  double MaxBytesForLevel(int level);
+  // Free up the files that participated in a compaction
+  void ReleaseCompactionFiles(Compaction* c, Status status);

-  // Get the max file size in a given level.
-  uint64_t MaxFileSizeForLevel(int level) const;
+  // Returns true if any one of the specified files are being compacted
+  bool FilesInCompaction(const std::vector<FileMetaData*>& files);
+
+  // Takes a list of CompactionInputFiles and returns a Compaction object.
+  Compaction* FormCompaction(
+      const CompactionOptions& compact_options,
+      const autovector<CompactionInputFiles>& input_files,
+      int output_level, VersionStorageInfo* vstorage,
+      const MutableCFOptions& mutable_cf_options) const;
+
+  // Converts a set of compaction input file numbers into
+  // a list of CompactionInputFiles.
+  Status GetCompactionInputsFromFileNumbers(
+      autovector<CompactionInputFiles>* input_files,
+      std::unordered_set<uint64_t>* input_set,
+      const VersionStorageInfo* vstorage,
+      const CompactionOptions& compact_options) const;

 protected:
-  int NumberLevels() const { return num_levels_; }
+  int NumberLevels() const { return ioptions_.num_levels; }

  // Stores the minimal range that covers all entries in inputs in
  // *smallest, *largest.
@ -101,110 +135,179 @@ class CompactionPicker {
  // populated.
  //
  // Will return false if it is impossible to apply this compaction.
-  bool ExpandWhileOverlapping(Compaction* c);
-
-  uint64_t ExpandedCompactionByteSizeLimit(int level);
-
-  // Returns true if any one of the specified files are being compacted
-  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+  bool ExpandWhileOverlapping(const std::string& cf_name,
+                              VersionStorageInfo* vstorage, Compaction* c);

  // Returns true if any one of the parent files are being compacted
-  bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
+  bool ParentRangeInCompaction(VersionStorageInfo* vstorage,
+                               const InternalKey* smallest,
                               const InternalKey* largest, int level,
                               int* index);

-  void SetupOtherInputs(Compaction* c);
+  void SetupOtherInputs(const std::string& cf_name,
+                        const MutableCFOptions& mutable_cf_options,
+                        VersionStorageInfo* vstorage, Compaction* c);
+
+  const ImmutableCFOptions& ioptions_;
+
+  // A helper function to SanitizeCompactionInputFiles() that
+  // sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+  virtual Status SanitizeCompactionInputFilesForAllLevels(
+      std::unordered_set<uint64_t>* input_files,
+      const ColumnFamilyMetaData& cf_meta,
+      const int output_level) const;
+#endif  // ROCKSDB_LITE

  // record all the ongoing compactions for all levels
  std::vector<std::set<Compaction*>> compactions_in_progress_;

-  // Per-level target file size.
-  std::unique_ptr<uint64_t[]> max_file_size_;
+  const InternalKeyComparator* const icmp_;
+};
+
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+  LevelCompactionPicker(const ImmutableCFOptions& ioptions,
+                        const InternalKeyComparator* icmp)
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
+                                     LogBuffer* log_buffer) override;

-  // Per-level max bytes
-  std::unique_ptr<uint64_t[]> level_max_bytes_;
+  // Returns current_num_levels - 2, meaning the last level cannot be
+  // compaction input level.
+  virtual int MaxInputLevel(int current_num_levels) const override {
+    return current_num_levels - 2;
+  }

-  const Options* const options_;
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;

- private:
-  int num_levels_;
+  // Pick a path ID to place a newly generated file, with its level
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            const MutableCFOptions& mutable_cf_options,
+                            int level);

-  const InternalKeyComparator* const icmp_;
+ private:
+  // For the specfied level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options,
+                                   VersionStorageInfo* vstorage, int level,
+                                   double score);
 };

+#ifndef ROCKSDB_LITE
 class UniversalCompactionPicker : public CompactionPicker {
 public:
-  UniversalCompactionPicker(const Options* options,
+  UniversalCompactionPicker(const ImmutableCFOptions& ioptions,
                            const InternalKeyComparator* icmp)
-      : CompactionPicker(options, icmp) {}
-  virtual Compaction* PickCompaction(Version* version,
+      : CompactionPicker(ioptions, icmp) {}
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* vstorage,
                                     LogBuffer* log_buffer) override;

-  // The maxinum allowed input level.  Always return 0.
+  // The maxinum allowed input level.  Always returns 0.
  virtual int MaxInputLevel(int current_num_levels) const override {
    return 0;
  }

+  // The maximum allowed output level.  Always returns 0.
+  virtual int MaxOutputLevel() const override {
+    return 0;
+  }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;
+
 private:
  // Pick Universal compaction to limit read amplification
-  Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
-                                             unsigned int ratio,
-                                             unsigned int num_files,
-                                             LogBuffer* log_buffer);
+  Compaction* PickCompactionUniversalReadAmp(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, double score, unsigned int ratio,
+      unsigned int num_files, LogBuffer* log_buffer);

  // Pick Universal compaction to limit space amplification.
-  Compaction* PickCompactionUniversalSizeAmp(Version* version, double score,
-                                             LogBuffer* log_buffer);
+  Compaction* PickCompactionUniversalSizeAmp(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer);

  // Pick a path ID to place a newly generated file, with its estimated file
  // size.
-  static uint32_t GetPathId(const Options& options, uint64_t file_size);
+  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+                            uint64_t file_size);
 };

-class LevelCompactionPicker : public CompactionPicker {
+class FIFOCompactionPicker : public CompactionPicker {
 public:
-  LevelCompactionPicker(const Options* options,
+  FIFOCompactionPicker(const ImmutableCFOptions& ioptions,
                       const InternalKeyComparator* icmp)
-      : CompactionPicker(options, icmp) {}
-  virtual Compaction* PickCompaction(Version* version,
+      : CompactionPicker(ioptions, icmp) {}
+
+  virtual Compaction* PickCompaction(const std::string& cf_name,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     VersionStorageInfo* version,
                                     LogBuffer* log_buffer) override;

-  // Returns current_num_levels - 2, meaning the last level cannot be
-  // compaction input level.
+  virtual Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end) override;
+
+  // The maxinum allowed input level.  Always returns 0.
  virtual int MaxInputLevel(int current_num_levels) const override {
-    return current_num_levels - 2;
+    return 0;
  }

- private:
-  // For the specfied level, pick a compaction.
-  // Returns nullptr if there is no compaction to be done.
-  // If level is 0 and there is already a compaction on that level, this
-  // function will return nullptr.
-  Compaction* PickCompactionBySize(Version* version, int level, double score);
+  // The maximum allowed output level.  Always returns 0.
+  virtual int MaxOutputLevel() const override {
+    return 0;
+  }
+
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override;
 };

-class FIFOCompactionPicker : public CompactionPicker {
+class NullCompactionPicker : public CompactionPicker {
 public:
-  FIFOCompactionPicker(const Options* options,
-                       const InternalKeyComparator* icmp)
-      : CompactionPicker(options, icmp) {}
+  NullCompactionPicker(const ImmutableCFOptions& ioptions,
+                       const InternalKeyComparator* icmp) :
+      CompactionPicker(ioptions, icmp) {}
+  virtual ~NullCompactionPicker() {}
+
+  // Always return "nullptr"
+  Compaction* PickCompaction(const std::string& cf_name,
+                             const MutableCFOptions& mutable_cf_options,
+                             VersionStorageInfo* vstorage,
+                             LogBuffer* log_buffer) override {
+    return nullptr;
+  }

-  virtual Compaction* PickCompaction(Version* version,
-                                     LogBuffer* log_buffer) override;
+  // Always return "nullptr"
+  Compaction* CompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      VersionStorageInfo* vstorage, int input_level, int output_level,
+      uint32_t output_path_id, const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end) override {
+    return nullptr;
+  }

-  virtual Compaction* CompactRange(Version* version, int input_level,
-                                   int output_level, uint32_t output_path_id,
-                                   const InternalKey* begin,
-                                   const InternalKey* end,
-                                   InternalKey** compaction_end) override;
+  // Given the current number of levels, returns the highest allowed level
+  // for compaction input.
+  virtual int MaxInputLevel(int current_num_levels) const {
+    return current_num_levels - 2;
+  }

-  // The maxinum allowed input level.  Always return 0.
-  virtual int MaxInputLevel(int current_num_levels) const override {
-    return 0;
+  // Always returns false.
+  virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const
+      override {
+    return false;
  }
 };
-
-// Utility function
-extern uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files);
+#endif  // !ROCKSDB_LITE

 }  // namespace rocksdb
--- a/db/compaction_picker_test.cc
+++ b/db/compaction_picker_test.cc
@ -0,0 +1,259 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/compaction_picker.h"
+#include <limits>
+#include <string>
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class CountingLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override { log_count++; }
+  size_t log_count;
+};
+
+class CompactionPickerTest {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  LevelCompactionPicker level_compaction_picker;
+  std::string cf_name_;
+  CountingLogger logger_;
+  LogBuffer log_buffer_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::unique_ptr<VersionStorageInfo> vstorage_;
+  std::vector<std::unique_ptr<FileMetaData>> files_;
+
+  CompactionPickerTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_, ioptions_),
+        level_compaction_picker(ioptions_, &icmp_),
+        cf_name_("dummy"),
+        log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+        file_num_(1),
+        vstorage_(nullptr) {
+    fifo_options_.max_table_files_size = 1;
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    ioptions_.db_paths.emplace_back("dummy",
+                                    std::numeric_limits<uint64_t>::max());
+  }
+
+  ~CompactionPickerTest() {
+  }
+
+  void NewVersionStorage(int num_levels, CompactionStyle style) {
+    DeleteVersionStorage();
+    options_.num_levels = num_levels;
+    vstorage_.reset(new VersionStorageInfo(
+        &icmp_, ucmp_, options_.num_levels, style, nullptr));
+  }
+
+  void DeleteVersionStorage() {
+    vstorage_.reset();
+    files_.clear();
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    assert(level < vstorage_->num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, path_id, file_size);
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    vstorage_->AddFile(level, f);
+    files_.emplace_back(f);
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_->UpdateFilesBySize();
+    vstorage_->UpdateNumNonEmptyLevels();
+    vstorage_->GenerateFileIndexer();
+    vstorage_->GenerateLevelFilesBrief();
+    vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_);
+    vstorage_->SetFinalized();
+  }
+};
+
+TEST(CompactionPickerTest, Empty) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  UpdateVersionStorageInfo();
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST(CompactionPickerTest, Single) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "p", "q");
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST(CompactionPickerTest, Level0Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  Add(0, 1U, "150", "200");
+  Add(0, 2U, "200", "250");
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST(CompactionPickerTest, Level1Trigger) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST(CompactionPickerTest, Level1Trigger2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 66U, "150", "200", 1000000001U);
+  Add(1, 88U, "201", "300", 1000000000U);
+  Add(2, 6U, "150", "179", 1000000000U);
+  Add(2, 7U, "180", "220", 1000000000U);
+  Add(2, 8U, "221", "300", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST(CompactionPickerTest, LevelMaxScore) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.target_file_size_base = 10000000;
+  mutable_cf_options_.target_file_size_multiplier = 10;
+  Add(0, 1U, "150", "200", 1000000000U);
+  // Level 1 score 1.2
+  Add(1, 66U, "150", "200", 6000000U);
+  Add(1, 88U, "201", "300", 6000000U);
+  // Level 2 score 1.8. File 7 is the largest. Should be picked
+  Add(2, 6U, "150", "179", 60000000U);
+  Add(2, 7U, "180", "220", 60000001U);
+  Add(2, 8U, "221", "300", 60000000U);
+  // Level 3 score slightly larger than 1
+  Add(3, 26U, "150", "170", 260000000U);
+  Add(3, 27U, "171", "179", 260000000U);
+  Add(3, 28U, "191", "220", 260000000U);
+  Add(3, 29U, "221", "300", 260000000U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST(CompactionPickerTest, NeedsCompactionLevel) {
+  const int kLevels = 6;
+  const int kFileCount = 20;
+  for (int level = 0; level < kLevels - 1; ++level) {
+    uint64_t file_size =
+        mutable_cf_options_.MaxBytesForLevel(level) * 2 / kFileCount;
+    for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+      // start a brand new version in each test.
+      NewVersionStorage(kLevels, kCompactionStyleLevel);
+      for (int i = 0; i < file_count; ++i) {
+        Add(level, i, ToString((i + 100) * 1000).c_str(),
+            ToString((i + 100) * 1000 + 999).c_str(),
+            file_size, 0, i * 100, i * 100 + 99);
+      }
+      UpdateVersionStorageInfo();
+      ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+      ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+                vstorage_->CompactionScore(0) >= 1);
+      // release the version storage
+      DeleteVersionStorage();
+    }
+  }
+}
+
+TEST(CompactionPickerTest, NeedsCompactionUniversal) {
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(
+      ioptions_, &icmp_);
+  // must return false when there's no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  // verify the trigger given different number of L0 files.
+  for (int i = 1;
+       i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+        i * 100 + 99);
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+TEST(CompactionPickerTest, NeedsCompactionFIFO) {
+  NewVersionStorage(1, kCompactionStyleFIFO);
+  const int kFileCount =
+      mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+  const uint64_t kFileSize = 100000;
+  const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+  fifo_options_.max_table_files_size = kMaxSize;
+  ioptions_.compaction_options_fifo = fifo_options_;
+  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+  // must return false when there's no files.
+  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+  // verify whether compaction is needed based on the current
+  // size of L0 files.
+  uint64_t current_size = 0;
+  for (int i = 1; i <= kFileCount; ++i) {
+    Add(0, i, ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        kFileSize, 0, i * 100, i * 100 + 99);
+    current_size += kFileSize;
+    ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+              vstorage_->CompactionScore(0) >= 1);
+  }
+}
+
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
--- a/db/comparator_db_test.cc
+++ b/db/comparator_db_test.cc
@ -0,0 +1,434 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#include <map>
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/hash.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+namespace {
+
+static const Comparator* comparator;
+
+// A comparator for std::map, using comparator
+struct MapComparator {
+  bool operator()(const std::string& a, const std::string& b) const {
+    return comparator->Compare(a, b) < 0;
+  }
+};
+
+typedef std::map<std::string, std::string, MapComparator> KVMap;
+
+class KVIter : public Iterator {
+ public:
+  explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+  virtual bool Valid() const { return iter_ != map_->end(); }
+  virtual void SeekToFirst() { iter_ = map_->begin(); }
+  virtual void SeekToLast() {
+    if (map_->empty()) {
+      iter_ = map_->end();
+    } else {
+      iter_ = map_->find(map_->rbegin()->first);
+    }
+  }
+  virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); }
+  virtual void Next() { ++iter_; }
+  virtual void Prev() {
+    if (iter_ == map_->begin()) {
+      iter_ = map_->end();
+      return;
+    }
+    --iter_;
+  }
+
+  virtual Slice key() const { return iter_->first; }
+  virtual Slice value() const { return iter_->second; }
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  const KVMap* const map_;
+  KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+  ASSERT_EQ(iter1->Valid(), iter2->Valid());
+  if (iter1->Valid()) {
+    ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+    ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+  }
+}
+
+// Measuring operations on DB (expect to be empty).
+// source_strings are candidate keys
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+                            Random* rnd, int num_writes, int num_iter_ops,
+                            int num_trigger_flush) {
+  KVMap map;
+
+  for (int i = 0; i < num_writes; i++) {
+    if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+      db->Flush(FlushOptions());
+    }
+
+    int type = rnd->Uniform(2);
+    int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+    auto& key = source_strings[index];
+    switch (type) {
+      case 0:
+        // put
+        map[key] = key;
+        ASSERT_OK(db->Put(WriteOptions(), key, key));
+        break;
+      case 1:
+        // delete
+        if (map.find(key) != map.end()) {
+          map.erase(key);
+        }
+        ASSERT_OK(db->Delete(WriteOptions(), key));
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  std::unique_ptr<Iterator> result_iter(new KVIter(&map));
+
+  bool is_valid = false;
+  for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter returns the
+    // same key and value
+    int type = rnd->Uniform(6);
+    ASSERT_OK(iter->status());
+    switch (type) {
+      case 0:
+        // Seek to First
+        iter->SeekToFirst();
+        result_iter->SeekToFirst();
+        break;
+      case 1:
+        // Seek to last
+        iter->SeekToLast();
+        result_iter->SeekToLast();
+        break;
+      case 2: {
+        // Seek to random key
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        iter->Seek(key);
+        result_iter->Seek(key);
+        break;
+      }
+      case 3:
+        // Next
+        if (is_valid) {
+          iter->Next();
+          result_iter->Next();
+        } else {
+          continue;
+        }
+        break;
+      case 4:
+        // Prev
+        if (is_valid) {
+          iter->Prev();
+          result_iter->Prev();
+        } else {
+          continue;
+        }
+        break;
+      default: {
+        assert(type == 5);
+        auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+        auto key = source_strings[key_idx];
+        std::string result;
+        auto status = db->Get(ReadOptions(), key, &result);
+        if (map.find(key) == map.end()) {
+          ASSERT_TRUE(status.IsNotFound());
+        } else {
+          ASSERT_EQ(map[key], result);
+        }
+        break;
+      }
+    }
+    AssertItersEqual(iter.get(), result_iter.get());
+    is_valid = iter->Valid();
+  }
+}
+
+class DoubleComparator : public Comparator {
+ public:
+  DoubleComparator() {}
+
+  virtual const char* Name() const { return "DoubleComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    double da = std::stod(a.ToString());
+    double db = std::stod(b.ToString());
+    if (da == db) {
+      return a.compare(b);
+    } else if (da > db) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const {}
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+};
+
+class HashComparator : public Comparator {
+ public:
+  HashComparator() {}
+
+  virtual const char* Name() const { return "HashComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    uint32_t ha = Hash(a.data(), a.size(), 66);
+    uint32_t hb = Hash(b.data(), b.size(), 66);
+    if (ha == hb) {
+      return a.compare(b);
+    } else if (ha > hb) {
+      return 1;
+    } else {
+      return -1;
+    }
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const {}
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+};
+
+class TwoStrComparator : public Comparator {
+ public:
+  TwoStrComparator() {}
+
+  virtual const char* Name() const { return "TwoStrComparator"; }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    assert(a.size() >= 2);
+    assert(b.size() >= 2);
+    size_t size_a1 = static_cast<size_t>(a[0]);
+    size_t size_b1 = static_cast<size_t>(b[0]);
+    size_t size_a2 = static_cast<size_t>(a[1]);
+    size_t size_b2 = static_cast<size_t>(b[1]);
+    assert(size_a1 + size_a2 + 2 == a.size());
+    assert(size_b1 + size_b2 + 2 == b.size());
+
+    Slice a1 = Slice(a.data() + 2, size_a1);
+    Slice b1 = Slice(b.data() + 2, size_b1);
+    Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+    Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+    if (a1 != b1) {
+      return a1.compare(b1);
+    }
+    return a2.compare(b2);
+  }
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const {}
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+};
+}  // namespace
+
+class ComparatorDBTest {
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+  Options last_options_;
+  std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+  ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+    comparator = BytewiseComparator();
+    dbname_ = test::TmpDir() + "/comparator_db_test";
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  ~ComparatorDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+    comparator = BytewiseComparator();
+  }
+
+  DB* GetDB() { return db_; }
+
+  void SetOwnedComparator(const Comparator* cmp) {
+    comparator_guard.reset(cmp);
+    comparator = cmp;
+    last_options_.comparator = cmp;
+  }
+
+  // Return the current option configuration.
+  Options* GetOptions() { return &last_options_; }
+
+  void DestroyAndReopen() {
+    // Destroy using last options
+    Destroy();
+    ASSERT_OK(TryReopen());
+  }
+
+  void Destroy() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, last_options_));
+  }
+
+  Status TryReopen() {
+    delete db_;
+    db_ = nullptr;
+    last_options_.create_if_missing = true;
+
+    return DB::Open(last_options_, dbname_, &db_);
+  }
+};
+
+TEST(ComparatorDBTest, Bytewise) {
+  for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+    DestroyAndReopen();
+    Random rnd(rand_seed);
+    DoRandomIteraratorTest(GetDB(),
+                           {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+                           8, 100, 3);
+  }
+}
+
+TEST(ComparatorDBTest, SimpleSuffixReverseComparator) {
+  SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    std::vector<std::string> source_prefixes;
+    // Randomly generate 5 prefixes
+    for (int i = 0; i < 5; i++) {
+      source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8));
+    }
+    for (int j = 0; j < 20; j++) {
+      int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+      std::string key = source_prefixes[prefix_index] +
+                        test::RandomHumanReadableString(&rnd, rnd.Uniform(8));
+      source_strings.push_back(key);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+  }
+}
+
+TEST(ComparatorDBTest, Uint64Comparator) {
+  SetOwnedComparator(test::Uint64Comparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+    Random64 rnd64(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint64_t r = rnd64.Next();
+      std::string str;
+      str.resize(8);
+      memcpy(&str[0], static_cast<void*>(&r), 8);
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST(ComparatorDBTest, DoubleComparator) {
+  SetOwnedComparator(new DoubleComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      uint32_t r = rnd.Next();
+      uint32_t divide_order = rnd.Uniform(8);
+      double to_divide = 1.0;
+      for (uint32_t j = 0; j < divide_order; j++) {
+        to_divide *= 10.0;
+      }
+      source_strings.push_back(ToString(r / to_divide));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST(ComparatorDBTest, HashComparator) {
+  SetOwnedComparator(new HashComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      source_strings.push_back(test::RandomKey(&rnd, 8));
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+TEST(ComparatorDBTest, TwoStrComparator) {
+  SetOwnedComparator(new TwoStrComparator());
+
+  for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+    Options* opt = GetOptions();
+    opt->comparator = comparator;
+    DestroyAndReopen();
+    Random rnd(rnd_seed);
+
+    std::vector<std::string> source_strings;
+    // Randomly generate source keys
+    for (int i = 0; i < 100; i++) {
+      std::string str;
+      uint32_t size1 = rnd.Uniform(8);
+      uint32_t size2 = rnd.Uniform(8);
+      str.append(1, static_cast<char>(size1));
+      str.append(1, static_cast<char>(size2));
+      str.append(test::RandomKey(&rnd, size1));
+      str.append(test::RandomKey(&rnd, size2));
+      source_strings.push_back(str);
+    }
+
+    DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@ -115,8 +115,8 @@ class CorruptionTest {
        continue;
      }
      missed += (key - next_expected);
-      next_expected = key + 1;
-      if (iter->value() != Value(key, &value_space)) {
+      next_expected = static_cast<unsigned int>(key + 1);
+      if (iter->value() != Value(static_cast<int>(key), &value_space)) {
        bad_values++;
      } else {
        correct++;
@ -131,7 +131,7 @@ class CorruptionTest {
    ASSERT_GE(max_expected, correct);
  }

-  void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) {
+  void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) {
    struct stat sbuf;
    if (stat(fname.c_str(), &sbuf) != 0) {
      const char* msg = strerror(errno);
@ -143,14 +143,14 @@ class CorruptionTest {
      if (-offset > sbuf.st_size) {
        offset = 0;
      } else {
-        offset = sbuf.st_size + offset;
+        offset = static_cast<int>(sbuf.st_size + offset);
      }
    }
    if (offset > sbuf.st_size) {
-      offset = sbuf.st_size;
+      offset = static_cast<int>(sbuf.st_size);
    }
    if (offset + bytes_to_corrupt > sbuf.st_size) {
-      bytes_to_corrupt = sbuf.st_size - offset;
+      bytes_to_corrupt = static_cast<int>(sbuf.st_size - offset);
    }

    // Do it
@ -177,7 +177,7 @@ class CorruptionTest {
          type == filetype &&
          static_cast<int>(number) > picked_number) {  // Pick latest file
        fname = dbname_ + "/" + filenames[i];
-        picked_number = number;
+        picked_number = static_cast<int>(number);
      }
    }
    ASSERT_TRUE(!fname.empty()) << filetype;
@ -231,7 +231,9 @@ TEST(CorruptionTest, Recovery) {
  Check(100, 100);
  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
-  Reopen();
+  ASSERT_TRUE(!TryReopen().ok());
+  options_.paranoid_checks = false;
+  Reopen(&options_);

  // The 64 records in the first two log blocks are completely lost.
  Check(36, 36);
@ -246,7 +248,8 @@ TEST(CorruptionTest, RecoverWriteError) {
 TEST(CorruptionTest, NewFileErrorDuringWrite) {
  // Do enough writing to force minor compaction
  env_.writable_file_error_ = true;
-  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  const int num =
+      static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
  std::string value_storage;
  Status s;
  bool failed = false;
@ -332,6 +335,9 @@ TEST(CorruptionTest, CorruptedDescriptor) {
 }

 TEST(CorruptionTest, CompactionInputError) {
+  Options options;
+  options.max_background_flushes = 0;
+  Reopen(&options);
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_FlushMemTable();
@ -351,6 +357,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
  options.paranoid_checks = true;
  options.write_buffer_size = 131072;
  options.max_write_buffer_number = 2;
+  options.max_background_flushes = 0;
  Reopen(&options);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

--- a/db/cuckoo_table_db_test.cc
+++ b/db/cuckoo_table_db_test.cc
@ -92,7 +92,7 @@ class CuckooTableDBTest {
  // Return spread of files per level
  std::string FilesPerLevel() {
    std::string result;
-    int last_non_zero_offset = 0;
+    size_t last_non_zero_offset = 0;
    for (int level = 0; level < db_->NumberLevels(); level++) {
      int f = NumTableFilesAtLevel(level);
      char buf[100];
@ -218,6 +218,7 @@ TEST(CuckooTableDBTest, Uint64Comparator) {

  // Add more keys.
  ASSERT_OK(Delete(Uint64Key(2)));  // Delete.
+  dbfull()->TEST_FlushMemTable();
  ASSERT_OK(Put(Uint64Key(3), "v0"));  // Update.
  ASSERT_OK(Put(Uint64Key(4), "v4"));
  dbfull()->TEST_FlushMemTable();
@ -227,28 +228,25 @@ TEST(CuckooTableDBTest, Uint64Comparator) {
  ASSERT_EQ("v4", Get(Uint64Key(4)));
 }

-TEST(CuckooTableDBTest, CompactionTrigger) {
+TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+  // Create a big L0 file and check it compacts into multiple files in L1.
  Options options = CurrentOptions();
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
+  options.write_buffer_size = 270 << 10;
+  // Two SST files should be created, each containing 14 keys.
+  // Number of buckets will be 16. Total size ~156 KB.
+  options.target_file_size_base = 160 << 10;
  Reopen(&options);

-  // Write 11 values, each 10016 B
-  for (int idx = 0; idx < 11; ++idx) {
+  // Write 28 values, each 10016 B ~ 10KB
+  for (int idx = 0; idx < 28; ++idx) {
    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
  }
  dbfull()->TEST_WaitForFlushMemTable();
  ASSERT_EQ("1", FilesPerLevel());

-  // Generate one more file in level-0, and should trigger level-0 compaction
-  for (int idx = 11; idx < 22; ++idx) {
-    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
-  }
-  dbfull()->TEST_WaitForFlushMemTable();
  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-
  ASSERT_EQ("0,2", FilesPerLevel());
-  for (int idx = 0; idx < 22; ++idx) {
+  for (int idx = 0; idx < 28; ++idx) {
    ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
  }
 }
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@ -9,29 +9,35 @@

 #ifndef ROCKSDB_LITE

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <string>
 #include <stdint.h>
 #include "db/db_impl.h"
 #include "db/filename.h"
+#include "db/job_context.h"
 #include "db/version_set.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "port/port.h"
 #include "util/mutexlock.h"
 #include "util/sync_point.h"
+#include "util/file_util.h"

 namespace rocksdb {

 Status DBImpl::DisableFileDeletions() {
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
  ++disable_delete_obsolete_files_;
  if (disable_delete_obsolete_files_ == 1) {
-    Log(options_.info_log, "File Deletions Disabled");
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "File Deletions Disabled");
  } else {
-    Log(options_.info_log,
+    Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
        "File Deletions Disabled, but already disabled. Counter: %d",
        disable_delete_obsolete_files_);
  }
@ -39,10 +45,10 @@ Status DBImpl::DisableFileDeletions() {
 }

 Status DBImpl::EnableFileDeletions(bool force) {
-  DeletionState deletion_state;
+  JobContext job_context;
  bool should_purge_files = false;
  {
-    MutexLock l(&mutex_);
+    InstrumentedMutexLock l(&mutex_);
    if (force) {
      // if force, we need to enable file deletions right away
      disable_delete_obsolete_files_ = 0;
@ -50,19 +56,21 @@ Status DBImpl::EnableFileDeletions(bool force) {
      --disable_delete_obsolete_files_;
    }
    if (disable_delete_obsolete_files_ == 0)  {
-      Log(options_.info_log, "File Deletions Enabled");
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "File Deletions Enabled");
      should_purge_files = true;
-      FindObsoleteFiles(deletion_state, true);
+      FindObsoleteFiles(&job_context, true);
    } else {
-      Log(options_.info_log,
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
          "File Deletions Enable, but not really enabled. Counter: %d",
          disable_delete_obsolete_files_);
    }
  }
  if (should_purge_files)  {
-    PurgeObsoleteFiles(deletion_state);
+    PurgeObsoleteFiles(job_context);
  }
-  LogFlush(options_.info_log);
+  job_context.Clean();
+  LogFlush(db_options_.info_log);
  return Status::OK();
 }

@ -95,8 +103,8 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,

    if (!status.ok()) {
      mutex_.Unlock();
-      Log(options_.info_log, "Cannot Flush data %s\n",
-          status.ToString().c_str());
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Cannot Flush data %s\n", status.ToString().c_str());
      return status;
    }
  }
@ -117,65 +125,17 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
  }

  ret.push_back(CurrentFileName(""));
-  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+  ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));

  // find length of manifest file while holding the mutex lock
-  *manifest_file_size = versions_->ManifestFileSize();
+  *manifest_file_size = versions_->manifest_file_size();

  mutex_.Unlock();
  return Status::OK();
 }

 Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
-  // First get sorted files in db dir, then get sorted files from archived
-  // dir, to avoid a race condition where a log file is moved to archived
-  // dir in between.
-  Status s;
-  // list wal files in main db dir.
-  VectorLogPtr logs;
-  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
-  if (!s.ok()) {
-    return s;
-  }
-
-  // Reproduce the race condition where a log file is moved
-  // to archived dir, between these two sync points, used in
-  // (DBTest,TransactionLogIteratorRace)
-  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
-  TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
-
-  files.clear();
-  // list wal files in archive dir.
-  std::string archivedir = ArchivalDirectory(options_.wal_dir);
-  if (env_->FileExists(archivedir)) {
-    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
-    if (!s.ok()) {
-      return s;
-    }
-  }
-
-  uint64_t latest_archived_log_number = 0;
-  if (!files.empty()) {
-    latest_archived_log_number = files.back()->LogNumber();
-    Log(options_.info_log, "Latest Archived log: %" PRIu64,
-        latest_archived_log_number);
-  }
-
-  files.reserve(files.size() + logs.size());
-  for (auto& log : logs) {
-    if (log->LogNumber() > latest_archived_log_number) {
-      files.push_back(std::move(log));
-    } else {
-      // When the race condition happens, we could see the
-      // same log in both db dir and archived dir. Simply
-      // ignore the one in db dir. Note that, if we read
-      // archived dir first, we would have missed the log file.
-      Log(options_.info_log, "%s already moved to archive",
-          log->PathName().c_str());
-    }
-  }
-
-  return s;
+  return wal_manager_.GetSortedWalFiles(files);
 }

 }
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -12,7 +12,9 @@
 #include <deque>
 #include <limits>
 #include <set>
+#include <list>
 #include <utility>
+#include <list>
 #include <vector>
 #include <string>

@ -21,6 +23,8 @@
 #include "db/snapshot.h"
 #include "db/column_family.h"
 #include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/writebuffer.h"
 #include "memtable_list.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
@ -30,7 +34,13 @@
 #include "util/autovector.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/hash.h"
+#include "util/instrumented_mutex.h"
 #include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"

 namespace rocksdb {

@ -41,6 +51,7 @@ class VersionEdit;
 class VersionSet;
 class CompactionFilterV2;
 class Arena;
+struct JobContext;

 class DBImpl : public DB {
 public:
@ -108,6 +119,17 @@ class DBImpl : public DB {
                              bool reduce_level = false, int target_level = -1,
                              uint32_t target_path_id = 0);

+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1);
+
+  using DB::SetOptions;
+  Status SetOptions(ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map);
+
  using DB::NumberLevels;
  virtual int NumberLevels(ColumnFamilyHandle* column_family);
  using DB::MaxMemCompactionLevel;
@ -141,6 +163,15 @@ class DBImpl : public DB {
  virtual Status DeleteFile(std::string name);

  virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+  // Obtains the meta data of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family match the specified name.
+  // TODO(yhchiang): output parameter is placed in the end in this codebase.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override;
+
 #endif  // ROCKSDB_LITE

  // checks if all live files exist on file system and that their file sizes
@ -173,8 +204,8 @@ class DBImpl : public DB {
  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
-  Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
-                                         nullptr);
+  Iterator* TEST_NewInternalIterator(
+      Arena* arena, ColumnFamilyHandle* column_family = nullptr);

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
@ -184,132 +215,79 @@ class DBImpl : public DB {
  // Return the current manifest file no.
  uint64_t TEST_Current_Manifest_FileNo();

-  // Trigger's a background call for testing.
-  void TEST_PurgeObsoleteteWAL();
-
  // get total level0 file size. Only for testing.
  uint64_t TEST_GetLevel0TotalSize();

-  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
-  {
-    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
-  }
-
  void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
                             std::vector<std::vector<FileMetaData>>* metadata);

-  Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
-                              SequenceNumber* sequence);
-
-  Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
-#endif  // NDEBUG
-
-  // Structure to store information for candidate files to delete.
-  struct CandidateFileInfo {
-    std::string file_name;
-    uint32_t path_id;
-    CandidateFileInfo(std::string name, uint32_t path)
-        : file_name(name), path_id(path) {}
-    bool operator==(const CandidateFileInfo& other) const {
-      return file_name == other.file_name && path_id == other.path_id;
-    }
-  };
-
-  // needed for CleanupIteratorState
-  struct DeletionState {
-    inline bool HaveSomethingToDelete() const {
-      return  candidate_files.size() ||
-        sst_delete_files.size() ||
-        log_delete_files.size();
-    }
-
-    // a list of all files that we'll consider deleting
-    // (every once in a while this is filled up with all files
-    // in the DB directory)
-    std::vector<CandidateFileInfo> candidate_files;
+  void TEST_LockMutex();

-    // the list of all live sst files that cannot be deleted
-    std::vector<FileDescriptor> sst_live;
+  void TEST_UnlockMutex();

-    // a list of sst files that we need to delete
-    std::vector<FileMetaData*> sst_delete_files;
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();

-    // a list of log files that we need to delete
-    std::vector<uint64_t> log_delete_files;
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);

-    // a list of memtables to be free
-    autovector<MemTable*> memtables_to_free;
-
-    autovector<SuperVersion*> superversions_to_free;
-
-    SuperVersion* new_superversion;  // if nullptr no new superversion
-
-    // the current manifest_file_number, log_number and prev_log_number
-    // that corresponds to the set of files in 'live'.
-    uint64_t manifest_file_number, pending_manifest_file_number, log_number,
-        prev_log_number;
-
-    explicit DeletionState(bool create_superversion = false) {
-      manifest_file_number = 0;
-      pending_manifest_file_number = 0;
-      log_number = 0;
-      prev_log_number = 0;
-      new_superversion = create_superversion ? new SuperVersion() : nullptr;
+  uint64_t TEST_max_total_in_memory_state() {
+    return max_total_in_memory_state_;
  }

-    ~DeletionState() {
-      // free pending memtables
-      for (auto m : memtables_to_free) {
-        delete m;
-      }
-      // free superversions
-      for (auto s : superversions_to_free) {
-        delete s;
-      }
-      // if new_superversion was not used, it will be non-nullptr and needs
-      // to be freed here
-      delete new_superversion;
-    }
-  };
+#endif  // ROCKSDB_LITE

  // Returns the list of live files in 'live' and the list
  // of all files in the filesystem in 'candidate_files'.
  // If force == false and the last call was less than
-  // options_.delete_obsolete_files_period_micros microseconds ago,
-  // it will not fill up the deletion_state
-  void FindObsoleteFiles(DeletionState& deletion_state,
-                         bool force,
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the job_context
+  void FindObsoleteFiles(JobContext* job_context, bool force,
                         bool no_full_scan = false);

  // Diffs the files listed in filenames and those that do not
  // belong to live files are posibly removed. Also, removes all the
  // files in sst_delete_files and log_delete_files.
  // It is not necessary to hold the mutex when invoking this method.
-  void PurgeObsoleteFiles(DeletionState& deletion_state);
+  void PurgeObsoleteFiles(const JobContext& background_contet);

  ColumnFamilyHandle* DefaultColumnFamily() const;

+  const SnapshotList& snapshots() const { return snapshots_; }
+
 protected:
  Env* const env_;
  const std::string dbname_;
  unique_ptr<VersionSet> versions_;
-  const DBOptions options_;
+  const DBOptions db_options_;
  Statistics* stats_;

  Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
-                                SuperVersion* super_version,
-                                Arena* arena = nullptr);
+                                SuperVersion* super_version, Arena* arena);
+
+  void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number,
+                              const MutableCFOptions& mutable_cf_options);
+
+  void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
+                                   Compaction *c, const Status &st);
+
+  void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+  void EraseThreadStatusDbInfo() const;

 private:
  friend class DB;
  friend class InternalStats;
 #ifndef ROCKSDB_LITE
-  friend class TailingIterator;
  friend class ForwardIterator;
 #endif
  friend struct SuperVersion;
+  friend class CompactedDBImpl;
  struct CompactionState;
-  struct Writer;
+
  struct WriteContext;

  Status NewDB();
@ -327,14 +305,34 @@ class DBImpl : public DB {
  // Delete any unneeded files and stale in-memory entries.
  void DeleteObsoleteFiles();

+  // Background process needs to call
+  //     auto x = CaptureCurrentFileNumberInPendingOutputs()
+  //     <do something>
+  //     ReleaseFileNumberFromPendingOutputs(x)
+  // This will protect any temporary files created while <do something> is
+  // executing from being deleted.
+  // -----------
+  // This function will capture current file number and append it to
+  // pending_outputs_. This will prevent any background process to delete any
+  // file created after this point.
+  std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+  // This function should be called with the result of
+  // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+  // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+  // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+  // and blocked by any other pending_outputs_ calls)
+  void ReleaseFileNumberFromPendingOutputs(std::list<uint64_t>::iterator v);
+
  // Flush the in-memory write buffer to storage.  Switches to a new
  // log-file/memtable and writes a new descriptor iff successful.
-  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
-                                   DeletionState& deletion_state,
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   bool* madeProgress, JobContext* job_context,
                                   LogBuffer* log_buffer);

-  Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
-                        bool read_only);
+  // REQUIRES: log_numbers are sorted in ascending order
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* max_sequence, bool read_only);

  // The following two methods are used to flush a memtable to
  // storage. The first one is used atdatabase RecoveryTime (when the
@ -343,47 +341,13 @@ class DBImpl : public DB {
  // concurrent flush memtables to storage.
  Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
                                     VersionEdit* edit);
-  Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
-                          VersionEdit* edit, uint64_t* filenumber,
-                          LogBuffer* log_buffer);
+  Status DelayWrite(uint64_t expiration_time);

-  uint64_t SlowdownAmount(int n, double bottom, double top);
-
-  // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
-  // thread should grab the mutex_ and be the first on writers queue.
-  // BeginWrite is used for it.
-  // Be aware! Writer's job can be done by other thread (see DBImpl::Write
-  // for examples), so check it via w.done before applying changes.
-  //
-  // Writer* w:                writer to be placed in the queue
-  // uint64_t expiration_time: maximum time to be in the queue
-  // See also: EndWrite
-  Status BeginWrite(Writer* w, uint64_t expiration_time);
-
-  // After doing write job, we need to remove already used writers from
-  // writers_ queue and notify head of the queue about it.
-  // EndWrite is used for this.
-  //
-  // Writer* w:           Writer, that was added by BeginWrite function
-  // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
-  //                      does)
-  //                      we should pass last_writer as a parameter to
-  //                      EndWrite
-  //                      (if you don't touch other writers, just pass w)
-  // Status status:       Status of write operation
-  // See also: BeginWrite
-  void EndWrite(Writer* w, Writer* last_writer, Status status);
-
-  Status MakeRoomForWrite(ColumnFamilyData* cfd,
-                          WriteContext* context,
-                          uint64_t expiration_time);
+  Status ScheduleFlushes(WriteContext* context);

  Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
                                     WriteContext* context);

-  void BuildBatchGroup(Writer** last_writer,
-                       autovector<WriteBatch*>* write_batch_group);
-
  // Force current memtable contents to be flushed.
  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);

@ -393,98 +357,55 @@ class DBImpl : public DB {
  void RecordFlushIOStats();
  void RecordCompactionIOStats();

+#ifndef ROCKSDB_LITE
+  Status CompactFilesImpl(
+      const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+      Version* version, const std::vector<std::string>& input_file_names,
+      const int output_level, int output_path_id);
+#endif  // ROCKSDB_LITE
+
+  ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
  void MaybeScheduleFlushOrCompaction();
+  void SchedulePendingFlush(ColumnFamilyData* cfd);
+  void SchedulePendingCompaction(ColumnFamilyData* cfd);
  static void BGWorkCompaction(void* db);
  static void BGWorkFlush(void* db);
  void BackgroundCallCompaction();
  void BackgroundCallFlush();
-  Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
+  Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
                              LogBuffer* log_buffer);
-  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
-                         LogBuffer* log_buffer);
-  void CleanupCompaction(CompactionState* compact, Status status);
-  Status DoCompactionWork(CompactionState* compact,
-                          DeletionState& deletion_state,
+  Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
                         LogBuffer* log_buffer);

  // This function is called as part of compaction. It enables Flush process to
  // preempt compaction, since it's higher prioirty
-  // Returns: micros spent executing
  uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
-                                     DeletionState& deletion_state,
-                                     LogBuffer* log_buffer);
-
-  // Call compaction filter if is_compaction_v2 is not true. Then iterate
-  // through input and compact the kv-pairs
-  Status ProcessKeyValueCompaction(
-    bool is_snapshot_supported,
-    SequenceNumber visible_at_tip,
-    SequenceNumber earliest_snapshot,
-    SequenceNumber latest_snapshot,
-    DeletionState& deletion_state,
-    bool bottommost_level,
-    int64_t& imm_micros,
-    Iterator* input,
-    CompactionState* compact,
-    bool is_compaction_v2,
+                                     const MutableCFOptions& mutable_cf_options,
+                                     JobContext* job_context,
                                     LogBuffer* log_buffer);

-  // Call compaction_filter_v2->Filter() on kv-pairs in compact
-  void CallCompactionFilterV2(CompactionState* compact,
-    CompactionFilterV2* compaction_filter_v2);
-
-  Status OpenCompactionOutputFile(CompactionState* compact);
-  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
-  Status InstallCompactionResults(CompactionState* compact,
-                                  LogBuffer* log_buffer);
-  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
-  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
-
-#ifdef ROCKSDB_LITE
-  void PurgeObsoleteWALFiles() {
-    // this function is used for archiving WAL files. we don't need this in
-    // ROCKSDB_LITE
-  }
-#else
-  void PurgeObsoleteWALFiles();
-
-  Status GetSortedWalsOfType(const std::string& path,
-                             VectorLogPtr& log_files,
-                             WalFileType type);
-
-  // Requires: all_logs should be sorted with earliest log file first
-  // Retains all log files in all_logs which contain updates with seq no.
-  // Greater Than or Equal to the requested SequenceNumber.
-  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
-                                const SequenceNumber target);
-
-  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
-                         SequenceNumber* sequence);
-
-  Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
-#endif  // ROCKSDB_LITE
-
  void PrintStatistics();

  // dump rocksdb.stats to LOG
  void MaybeDumpStats();

-  // Return true if the current db supports snapshot.  If the current
-  // DB does not support snapshot, then calling GetSnapshot() will always
-  // return nullptr.
-  //
-  // @see GetSnapshot()
-  virtual bool IsSnapshotSupported() const;
-
  // Return the minimum empty level that could hold the total data in the
  // input level. Return the input level, if such level could not be found.
-  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
+  int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+      const MutableCFOptions& mutable_cf_options, int level);

  // Move the files in the input level to the target level.
  // If target_level < 0, automatically calculate the minimum level that could
  // hold the data set.
  Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);

+  // helper functions for adding and removing from flush & compaction queues
+  void AddToCompactionQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromCompactionQueue();
+  void AddToFlushQueue(ColumnFamilyData* cfd);
+  ColumnFamilyData* PopFirstFromFlushQueue();
+
  // table_cache_ provides its own synchronization
  std::shared_ptr<Cache> table_cache_;

@ -492,8 +413,8 @@ class DBImpl : public DB {
  FileLock* db_lock_;

  // State below is protected by mutex_
-  port::Mutex mutex_;
-  port::AtomicPointer shutting_down_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
  // This condition variable is signaled on these conditions:
  // * whenever bg_compaction_scheduled_ goes down to 0
  // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
@ -502,9 +423,10 @@ class DBImpl : public DB {
  // * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
  // done, even if it didn't make any progress)
  // * whenever there is an error in background flush or compaction
-  port::CondVar bg_cv_;
+  InstrumentedCondVar bg_cv_;
  uint64_t logfile_number_;
  unique_ptr<log::Writer> log_;
+  bool log_dir_synced_;
  bool log_empty_;
  ColumnFamilyHandleImpl* default_cf_handle_;
  InternalStats* default_cf_internal_stats_;
@ -526,26 +448,85 @@ class DBImpl : public DB {
  // some code-paths
  bool single_column_family_mode_;

-  std::unique_ptr<Directory> db_directory_;
+  bool is_snapshot_supported_;

-  // Queue of writers.
-  std::deque<Writer*> writers_;
-  WriteBatch tmp_batch_;
+  // Class to maintain directories for all database paths other than main one.
+  class Directories {
+   public:
+    Status SetDirectories(Env* env, const std::string& dbname,
+                          const std::string& wal_dir,
+                          const std::vector<DbPath>& data_paths);

-  SnapshotList snapshots_;
+    Directory* GetDataDir(size_t path_id);
+
+    Directory* GetWalDir() {
+      if (wal_dir_) {
+        return wal_dir_.get();
+      }
+      return db_dir_.get();
+    }
+
+    Directory* GetDbDir() { return db_dir_.get(); }

-  // cache for ReadFirstRecord() calls
-  std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
-  port::Mutex read_first_record_cache_mutex_;
+   private:
+    std::unique_ptr<Directory> db_dir_;
+    std::vector<std::unique_ptr<Directory>> data_dirs_;
+    std::unique_ptr<Directory> wal_dir_;
+
+    Status CreateAndNewDirectory(Env* env, const std::string& dirname,
+                                 std::unique_ptr<Directory>* directory) const;
+  };
+
+  Directories directories_;
+
+  WriteBuffer write_buffer_;

-  // Set of table files to protect from deletion because they are
-  // part of ongoing compactions.
-  // map from pending file number ID to their path IDs.
-  FileNumToPathIdMap pending_outputs_;
+  WriteThread write_thread_;
+
+  WriteBatch tmp_batch_;
+
+  WriteController write_controller_;
+  FlushScheduler flush_scheduler_;
+
+  SnapshotList snapshots_;

-  // At least one compaction or flush job is pending but not yet scheduled
-  // because of the max background thread limit.
-  bool bg_schedule_needed_;
+  // For each background job, pending_outputs_ keeps the current file number at
+  // the time that background job started.
+  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+  // number bigger than any of the file number in pending_outputs_. Since file
+  // numbers grow monotonically, this also means that pending_outputs_ is always
+  // sorted. After a background job is done executing, its file number is
+  // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+  // it up.
+  // State is protected with db mutex.
+  std::list<uint64_t> pending_outputs_;
+
+  // flush_queue_ and compaction_queue_ hold column families that we need to
+  // flush and compact, respectively.
+  // A column family is inserted into flush_queue_ when it satisfies condition
+  // cfd->imm()->IsFlushPending()
+  // A column family is inserted into compaction_queue_ when it satisfied
+  // condition cfd->NeedsCompaction()
+  // Column families in this list are all Ref()-erenced
+  // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+  // do RAII on ColumnFamilyData
+  // Column families are in this queue when they need to be flushed or
+  // compacted. Consumers of these queues are flush and compaction threads. When
+  // column family is put on this queue, we increase unscheduled_flushes_ and
+  // unscheduled_compactions_. When these variables are bigger than zero, that
+  // means we need to schedule background threads for compaction and thread.
+  // Once the background threads are scheduled, we decrease unscheduled_flushes_
+  // and unscheduled_compactions_. That way we keep track of number of
+  // compaction and flush threads we need to schedule. This scheduling is done
+  // in MaybeScheduleFlushOrCompaction()
+  // invariant(column family present in flush_queue_ <==>
+  // ColumnFamilyData::pending_flush_ == true)
+  std::deque<ColumnFamilyData*> flush_queue_;
+  // invariant(column family present in compaction_queue_ <==>
+  // ColumnFamilyData::pending_compaction_ == true)
+  std::deque<ColumnFamilyData*> compaction_queue_;
+  int unscheduled_flushes_;
+  int unscheduled_compactions_;

  // count how many background compactions are running or have been scheduled
  int bg_compaction_scheduled_;
@ -584,30 +565,23 @@ class DBImpl : public DB {
  // without any synchronization
  int disable_delete_obsolete_files_;

-  // last time when DeleteObsoleteFiles was invoked
-  uint64_t delete_obsolete_files_last_run_;
-
-  // last time when PurgeObsoleteWALFiles ran.
-  uint64_t purge_wal_files_last_run_;
+  // next time when we should run DeleteObsoleteFiles with full scan
+  uint64_t delete_obsolete_files_next_run_;

  // last time stats were dumped to LOG
  std::atomic<uint64_t> last_stats_dump_time_microsec_;

-  // obsolete files will be deleted every this seconds if ttl deletion is
-  // enabled and archive size_limit is disabled.
-  uint64_t default_interval_to_delete_obsolete_WAL_;
-
  bool flush_on_destroy_; // Used when disableWAL is true.

  static const int KEEP_LOG_FILE_NUM = 1000;
-  static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
  std::string db_absolute_path_;

-  // count of the number of contiguous delaying writes
-  int delayed_writes_;
-
  // The options to access storage files
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
+
+#ifndef ROCKSDB_LITE
+  WalManager wal_manager_;
+#endif  // ROCKSDB_LITE

  // A value of true temporarily disables scheduling of background work
  bool bg_work_gate_closed_;
@ -618,13 +592,16 @@ class DBImpl : public DB {
  // Indicate DB was opened successfully
  bool opened_successfully_;

+  // The list of registered event listeners.
+  std::list<EventListener*> listeners_;
+
+  // count how many events are currently being notified.
+  int notifying_events_;
+
  // No copying allowed
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

-  // dump the delayed_writes_ to the log file and reset counter.
-  void DelayLoggingAndReset();
-
  // Return the earliest snapshot where seqno is visible.
  // Store the snapshot right before that, if any, in prev_snapshot
  inline SequenceNumber findEarliestVisibleSnapshot(
@ -633,10 +610,24 @@ class DBImpl : public DB {
    SequenceNumber* prev_snapshot);

  // Background threads call this function, which is just a wrapper around
-  // the cfd->InstallSuperVersion() function. Background threads carry
-  // deletion_state which can have new_superversion already allocated.
-  void InstallSuperVersion(ColumnFamilyData* cfd,
-                           DeletionState& deletion_state);
+  // the InstallSuperVersion() function. Background threads carry
+  // job_context which can have new_superversion already
+  // allocated.
+  void InstallSuperVersionBackground(
+      ColumnFamilyData* cfd, JobContext* job_context,
+      const MutableCFOptions& mutable_cf_options);
+
+  // All ColumnFamily state changes go through this function. Here we analyze
+  // the new state and we schedule background work if we detect that the new
+  // state needs flush or compaction.
+  // If dont_schedule_bg_work == true, then caller asks us to not schedule flush
+  // or compaction here, but it also promises to schedule needed background
+  // work. We use this to  scheduling background compactions when we are in the
+  // write thread, which is very performance critical. Caller schedules
+  // background work as soon as it exits the write thread
+  SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv,
+                                    const MutableCFOptions& mutable_cf_options,
+                                    bool dont_schedule_bg_work = false);

  // Find Super version and reference it. Based on options, it might return
  // the thread local cached one.
@ -680,8 +671,4 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
 }

-// Dump db file summary, implemented in util/
-extern void DumpDBFileSummary(const DBOptions& options,
-                              const std::string& dbname);
-
 }  // namespace rocksdb
--- a/db/db_impl_debug.cc
+++ b/db/db_impl_debug.cc
@ -10,17 +10,17 @@
 #ifndef ROCKSDB_LITE

 #include "db/db_impl.h"
+#include "util/thread_status_updater.h"

 namespace rocksdb {

-void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); }
-
 uint64_t DBImpl::TEST_GetLevel0TotalSize() {
-  MutexLock l(&mutex_);
-  return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
+  InstrumentedMutexLock l(&mutex_);
+  return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
 }

-Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
+Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena,
+                                           ColumnFamilyHandle* column_family) {
  ColumnFamilyData* cfd;
  if (column_family == nullptr) {
    cfd = default_cf_handle_->cfd();
@ -33,7 +33,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
  mutex_.Unlock();
  ReadOptions roptions;
-  return NewInternalIterator(roptions, cfd, super_version);
+  return NewInternalIterator(roptions, cfd, super_version, arena);
 }

 int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
@ -45,8 +45,8 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
    cfd = cfh->cfd();
  }
-  MutexLock l(&mutex_);
-  return cfd->current()->MaxNextLevelOverlappingBytes();
+  InstrumentedMutexLock l(&mutex_);
+  return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
 }

 void DBImpl::TEST_GetFilesMetaData(
@ -54,10 +54,11 @@ void DBImpl::TEST_GetFilesMetaData(
    std::vector<std::vector<FileMetaData>>* metadata) {
  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();
-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
  metadata->resize(NumberLevels());
  for (int level = 0; level < NumberLevels(); level++) {
-    const std::vector<FileMetaData*>& files = cfd->current()->files_[level];
+    const std::vector<FileMetaData*>& files =
+        cfd->current()->storage_info()->LevelFiles(level);

    (*metadata)[level].clear();
    for (const auto& f : files) {
@ -67,7 +68,7 @@ void DBImpl::TEST_GetFilesMetaData(
 }

 uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
-  return versions_->ManifestFileNumber();
+  return versions_->manifest_file_number();
 }

 Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
@ -81,8 +82,8 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
    cfd = cfh->cfd();
  }
  int output_level =
-      (cfd->options()->compaction_style == kCompactionStyleUniversal ||
-       cfd->options()->compaction_style == kCompactionStyleFIFO)
+      (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+       cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
          ? level
          : level + 1;
  return RunManualCompaction(cfd, level, output_level, 0, begin, end);
@ -112,22 +113,33 @@ Status DBImpl::TEST_WaitForCompact() {
  // wait for compact. It actually waits for scheduled compaction
  // OR flush to finish.

-  MutexLock l(&mutex_);
+  InstrumentedMutexLock l(&mutex_);
  while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) {
    bg_cv_.Wait();
  }
  return bg_error_;
 }

-Status DBImpl::TEST_ReadFirstRecord(const WalFileType type,
-                                    const uint64_t number,
-                                    SequenceNumber* sequence) {
-  return ReadFirstRecord(type, number, sequence);
+void DBImpl::TEST_LockMutex() {
+  mutex_.Lock();
 }

-Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
-                                  SequenceNumber* sequence) {
-  return ReadFirstLine(fname, sequence);
+void DBImpl::TEST_UnlockMutex() {
+  mutex_.Unlock();
 }
+
+void* DBImpl::TEST_BeginWrite() {
+  auto w = new WriteThread::Writer(&mutex_);
+  Status s = write_thread_.EnterWriteThread(w, 0);
+  assert(s.ok() && !w->done);  // No timeout and nobody should do our job
+  return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+  auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+  write_thread_.ExitWriteThread(writer, writer, Status::OK());
+  delete writer;
+}
+
 }  // namespace rocksdb
 #endif  // ROCKSDB_LITE
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@ -2,56 +2,31 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 Facebook. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
+

 #include "db/db_impl_readonly.h"
+#include "utilities/compacted_db/compacted_db_impl.h"
 #include "db/db_impl.h"
-
-#include <algorithm>
-#include <set>
-#include <string>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include "db/db_iter.h"
-#include "db/dbformat.h"
-#include "db/filename.h"
-#include "db/log_reader.h"
-#include "db/log_writer.h"
-#include "db/memtable.h"
 #include "db/merge_context.h"
-#include "db/table_cache.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "rocksdb/status.h"
-#include "rocksdb/table.h"
-#include "rocksdb/merge_operator.h"
-#include "port/port.h"
-#include "table/block.h"
-#include "table/merger.h"
-#include "table/two_level_iterator.h"
-#include "util/coding.h"
-#include "util/logging.h"
-#include "util/build_version.h"
+#include "db/db_iter.h"
+#include "util/perf_context_imp.h"

 namespace rocksdb {

-DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
                               const std::string& dbname)
-    : DBImpl(options, dbname) {
-  Log(options_.info_log, "Opening the db in read only mode");
+    : DBImpl(db_options, dbname) {
+  Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode");
+  LogFlush(db_options_.info_log);
 }

 DBImplReadOnly::~DBImplReadOnly() {
 }

 // Implementations of the DB interface
-Status DBImplReadOnly::Get(const ReadOptions& options,
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
                           ColumnFamilyHandle* column_family, const Slice& key,
                           std::string* value) {
  Status s;
@ -61,33 +36,35 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
  SuperVersion* super_version = cfd->GetSuperVersion();
  MergeContext merge_context;
  LookupKey lkey(key, snapshot);
-  if (super_version->mem->Get(lkey, value, &s, merge_context,
-                              *cfd->options())) {
+  if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
  } else {
-    super_version->current->Get(options, lkey, value, &s, &merge_context);
+    PERF_TIMER_GUARD(get_from_output_files_time);
+    super_version->current->Get(read_options, lkey, value, &s, &merge_context);
  }
  return s;
 }

-Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
                                      ColumnFamilyHandle* column_family) {
  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();
  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
  SequenceNumber latest_snapshot = versions_->LastSequence();
  auto db_iter = NewArenaWrappedDbIterator(
-      env_, *cfd->options(), cfd->user_comparator(),
-      (options.snapshot != nullptr
-           ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-           : latest_snapshot));
-  auto internal_iter =
-      NewInternalIterator(options, cfd, super_version, db_iter->GetArena());
+      env_, *cfd->ioptions(), cfd->user_comparator(),
+      (read_options.snapshot != nullptr
+           ? reinterpret_cast<const SnapshotImpl*>(
+                read_options.snapshot)->number_
+           : latest_snapshot),
+      super_version->mutable_cf_options.max_sequential_skip_in_iterations);
+  auto internal_iter = NewInternalIterator(
+      read_options, cfd, super_version, db_iter->GetArena());
  db_iter->SetIterUnderDBIter(internal_iter);
  return db_iter;
 }

 Status DBImplReadOnly::NewIterators(
-    const ReadOptions& options,
+    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_families,
    std::vector<Iterator*>* iterators) {
  if (iterators == nullptr) {
@ -98,14 +75,17 @@ Status DBImplReadOnly::NewIterators(
  SequenceNumber latest_snapshot = versions_->LastSequence();

  for (auto cfh : column_families) {
-    auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
-    auto db_iter = NewArenaWrappedDbIterator(
-        env_, *cfd->options(), cfd->user_comparator(),
-        options.snapshot != nullptr
-            ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-            : latest_snapshot);
-    auto internal_iter = NewInternalIterator(
-        options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena());
+    auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+    auto* sv = cfd->GetSuperVersion()->Ref();
+    auto* db_iter = NewArenaWrappedDbIterator(
+        env_, *cfd->ioptions(), cfd->user_comparator(),
+        (read_options.snapshot != nullptr
+            ? reinterpret_cast<const SnapshotImpl*>(
+                  read_options.snapshot)->number_
+            : latest_snapshot),
+        sv->mutable_cf_options.max_sequential_skip_in_iterations);
+    auto* internal_iter = NewInternalIterator(
+        read_options, cfd, sv, db_iter->GetArena());
    db_iter->SetIterUnderDBIter(internal_iter);
    iterators->push_back(db_iter);
  }
@ -117,6 +97,13 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
                           DB** dbptr, bool error_if_log_file_exist) {
  *dbptr = nullptr;

+  // Try to first open DB as fully compacted DB
+  Status s;
+  s = CompactedDBImpl::Open(options, dbname, dbptr);
+  if (s.ok()) {
+    return s;
+  }
+
  DBOptions db_options(options);
  ColumnFamilyOptions cf_options(options);
  std::vector<ColumnFamilyDescriptor> column_families;
@ -124,8 +111,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
  std::vector<ColumnFamilyHandle*> handles;

-  Status s =
-      DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
+  s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr);
  if (s.ok()) {
    assert(handles.size() == 1);
    // i can delete the handle since DBImpl is always holding a
@ -167,6 +153,10 @@ Status DB::OpenForReadOnly(
  impl->mutex_.Unlock();
  if (s.ok()) {
    *dbptr = impl;
+    for (auto* h : *handles) {
+      impl->NewThreadStatusCfInfo(
+          reinterpret_cast<ColumnFamilyHandleImpl*>(h)->cfd());
+    }
  } else {
    for (auto h : *handles) {
      delete h;
@ -177,5 +167,20 @@ Status DB::OpenForReadOnly(
  return s;
 }

+#else  // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                           DB** dbptr, bool error_if_log_file_exist) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+    bool error_if_log_file_exist) {
+  return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif  // !ROCKSDB_LITE

 }   // namespace rocksdb
--- a/db/db_impl_readonly.h
+++ b/db/db_impl_readonly.h
@ -2,24 +2,14 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 Facebook. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.

 #pragma once
-#include "db/db_impl.h"

-#include <deque>
-#include <set>
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl.h"
 #include <vector>
 #include <string>
-#include "db/dbformat.h"
-#include "db/log_writer.h"
-#include "db/snapshot.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
-#include "port/port.h"

 namespace rocksdb {

@ -75,10 +65,20 @@ class DBImplReadOnly : public DBImpl {
    return Status::NotSupported("Not supported operation in read only mode.");
  }

+  using DBImpl::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
 #ifndef ROCKSDB_LITE
  virtual Status DisableFileDeletions() override {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
+
  virtual Status EnableFileDeletions(bool force) override {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
@ -103,3 +103,5 @@ class DBImplReadOnly : public DBImpl {
  void operator=(const DBImplReadOnly&);
 };
 }
+
+#endif  // !ROCKSDB_LITE
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@ -58,22 +58,25 @@ class DBIter: public Iterator {
    kReverse
  };

-  DBIter(Env* env, const Options& options, const Comparator* cmp,
-         Iterator* iter, SequenceNumber s, bool arena_mode)
+  DBIter(Env* env, const ImmutableCFOptions& ioptions,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+         const Slice* iterate_upper_bound = nullptr)
      : arena_mode_(arena_mode),
        env_(env),
-        logger_(options.info_log.get()),
+        logger_(ioptions.info_log),
        user_comparator_(cmp),
-        user_merge_operator_(options.merge_operator.get()),
+        user_merge_operator_(ioptions.merge_operator),
        iter_(iter),
        sequence_(s),
        direction_(kForward),
        valid_(false),
        current_entry_is_merged_(false),
-        statistics_(options.statistics.get()) {
+        statistics_(ioptions.statistics),
+        iterate_upper_bound_(iterate_upper_bound) {
    RecordTick(statistics_, NO_ITERATORS);
-    has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr);
-    max_skip_ = options.max_sequential_skip_in_iterations;
+    prefix_extractor_ = ioptions.prefix_extractor;
+    max_skip_ = max_sequential_skip_in_iterations;
  }
  virtual ~DBIter() {
    RecordTick(statistics_, NO_ITERATORS, -1);
@ -132,7 +135,7 @@ class DBIter: public Iterator {
    }
  }

-  bool has_prefix_extractor_;
+  const SliceTransform* prefix_extractor_;
  bool arena_mode_;
  Env* const env_;
  Logger* logger_;
@ -149,6 +152,7 @@ class DBIter: public Iterator {
  bool current_entry_is_merged_;
  Statistics* statistics_;
  uint64_t max_skip_;
+  const Slice* iterate_upper_bound_;

  // No copying allowed
  DBIter(const DBIter&);
@ -158,7 +162,8 @@ class DBIter: public Iterator {
 inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
  if (!ParseInternalKey(iter_->key(), ikey)) {
    status_ = Status::Corruption("corrupted internal key in DBIter");
-    Log(logger_, "corrupted internal key in DBIter: %s",
+    Log(InfoLogLevel::ERROR_LEVEL,
+        logger_, "corrupted internal key in DBIter: %s",
        iter_->key().ToString(true).c_str());
    return false;
  } else {
@ -194,9 +199,8 @@ void DBIter::Next() {
 // NOTE: In between, saved_key_ can point to a user key that has
 //       a delete marker
 inline void DBIter::FindNextUserEntry(bool skipping) {
-  PERF_TIMER_AUTO(find_next_user_entry_time);
+  PERF_TIMER_GUARD(find_next_user_entry_time);
  FindNextUserEntryInternal(skipping);
-  PERF_TIMER_STOP(find_next_user_entry_time);
 }

 // Actual implementation of DBIter::FindNextUserEntry()
@ -208,7 +212,14 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
  uint64_t num_skipped = 0;
  do {
    ParsedInternalKey ikey;
-    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+
+    if (ParseKey(&ikey)) {
+      if (iterate_upper_bound_ != nullptr &&
+          ikey.user_key.compare(*iterate_upper_bound_) >= 0) {
+        break;
+      }
+
+      if (ikey.sequence <= sequence_) {
        if (skipping &&
           user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
          num_skipped++;  // skip this entry
@ -241,6 +252,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
          }
        }
      }
+    }
    // If we have sequentially iterated via numerous keys and still not
    // found the next user-key, then it is better to seek so that we can
    // avoid too many key comparisons. We seek to the last occurence of
@ -267,16 +279,17 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
 //       iter_ points to the next entry (or invalid)
 void DBIter::MergeValuesNewToOld() {
  if (!user_merge_operator_) {
-    Log(logger_, "Options::merge_operator is null.");
-    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
-                           " Options::merge_operator null");
+    Log(InfoLogLevel::ERROR_LEVEL,
+        logger_, "Options::merge_operator is null.");
+    status_ = Status::InvalidArgument("user_merge_operator_ must be set.");
+    valid_ = false;
+    return;
  }

  // Start the merge process by pushing the first operand
  std::deque<std::string> operands;
  operands.push_front(iter_->value().ToString());

-  std::string merge_result;   // Temporary string to hold merge result later
  ParsedInternalKey ikey;
  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
    if (!ParseKey(&ikey)) {
@ -300,8 +313,8 @@ void DBIter::MergeValuesNewToOld() {
      // hit a put, merge the put value with operands and store the
      // final result in saved_value_. We are done!
      // ignore corruption if there is any.
-      const Slice value = iter_->value();
-      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
+      const Slice val = iter_->value();
+      user_merge_operator_->FullMerge(ikey.user_key, &val, operands,
                                      &saved_value_, logger_);
      // iter_ is positioned after put
      iter_->Next();
@ -311,8 +324,8 @@ void DBIter::MergeValuesNewToOld() {
    if (kTypeMerge == ikey.type) {
      // hit a merge, add the value as an operand and run associative merge.
      // when complete, add result to operands and continue.
-      const Slice& value = iter_->value();
-      operands.push_front(value.ToString());
+      const Slice& val = iter_->value();
+      operands.push_front(val.ToString());
    }
  }

@ -399,6 +412,7 @@ bool DBIter::FindValueForCurrentKey() {
      case kTypeDeletion:
        operands.clear();
        last_not_merge_type = kTypeDeletion;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        break;
      case kTypeMerge:
        assert(user_merge_operator_ != nullptr);
@ -408,6 +422,7 @@ bool DBIter::FindValueForCurrentKey() {
        assert(false);
    }

+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
    iter_->Prev();
    ++num_skipped;
@ -491,8 +506,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
    return true;
  }

-  const Slice& value = iter_->value();
-  user_merge_operator_->FullMerge(saved_key_.GetKey(), &value, operands,
+  const Slice& val = iter_->value();
+  user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands,
                                  &saved_value_, logger_);
  valid_ = true;
  return true;
@ -554,12 +569,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
 void DBIter::Seek(const Slice& target) {
  StopWatch sw(env_, statistics_, DB_SEEK);

+  // total ordering is not guaranteed if prefix_extractor is set
+  // hence prefix based seeks will not give correct results
+  if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
+    if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
+        !prefix_extractor_->InDomain(target) ||
+        prefix_extractor_->Transform(*iterate_upper_bound_).compare(
+          prefix_extractor_->Transform(target)) != 0) {
+      status_ = Status::InvalidArgument("read_options.iterate_*_bound "
+                  " and seek target need to have the same prefix.");
+      valid_ = false;
+      return;
+    }
+  }
+
  saved_key_.Clear();
  // now savved_key is used to store internal key.
  saved_key_.SetInternalKey(target, sequence_);
-  PERF_TIMER_AUTO(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
    iter_->Seek(saved_key_.GetKey());
-  PERF_TIMER_STOP(seek_internal_seek_time);
+  }
+
  if (iter_->Valid()) {
    direction_ = kForward;
    ClearSavedValue();
@ -572,14 +604,17 @@ void DBIter::Seek(const Slice& target) {
 void DBIter::SeekToFirst() {
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek wiil be used.
-  if (has_prefix_extractor_) {
+  if (prefix_extractor_ != nullptr) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  direction_ = kForward;
  ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
    iter_->SeekToFirst();
-  PERF_TIMER_STOP(seek_internal_seek_time);
+  }
+
  if (iter_->Valid()) {
    FindNextUserEntry(false /* not skipping */);
  } else {
@ -590,24 +625,29 @@ void DBIter::SeekToFirst() {
 void DBIter::SeekToLast() {
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek wiil be used.
-  if (has_prefix_extractor_) {
+  if (prefix_extractor_ != nullptr) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  direction_ = kReverse;
  ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
    iter_->SeekToLast();
-  PERF_TIMER_STOP(seek_internal_seek_time);
+  }

  PrevInternal();
 }

-Iterator* NewDBIterator(Env* env, const Options& options,
+Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
                        const Comparator* user_key_comparator,
                        Iterator* internal_iter,
-                        const SequenceNumber& sequence) {
-  return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
-                    false);
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        const Slice* iterate_upper_bound) {
+  return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
+                    false, max_sequential_skip_in_iterations,
+                    iterate_upper_bound);
 }

 ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
@ -635,14 +675,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
 }

 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const Options& options, const Comparator* user_key_comparator,
-    const SequenceNumber& sequence) {
+    Env* env, const ImmutableCFOptions& ioptions,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound) {
  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
  Arena* arena = iter->GetArena();
  auto mem = arena->AllocateAligned(sizeof(DBIter));
-  DBIter* db_iter = new (mem)
-      DBIter(env, options, user_key_comparator, nullptr, sequence, true);
+  DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
+      nullptr, sequence, true, max_sequential_skip_in_iterations,
+      iterate_upper_bound);
+
  iter->SetDBIter(db_iter);
+
  return iter;
 }

--- a/db/db_iter.h
+++ b/db/db_iter.h
@ -24,10 +24,12 @@ class DBIter;
 // into appropriate user keys.
 extern Iterator* NewDBIterator(
    Env* env,
-    const Options& options,
+    const ImmutableCFOptions& options,
    const Comparator *user_key_comparator,
    Iterator* internal_iter,
-    const SequenceNumber& sequence);
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);

 // A wrapper iterator which wraps DB Iterator and the arena, with which the DB
 // iterator is supposed be allocated. This class is used as an entry point of
@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator {

 // Generate the arena wrapped iterator class.
 extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const Options& options, const Comparator* user_key_comparator,
-    const SequenceNumber& sequence);
+    Env* env, const ImmutableCFOptions& options,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);

 }  // namespace rocksdb
--- a/db/db_iter_test.cc
+++ b/db/db_iter_test.cc
@ -19,7 +19,7 @@

 namespace rocksdb {

-static uint32_t TestGetTickerCount(const Options& options,
+static uint64_t TestGetTickerCount(const Options& options,
                                   Tickers ticker_type) {
  return options.statistics->getTickerCount(ticker_type);
 }
@ -33,20 +33,23 @@ class TestIterator : public Iterator {
        iter_(0),
        cmp(comparator) {}

-  void AddMerge(std::string key, std::string value) {
-    Add(key, kTypeMerge, value);
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
  }

-  void AddDeletion(std::string key) { Add(key, kTypeDeletion, std::string()); }
+  void AddDeletion(std::string argkey) {
+    Add(argkey, kTypeDeletion, std::string());
+  }

-  void AddPut(std::string key, std::string value) {
-    Add(key, kTypeValue, value);
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
  }

-  void Add(std::string key, ValueType type, std::string value) {
+  void Add(std::string argkey, ValueType type, std::string argvalue) {
    valid_ = true;
-    ParsedInternalKey internal_key(key, sequence_number_++, type);
-    data_.push_back(std::pair<std::string, std::string>(std::string(), value));
+    ParsedInternalKey internal_key(argkey, sequence_number_++, type);
+    data_.push_back(
+        std::pair<std::string, std::string>(std::string(), argvalue));
    AppendInternalKey(&data_.back().first, internal_key);
  }

@ -158,7 +161,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));

    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
@ -191,7 +196,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));

    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
@ -213,7 +220,6 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
  }

  {
-    Options options;
    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
    internal_iter->AddPut("a", "val_a");
    internal_iter->AddPut("b", "val_b");
@ -232,7 +238,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 2,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "b");
@ -248,7 +256,6 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
  }

  {
-    Options options;
    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
    internal_iter->AddPut("a", "val_a");
    internal_iter->AddPut("a", "val_a");
@ -262,7 +269,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "c");
@ -288,7 +297,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(!db_iter->Valid());
  }
@ -298,7 +309,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(!db_iter->Valid());
  }
@ -318,7 +331,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) {
  internal_iter->Finish();

  std::unique_ptr<Iterator> db_iter(
-      NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+      NewDBIterator(env_, ImmutableCFOptions(options),
+                    BytewiseComparator(), internal_iter, 2,
+                    options.max_sequential_skip_in_iterations));
  db_iter->SeekToLast();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "c");
@ -350,19 +365,21 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
      internal_iter->AddMerge("b", "merge_1");
      internal_iter->AddMerge("a", "merge_2");
-      for (size_t i = 0; i < 200; ++i) {
-        internal_iter->AddPut("c", std::to_string(i));
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
      }
      internal_iter->Finish();

      options.statistics = rocksdb::CreateDBStatistics();
      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

      ASSERT_EQ(db_iter->key().ToString(), "c");
-      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
      db_iter->Prev();
      ASSERT_TRUE(db_iter->Valid());

@ -384,14 +401,16 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
      internal_iter->AddMerge("b", "merge_1");
      internal_iter->AddMerge("a", "merge_2");
-      for (size_t i = 0; i < 200; ++i) {
+      for (size_t k = 0; k < 200; ++k) {
        internal_iter->AddDeletion("c");
      }
      internal_iter->AddPut("c", "200");
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -418,7 +437,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 202));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, 202,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -443,13 +464,15 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
  {
    for (size_t i = 0; i < 200; ++i) {
      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
-      for (size_t i = 0; i < 200; ++i) {
+      for (size_t k = 0; k < 200; ++k) {
        internal_iter->AddDeletion("c");
      }
      internal_iter->AddPut("c", "200");
      internal_iter->Finish();
      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, i,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(!db_iter->Valid());

@ -464,7 +487,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
    internal_iter->AddPut("c", "200");
    internal_iter->Finish();
    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 200,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "c");
@ -487,22 +512,24 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
      internal_iter->AddMerge("b", "merge_1");
      internal_iter->AddMerge("a", "merge_2");
-      for (size_t i = 0; i < 200; ++i) {
-        internal_iter->AddPut("d", std::to_string(i));
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("d", ToString(k));
      }

-      for (size_t i = 0; i < 200; ++i) {
-        internal_iter->AddPut("c", std::to_string(i));
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddPut("c", ToString(k));
      }
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

      ASSERT_EQ(db_iter->key().ToString(), "d");
-      ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+      ASSERT_EQ(db_iter->value().ToString(), ToString(i));
      db_iter->Prev();
      ASSERT_TRUE(db_iter->Valid());

@ -524,20 +551,22 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      TestIterator* internal_iter = new TestIterator(BytewiseComparator());
      internal_iter->AddMerge("b", "b");
      internal_iter->AddMerge("a", "a");
-      for (size_t i = 0; i < 200; ++i) {
-        internal_iter->AddMerge("c", std::to_string(i));
+      for (size_t k = 0; k < 200; ++k) {
+        internal_iter->AddMerge("c", ToString(k));
      }
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

      ASSERT_EQ(db_iter->key().ToString(), "c");
      std::string merge_result = "0";
      for (size_t j = 1; j <= i; ++j) {
-        merge_result += "," + std::to_string(j);
+        merge_result += "," + ToString(j);
      }
      ASSERT_EQ(db_iter->value().ToString(), merge_result);

@ -557,10 +586,10 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
  }
 }

-TEST(DBIteratorTest, DBIterator) {
+TEST(DBIteratorTest, DBIterator1) {
  Options options;
  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
-  {
+
  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
  internal_iter->AddPut("a", "0");
  internal_iter->AddPut("b", "0");
@ -569,8 +598,9 @@ TEST(DBIteratorTest, DBIterator) {
  internal_iter->AddMerge("b", "2");
  internal_iter->Finish();

-    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1,
+      options.max_sequential_skip_in_iterations));
  db_iter->SeekToFirst();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "a");
@ -578,9 +608,12 @@ TEST(DBIteratorTest, DBIterator) {
  db_iter->Next();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "b");
-  }
+}
+
+TEST(DBIteratorTest, DBIterator2) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");

-  {
  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
  internal_iter->AddPut("a", "0");
  internal_iter->AddPut("b", "0");
@ -589,17 +622,21 @@ TEST(DBIteratorTest, DBIterator) {
  internal_iter->AddMerge("b", "2");
  internal_iter->Finish();

-    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0,
+      options.max_sequential_skip_in_iterations));
  db_iter->SeekToFirst();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "a");
  ASSERT_EQ(db_iter->value().ToString(), "0");
  db_iter->Next();
  ASSERT_TRUE(!db_iter->Valid());
-  }
+}
+
+TEST(DBIteratorTest, DBIterator3) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");

-  {
  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
  internal_iter->AddPut("a", "0");
  internal_iter->AddPut("b", "0");
@ -608,17 +645,20 @@ TEST(DBIteratorTest, DBIterator) {
  internal_iter->AddMerge("b", "2");
  internal_iter->Finish();

-    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2,
+      options.max_sequential_skip_in_iterations));
  db_iter->SeekToFirst();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "a");
  ASSERT_EQ(db_iter->value().ToString(), "0");
  db_iter->Next();
  ASSERT_TRUE(!db_iter->Valid());
-  }
+}
+TEST(DBIteratorTest, DBIterator4) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");

-  {
  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
  internal_iter->AddPut("a", "0");
  internal_iter->AddPut("b", "0");
@ -627,8 +667,9 @@ TEST(DBIteratorTest, DBIterator) {
  internal_iter->AddMerge("b", "2");
  internal_iter->Finish();

-    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 4,
+      options.max_sequential_skip_in_iterations));
  db_iter->SeekToFirst();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "a");
@ -639,9 +680,11 @@ TEST(DBIteratorTest, DBIterator) {
  ASSERT_EQ(db_iter->value().ToString(), "2");
  db_iter->Next();
  ASSERT_TRUE(!db_iter->Valid());
-  }
+}

-  {
+TEST(DBIteratorTest, DBIterator5) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
  {
    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
    internal_iter->AddMerge("a", "merge_1");
@ -653,8 +696,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -674,8 +718,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        1, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -695,8 +740,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -716,8 +762,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        3, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -737,8 +784,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -758,8 +806,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -779,8 +828,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -788,9 +838,11 @@ TEST(DBIteratorTest, DBIterator) {
    db_iter->Prev();
    ASSERT_TRUE(!db_iter->Valid());
  }
-  }
+}

-  {
+TEST(DBIteratorTest, DBIterator6) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
  {
    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
    internal_iter->AddMerge("a", "merge_1");
@ -802,8 +854,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -823,8 +876,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        1, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -844,8 +898,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -865,8 +920,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        3, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(!db_iter->Valid());
  }
@ -882,8 +938,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -903,8 +960,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -924,8 +982,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddMerge("a", "merge_6");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -933,9 +992,11 @@ TEST(DBIteratorTest, DBIterator) {
    db_iter->Prev();
    ASSERT_TRUE(!db_iter->Valid());
  }
-  }
+}

-  {
+TEST(DBIteratorTest, DBIterator7) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
  {
    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
    internal_iter->AddMerge("a", "merge_1");
@ -959,8 +1020,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        0, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -992,8 +1054,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        2, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1031,8 +1094,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1070,8 +1134,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        5, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1114,8 +1179,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        6, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1159,8 +1225,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        7, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1198,8 +1265,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->AddDeletion("c");
    internal_iter->Finish();

-      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9));
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        9, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1244,7 +1312,8 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 13));
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        13, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1290,7 +1359,8 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 14));
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        14, options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());

@ -1305,18 +1375,20 @@ TEST(DBIteratorTest, DBIterator) {
    db_iter->Prev();
    ASSERT_TRUE(!db_iter->Valid());
  }
-  }
-
-  {
+}
+TEST(DBIteratorTest, DBIterator8) {
  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
  internal_iter->AddDeletion("a");
  internal_iter->AddPut("a", "0");
  internal_iter->AddPut("b", "0");
  internal_iter->Finish();

-    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+      10, options.max_sequential_skip_in_iterations));
  db_iter->SeekToLast();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "b");
@ -1326,7 +1398,6 @@ TEST(DBIteratorTest, DBIterator) {
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "a");
  ASSERT_EQ(db_iter->value().ToString(), "0");
-  }
 }

 }  // namespace rocksdb
--- a/db/db_test.cc
+++ b/db/db_test.cc
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@ -127,8 +127,8 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  }
 }

-LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
-  size_t usize = user_key.size();
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) {
+  size_t usize = _user_key.size();
  size_t needed = usize + 13;  // A conservative estimate
  char* dst;
  if (needed <= sizeof(space_)) {
@ -137,9 +137,10 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
    dst = new char[needed];
  }
  start_ = dst;
-  dst = EncodeVarint32(dst, usize + 8);
+  // NOTE: We don't support users keys of more than 2GB :)
+  dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + 8));
  kstart_ = dst;
-  memcpy(dst, user_key.data(), usize);
+  memcpy(dst, _user_key.data(), usize);
  dst += usize;
  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
  dst += 8;
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -132,8 +132,8 @@ class InternalKey {
  std::string rep_;
 public:
  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
-  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
-    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
  }

  bool Valid() const {
@ -201,18 +201,24 @@ class LookupKey {
 public:
  // Initialize *this for looking up user_key at a snapshot with
  // the specified sequence number.
-  LookupKey(const Slice& user_key, SequenceNumber sequence);
+  LookupKey(const Slice& _user_key, SequenceNumber sequence);

  ~LookupKey();

  // Return a key suitable for lookup in a MemTable.
-  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+  Slice memtable_key() const {
+    return Slice(start_, static_cast<size_t>(end_ - start_));
+  }

  // Return an internal key (suitable for passing to an internal iterator)
-  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+  Slice internal_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+  }

  // Return the user key
-  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+  Slice user_key() const {
+    return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+  }

 private:
  // We construct a char array of the form:
@ -244,7 +250,7 @@ class IterKey {

  Slice GetKey() const { return Slice(key_, key_size_); }

-  const size_t Size() { return key_size_; }
+  size_t Size() { return key_size_; }

  void Clear() { key_size_ = 0; }

@ -319,8 +325,8 @@ class IterKey {

  void EncodeLengthPrefixedKey(const Slice& key) {
    auto size = key.size();
-    EnlargeBufferIfNeeded(size + VarintLength(size));
-    char* ptr = EncodeVarint32(key_, size);
+    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+    char* ptr = EncodeVarint32(key_, static_cast<uint32_t>(size));
    memcpy(ptr, key.data(), size);
  }

--- a/db/deletefile_test.cc
+++ b/db/deletefile_test.cc
@ -34,6 +34,8 @@ class DeleteFileTest {
  DeleteFileTest() {
    db_ = nullptr;
    env_ = Env::Default();
+    options_.enable_thread_tracking = true;
+    options_.max_background_flushes = 0;
    options_.write_buffer_size = 1024*1024*1000;
    options_.target_file_size_base = 1024*1024*1000;
    options_.max_bytes_for_level_base = 1024*1024*1000;
@ -77,7 +79,7 @@ class DeleteFileTest {
    options.sync = false;
    ReadOptions roptions;
    for (int i = startkey; i < (numkeys + startkey) ; i++) {
-      std::string temp = std::to_string(i);
+      std::string temp = ToString(i);
      Slice key(temp);
      Slice value(temp);
      ASSERT_OK(db_->Put(options, key, value));
@ -147,7 +149,6 @@ class DeleteFileTest {
 TEST(DeleteFileTest, AddKeysAndQueryLevels) {
  CreateTwoLevels();
  std::vector<LiveFileMetaData> metadata;
-  std::vector<int> keysinlevel;
  db_->GetLiveFilesMetaData(&metadata);

  std::string level1file = "";
@ -287,6 +288,75 @@ TEST(DeleteFileTest, DeleteLogFiles) {
  CloseDB();
 }

+TEST(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+  CloseDB();
+  DBOptions db_options;
+  db_options.create_if_missing = true;
+  db_options.create_missing_column_families = true;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.emplace_back();
+  column_families.emplace_back("new_cf", ColumnFamilyOptions());
+
+  std::vector<rocksdb::ColumnFamilyHandle*> handles;
+  rocksdb::DB* db;
+  ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
+
+  Random rnd(5);
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
+                      test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10),
+                      test::RandomKey(&rnd, 10)));
+  }
+  ASSERT_OK(db->Flush(FlushOptions(), handles[1]));
+
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_EQ("new_cf", metadata[0].column_family_name);
+  ASSERT_EQ("new_cf", metadata[1].column_family_name);
+  auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+                      ? metadata[0].name
+                      : metadata[1].name;
+  ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument());
+  ASSERT_OK(db->DeleteFile(old_file));
+
+  {
+    std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  delete handles[0];
+  delete handles[1];
+  delete db;
+
+  ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db));
+  {
+    std::unique_ptr<Iterator> itr(db->NewIterator(ReadOptions(), handles[1]));
+    int count = 0;
+    for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+      ASSERT_OK(itr->status());
+      ++count;
+    }
+    ASSERT_EQ(count, 1000);
+  }
+
+  delete handles[0];
+  delete handles[1];
+  delete db;
+}
+
 } //namespace rocksdb

 int main(int argc, char** argv) {
--- a/db/fault_injection_test.cc
+++ b/db/fault_injection_test.cc
@ -0,0 +1,742 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include <map>
+#include <set>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "util/logging.h"
+#include "util/mock_env.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+class TestWritableFile;
+class FaultInjectionTestEnv;
+
+namespace {
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+static std::string GetDirName(const std::string filename) {
+  size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  } else {
+    return filename.substr(0, found);
+  }
+}
+
+// Trim the tailing "/" in the end of `str`
+static std::string TrimDirname(const std::string& str) {
+  size_t found = str.find_last_not_of("/");
+  if (found == std::string::npos) {
+    return str;
+  }
+  return str.substr(0, found + 1);
+}
+
+// Return pair <parent directory name, file name> of a full path.
+static std::pair<std::string, std::string> GetDirAndName(
+    const std::string& name) {
+  std::string dirname = GetDirName(name);
+  std::string fname = name.substr(dirname.size() + 1);
+  return std::make_pair(dirname, fname);
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(Env* env, const std::string& filename, uint64_t length) {
+  unique_ptr<SequentialFile> orig_file;
+  const EnvOptions options;
+  Status s = env->NewSequentialFile(filename, &orig_file, options);
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+    return s;
+  }
+
+  char* scratch = new char[length];
+  rocksdb::Slice result;
+  s = orig_file->Read(length, &result, scratch);
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    unique_ptr<WritableFile> tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file, options);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(),
+                filename.c_str(), s.ToString().c_str());
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+  if (!s.ok()) {
+    fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(),
+            s.ToString().c_str());
+  }
+
+  delete[] scratch;
+
+  return s;
+}
+
+struct FileState {
+  std::string filename_;
+  ssize_t pos_;
+  ssize_t pos_at_last_sync_;
+  ssize_t pos_at_last_flush_;
+
+  explicit FileState(const std::string& filename)
+      : filename_(filename),
+        pos_(-1),
+        pos_at_last_sync_(-1),
+        pos_at_last_flush_(-1) { }
+
+  FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+  bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+  Status DropUnsyncedData(Env* env) const;
+
+  Status DropRandomUnsyncedData(Env* env, Random* rand) const;
+};
+
+}  // anonymous namespace
+
+// A wrapper around WritableFile which informs another Env whenever this file
+// is written to or sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+  explicit TestWritableFile(const std::string& fname,
+                            unique_ptr<WritableFile>&& f,
+                            FaultInjectionTestEnv* env);
+  virtual ~TestWritableFile();
+  virtual Status Append(const Slice& data);
+  virtual Status Close();
+  virtual Status Flush();
+  virtual Status Sync();
+
+ private:
+  FileState state_;
+  unique_ptr<WritableFile> target_;
+  bool writable_file_opened_;
+  FaultInjectionTestEnv* env_;
+};
+
+class TestDirectory : public Directory {
+ public:
+  explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname,
+                         Directory* dir)
+      : env_(env), dirname_(dirname), dir_(dir) {}
+  ~TestDirectory() {}
+
+  virtual Status Fsync() override;
+
+ private:
+  FaultInjectionTestEnv* env_;
+  std::string dirname_;
+  unique_ptr<Directory> dir_;
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+  explicit FaultInjectionTestEnv(Env* base)
+      : EnvWrapper(base),
+        filesystem_active_(true) {}
+  virtual ~FaultInjectionTestEnv() { }
+
+  Status NewDirectory(const std::string& name,
+                      unique_ptr<Directory>* result) override {
+    unique_ptr<Directory> r;
+    Status s = target()->NewDirectory(name, &r);
+    ASSERT_OK(s);
+    if (!s.ok()) {
+      return s;
+    }
+    result->reset(new TestDirectory(this, TrimDirname(name), r.release()));
+    return Status::OK();
+  }
+
+  Status NewWritableFile(const std::string& fname,
+                         unique_ptr<WritableFile>* result,
+                         const EnvOptions& soptions) {
+    Status s = target()->NewWritableFile(fname, result, soptions);
+    if (s.ok()) {
+      result->reset(new TestWritableFile(fname, std::move(*result), this));
+      // WritableFile doesn't append to files, so if the same file is opened
+      // again then it will be truncated - so forget our saved state.
+      UntrackFile(fname);
+      MutexLock l(&mutex_);
+      open_files_.insert(fname);
+      auto dir_and_name = GetDirAndName(fname);
+      auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first];
+      list.insert(dir_and_name.second);
+    }
+    return s;
+  }
+
+  virtual Status DeleteFile(const std::string& f) {
+    Status s = EnvWrapper::DeleteFile(f);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(),
+              s.ToString().c_str());
+    }
+    ASSERT_OK(s);
+    if (s.ok()) {
+      UntrackFile(f);
+    }
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& s, const std::string& t) {
+    Status ret = EnvWrapper::RenameFile(s, t);
+
+    if (ret.ok()) {
+      MutexLock l(&mutex_);
+      if (db_file_state_.find(s) != db_file_state_.end()) {
+        db_file_state_[t] = db_file_state_[s];
+        db_file_state_.erase(s);
+      }
+
+      auto sdn = GetDirAndName(s);
+      auto tdn = GetDirAndName(t);
+      if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) {
+        auto& tlist = dir_to_new_files_since_last_sync_[tdn.first];
+        assert(tlist.find(tdn.second) == tlist.end());
+        tlist.insert(tdn.second);
+      }
+    }
+
+    return ret;
+  }
+
+  void WritableFileClosed(const FileState& state) {
+    MutexLock l(&mutex_);
+    if (open_files_.find(state.filename_) != open_files_.end()) {
+      db_file_state_[state.filename_] = state;
+      open_files_.erase(state.filename_);
+    }
+  }
+
+  // For every file that is not fully synced, make a call to `func` with
+  // FileState of the file as the parameter.
+  Status DropFileData(std::function<Status(Env*, FileState)> func) {
+    Status s;
+    MutexLock l(&mutex_);
+    for (std::map<std::string, FileState>::const_iterator it =
+             db_file_state_.begin();
+         s.ok() && it != db_file_state_.end(); ++it) {
+      const FileState& state = it->second;
+      if (!state.IsFullySynced()) {
+        s = func(target(), state);
+      }
+    }
+    return s;
+  }
+
+  Status DropUnsyncedFileData() {
+    return DropFileData([&](Env* env, const FileState& state) {
+      return state.DropUnsyncedData(env);
+    });
+  }
+
+  Status DropRandomUnsyncedFileData(Random* rnd) {
+    return DropFileData([&](Env* env, const FileState& state) {
+      return state.DropRandomUnsyncedData(env, rnd);
+    });
+  }
+
+  Status DeleteFilesCreatedAfterLastDirSync() {
+    // Because DeleteFile access this container make a copy to avoid deadlock
+    std::map<std::string, std::set<std::string>> map_copy;
+    {
+      MutexLock l(&mutex_);
+      map_copy.insert(dir_to_new_files_since_last_sync_.begin(),
+                      dir_to_new_files_since_last_sync_.end());
+    }
+
+    for (auto& pair : map_copy) {
+      for (std::string name : pair.second) {
+        Status s = DeleteFile(pair.first + "/" + name);
+        if (!s.ok()) {
+          return s;
+        }
+      }
+    }
+    return Status::OK();
+  }
+  void ResetState() {
+    MutexLock l(&mutex_);
+    db_file_state_.clear();
+    dir_to_new_files_since_last_sync_.clear();
+    SetFilesystemActiveNoLock(true);
+  }
+
+  void UntrackFile(const std::string& f) {
+    MutexLock l(&mutex_);
+    auto dir_and_name = GetDirAndName(f);
+    dir_to_new_files_since_last_sync_[dir_and_name.first].erase(
+        dir_and_name.second);
+    db_file_state_.erase(f);
+    open_files_.erase(f);
+  }
+
+  void SyncDir(const std::string& dirname) {
+    MutexLock l(&mutex_);
+    dir_to_new_files_since_last_sync_.erase(dirname);
+  }
+
+  // Setting the filesystem to inactive is the test equivalent to simulating a
+  // system reset. Setting to inactive will freeze our saved filesystem state so
+  // that it will stop being recorded. It can then be reset back to the state at
+  // the time of the reset.
+  bool IsFilesystemActive() {
+    MutexLock l(&mutex_);
+    return filesystem_active_;
+  }
+  void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; }
+  void SetFilesystemActive(bool active) {
+    MutexLock l(&mutex_);
+    SetFilesystemActiveNoLock(active);
+  }
+  void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); }
+
+ private:
+  port::Mutex mutex_;
+  std::map<std::string, FileState> db_file_state_;
+  std::set<std::string> open_files_;
+  std::unordered_map<std::string, std::set<std::string>>
+      dir_to_new_files_since_last_sync_;
+  bool filesystem_active_;  // Record flushes, syncs, writes
+};
+
+Status FileState::DropUnsyncedData(Env* env) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  return Truncate(env, filename_, sync_pos);
+}
+
+Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const {
+  ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+  assert(pos_ >= sync_pos);
+  int range = static_cast<int>(pos_ - sync_pos);
+  uint64_t truncated_size =
+      static_cast<uint64_t>(sync_pos) + rand->Uniform(range);
+  return Truncate(env, filename_, truncated_size);
+}
+
+Status TestDirectory::Fsync() {
+  env_->SyncDir(dirname_);
+  return dir_->Fsync();
+}
+
+TestWritableFile::TestWritableFile(const std::string& fname,
+                                   unique_ptr<WritableFile>&& f,
+                                   FaultInjectionTestEnv* env)
+      : state_(fname),
+        target_(std::move(f)),
+        writable_file_opened_(true),
+        env_(env) {
+  assert(target_ != nullptr);
+  state_.pos_ = 0;
+}
+
+TestWritableFile::~TestWritableFile() {
+  if (writable_file_opened_) {
+    Close();
+  }
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+  Status s = target_->Append(data);
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_ += data.size();
+  }
+  return s;
+}
+
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  Status s = target_->Close();
+  if (s.ok()) {
+    env_->WritableFileClosed(state_);
+  }
+  return s;
+}
+
+Status TestWritableFile::Flush() {
+  Status s = target_->Flush();
+  if (s.ok() && env_->IsFilesystemActive()) {
+    state_.pos_at_last_flush_ = state_.pos_;
+  }
+  return s;
+}
+
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::OK();
+  }
+  // No need to actual sync.
+  state_.pos_at_last_sync_ = state_.pos_;
+  return Status::OK();
+}
+
+class FaultInjectionTest {
+ protected:
+  enum OptionConfig {
+    kDefault,
+    kDifferentDataDir,
+    kWalDir,
+    kSyncWal,
+    kWalDirSyncWal,
+    kMultiLevels,
+    kEnd,
+  };
+  int option_config_;
+  // When need to make sure data is persistent, sync WAL
+  bool sync_use_wal_;
+  // When need to make sure data is persistent, call DB::CompactRange()
+  bool sync_use_compact_;
+
+ protected:
+ public:
+  enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+  enum ResetMethod {
+    kResetDropUnsyncedData,
+    kResetDropRandomUnsyncedData,
+    kResetDeleteUnsyncedFiles,
+    kResetDropAndDeleteUnsynced
+  };
+
+  std::unique_ptr<Env> base_env_;
+  FaultInjectionTestEnv* env_;
+  std::string dbname_;
+  shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  FaultInjectionTest()
+      : option_config_(kDefault),
+        sync_use_wal_(false),
+        sync_use_compact_(true),
+        base_env_(nullptr),
+        env_(NULL),
+        db_(NULL) {
+    NewDB();
+  }
+
+  ~FaultInjectionTest() { ASSERT_OK(TearDown()); }
+
+  bool ChangeOptions() {
+    option_config_++;
+    if (option_config_ >= kEnd) {
+      return false;
+    } else {
+      if (option_config_ == kMultiLevels) {
+        base_env_.reset(new MockEnv(Env::Default()));
+      }
+      return true;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    sync_use_wal_ = false;
+    sync_use_compact_ = true;
+    Options options;
+    switch (option_config_) {
+      case kWalDir:
+        options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
+        break;
+      case kDifferentDataDir:
+        options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data",
+                                      1000000U);
+        break;
+      case kSyncWal:
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kWalDirSyncWal:
+        options.wal_dir = test::TmpDir(env_) + "/fault_test_wal";
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      case kMultiLevels:
+        options.write_buffer_size = 64 * 1024;
+        options.target_file_size_base = 64 * 1024;
+        options.level0_file_num_compaction_trigger = 2;
+        options.level0_slowdown_writes_trigger = 2;
+        options.level0_stop_writes_trigger = 4;
+        options.max_bytes_for_level_base = 128 * 1024;
+        options.max_write_buffer_number = 2;
+        options.max_background_compactions = 8;
+        options.max_background_flushes = 8;
+        sync_use_wal_ = true;
+        sync_use_compact_ = false;
+        break;
+      default:
+        break;
+    }
+    return options;
+  }
+
+  Status NewDB() {
+    assert(db_ == NULL);
+    assert(tiny_cache_ == nullptr);
+    assert(env_ == NULL);
+
+    env_ =
+        new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default());
+
+    options_ = CurrentOptions();
+    options_.env = env_;
+    options_.paranoid_checks = true;
+
+    BlockBasedTableOptions table_options;
+    tiny_cache_ = NewLRUCache(100);
+    table_options.block_cache = tiny_cache_;
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    dbname_ = test::TmpDir() + "/fault_test";
+
+    ASSERT_OK(DestroyDB(dbname_, options_));
+
+    options_.create_if_missing = true;
+    Status s = OpenDB();
+    options_.create_if_missing = false;
+    return s;
+  }
+
+  Status SetUp() {
+    Status s = TearDown();
+    if (s.ok()) {
+      s = NewDB();
+    }
+    return s;
+  }
+
+  Status TearDown() {
+    CloseDB();
+
+    Status s = DestroyDB(dbname_, options_);
+
+    delete env_;
+    env_ = NULL;
+
+    tiny_cache_.reset();
+
+    return s;
+  }
+
+  void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = start_idx; i < start_idx + num_vals; i++) {
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(write_options, &batch));
+    }
+  }
+
+  Status ReadValue(int i, std::string* val) const {
+    std::string key_space, value_space;
+    Slice key = Key(i, &key_space);
+    Value(i, &value_space);
+    ReadOptions options;
+    return db_->Get(options, key, val);
+  }
+
+  Status Verify(int start_idx, int num_vals,
+                ExpectedVerifResult expected) const {
+    std::string val;
+    std::string value_space;
+    Status s;
+    for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+      Value(i, &value_space);
+      s = ReadValue(i, &val);
+      if (s.ok()) {
+        ASSERT_EQ(value_space, val);
+      }
+      if (expected == kValExpectFound) {
+        if (!s.ok()) {
+          fprintf(stderr, "Error when read %dth record (expect found): %s\n", i,
+                  s.ToString().c_str());
+          return s;
+        }
+      } else if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "Error when read %dth record: %s\n", i,
+                s.ToString().c_str());
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) const {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) const {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+
+  Status OpenDB() {
+    delete db_;
+    db_ = NULL;
+    env_->ResetState();
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+    db_ = NULL;
+  }
+
+  void DeleteAllData() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    WriteOptions options;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+    }
+
+    delete iter;
+
+    FlushOptions flush_options;
+    flush_options.wait = true;
+    db_->Flush(flush_options);
+  }
+
+  // rnd cannot be null for kResetDropRandomUnsyncedData
+  void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+    env_->AssertNoOpenFile();
+    switch (reset_method) {
+      case kResetDropUnsyncedData:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        break;
+      case kResetDropRandomUnsyncedData:
+        ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+        break;
+      case kResetDeleteUnsyncedFiles:
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      case kResetDropAndDeleteUnsynced:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+    DeleteAllData();
+
+    WriteOptions write_options;
+    write_options.sync = sync_use_wal_;
+
+    Build(write_options, 0, num_pre_sync);
+    if (sync_use_compact_) {
+      db_->CompactRange(nullptr, nullptr);
+    }
+    write_options.sync = false;
+    Build(write_options, num_pre_sync, num_post_sync);
+  }
+
+  void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+                                         int num_pre_sync, int num_post_sync,
+                                         Random* rnd = nullptr) {
+    env_->SetFilesystemActive(false);
+    CloseDB();
+    ResetDBState(reset_method, rnd);
+    ASSERT_OK(OpenDB());
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::kValExpectNoError));
+  }
+
+  void NoWriteTestPreFault() {
+  }
+
+  void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+    CloseDB();
+    ResetDBState(reset_method);
+    ASSERT_OK(OpenDB());
+  }
+};
+
+TEST(FaultInjectionTest, FaultTest) {
+  do {
+    Random rnd(301);
+    ASSERT_OK(SetUp());
+
+    for (size_t idx = 0; idx < kNumIterations; idx++) {
+      int num_pre_sync = rnd.Uniform(kMaxNumValues);
+      int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+                                        num_pre_sync, num_post_sync, &rnd);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass the test as we don't sync
+      // it after creating new files,
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+                                        num_pre_sync, num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files created so we expect all values since no files will be
+      // dropped.
+      PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+                                        num_post_sync);
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+    }
+  } while (ChangeOptions());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/file_indexer.cc
+++ b/db/file_indexer.cc
@ -17,17 +17,16 @@ namespace rocksdb {
 FileIndexer::FileIndexer(const Comparator* ucmp)
    : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}

-uint32_t FileIndexer::NumLevelIndex() {
-  return next_level_index_.size();
-}
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }

-uint32_t FileIndexer::LevelIndexSize(uint32_t level) {
+size_t FileIndexer::LevelIndexSize(size_t level) const {
  return next_level_index_[level].num_index;
 }

-void FileIndexer::GetNextLevelIndex(
-    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
-    const int cmp_largest, int32_t* left_bound, int32_t* right_bound) {
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+                                    const int cmp_smallest,
+                                    const int cmp_largest, int32_t* left_bound,
+                                    int32_t* right_bound) const {
  assert(level > 0);

  // Last level, no hint
@ -69,7 +68,7 @@ void FileIndexer::GetNextLevelIndex(
  assert(*right_bound <= level_rb_[level + 1]);
 }

-void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
                              std::vector<FileMetaData*>* const files) {
  if (files == nullptr) {
    return;
@ -90,17 +89,17 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
  }

  // L1 - Ln-1
-  for (uint32_t level = 1; level < num_levels_ - 1; ++level) {
+  for (size_t level = 1; level < num_levels_ - 1; ++level) {
    const auto& upper_files = files[level];
-    const int32_t upper_size = upper_files.size();
+    const int32_t upper_size = static_cast<int32_t>(upper_files.size());
    const auto& lower_files = files[level + 1];
-    level_rb_[level] = upper_files.size() - 1;
+    level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
    if (upper_size == 0) {
      continue;
    }
    IndexLevel& index_level = next_level_index_[level];
    index_level.num_index = upper_size;
-    char* mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+    mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
    index_level.index_units = new (mem) IndexUnit[upper_size];

    CalculateLB(
@ -129,7 +128,8 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels,
        [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
  }

-  level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1;
+  level_rb_[num_levels_ - 1] =
+      static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
 }

 void FileIndexer::CalculateLB(
@ -137,8 +137,8 @@ void FileIndexer::CalculateLB(
    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
-  const int32_t upper_size = upper_files.size();
-  const int32_t lower_size = lower_files.size();
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
  int32_t upper_idx = 0;
  int32_t lower_idx = 0;

@ -175,8 +175,8 @@ void FileIndexer::CalculateRB(
    const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
    std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
    std::function<void(IndexUnit*, int32_t)> set_index) {
-  const int32_t upper_size = upper_files.size();
-  const int32_t lower_size = lower_files.size();
+  const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+  const int32_t lower_size = static_cast<int32_t>(lower_files.size());
  int32_t upper_idx = upper_size - 1;
  int32_t lower_idx = lower_size - 1;

--- a/db/file_indexer.h
+++ b/db/file_indexer.h
@ -42,19 +42,19 @@ class FileIndexer {
 public:
  explicit FileIndexer(const Comparator* ucmp);

-  uint32_t NumLevelIndex();
+  size_t NumLevelIndex() const;

-  uint32_t LevelIndexSize(uint32_t level);
+  size_t LevelIndexSize(size_t level) const;

  // Return a file index range in the next level to search for a key based on
  // smallest and largest key comparision for the current file specified by
  // level and file_index. When *left_index < *right_index, both index should
  // be valid and fit in the vector size.
-  void GetNextLevelIndex(
-    const uint32_t level, const uint32_t file_index, const int cmp_smallest,
-    const int cmp_largest, int32_t* left_bound, int32_t* right_bound);
+  void GetNextLevelIndex(const size_t level, const size_t file_index,
+                         const int cmp_smallest, const int cmp_largest,
+                         int32_t* left_bound, int32_t* right_bound) const;

-  void UpdateIndex(Arena* arena, const uint32_t num_levels,
+  void UpdateIndex(Arena* arena, const size_t num_levels,
                   std::vector<FileMetaData*>* const files);

  enum {
@ -62,7 +62,7 @@ class FileIndexer {
  };

 private:
-  uint32_t num_levels_;
+  size_t num_levels_;
  const Comparator* ucmp_;

  struct IndexUnit {
--- a/db/file_indexer_test.cc
+++ b/db/file_indexer_test.cc
@ -22,8 +22,15 @@ class IntComparator : public Comparator {
  int Compare(const Slice& a, const Slice& b) const {
    assert(a.size() == 8);
    assert(b.size() == 8);
-    return *reinterpret_cast<const int64_t*>(a.data()) -
+    int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
                   *reinterpret_cast<const int64_t*>(b.data());
+    if (diff < 0) {
+      return -1;
+    } else if (diff == 0) {
+      return 0;
+    } else {
+      return 1;
+    }
  }

  const char* Name() const {
@ -94,7 +101,6 @@ TEST(FileIndexerTest, Empty) {
 // Case 1: no overlap, files are on the left of next level files
 TEST(FileIndexerTest, no_overlap_left) {
  Arena arena;
-  uint32_t kNumLevels = 4;
  indexer = new FileIndexer(&ucmp);
  // level 1
  AddFile(1, 100, 200);
@ -135,7 +141,6 @@ TEST(FileIndexerTest, no_overlap_left) {
 // Case 2: no overlap, files are on the right of next level files
 TEST(FileIndexerTest, no_overlap_right) {
  Arena arena;
-  uint32_t kNumLevels = 4;
  indexer = new FileIndexer(&ucmp);
  // level 1
  AddFile(1, 2100, 2200);
@ -178,7 +183,6 @@ TEST(FileIndexerTest, no_overlap_right) {
 // Case 3: empty L2
 TEST(FileIndexerTest, empty_L2) {
  Arena arena;
-  uint32_t kNumLevels = 4;
  indexer = new FileIndexer(&ucmp);
  for (uint32_t i = 1; i < kNumLevels; ++i) {
    ASSERT_EQ(0U, indexer->LevelIndexSize(i));
--- a/db/filename.cc
+++ b/db/filename.cc
@ -6,7 +6,10 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include "db/filename.h"
 #include <inttypes.h>

@ -16,6 +19,7 @@
 #include "db/dbformat.h"
 #include "rocksdb/env.h"
 #include "util/logging.h"
+#include "util/stop_watch.h"

 namespace rocksdb {

@ -76,6 +80,17 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) {
  return MakeFileName(path, number, "sst");
 }

+uint64_t TableFileNameToNumber(const std::string& name) {
+  uint64_t number = 0;
+  uint64_t base = 1;
+  int pos = static_cast<int>(name.find_last_of('.'));
+  while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') {
+    number += (name[pos] - '0') * base;
+    base *= 10;
+  }
+  return number;
+}
+
 std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
                          uint32_t path_id) {
  assert(number > 0);
@ -315,4 +330,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname) {
  return s;
 }

+Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) {
+  if (db_options->disableDataSync) {
+    return Status::OK();
+  } else if (db_options->use_fsync) {
+    StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+    return file->Fsync();
+  } else {
+    StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
+    return file->Sync();
+  }
+}
+
 }  // namespace rocksdb
--- a/db/filename.h
+++ b/db/filename.h
@ -14,15 +14,18 @@
 #include <unordered_map>
 #include <string>
 #include <vector>
+
+#include "port/port.h"
+#include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 #include "rocksdb/transaction_log.h"
-#include "port/port.h"

 namespace rocksdb {

 class Env;
 class Directory;
+class WritableFile;

 enum FileType {
  kLogFile,
@ -36,9 +39,6 @@ enum FileType {
  kIdentityFile
 };

-// map from file number to path ID.
-typedef std::unordered_map<uint64_t, uint32_t> FileNumToPathIdMap;
-
 // Return the name of the log file with the specified number
 // in the db named by "dbname".  The result will be prefixed with
 // "dbname".
@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname,

 extern std::string MakeTableFileName(const std::string& name, uint64_t number);

+// the reverse function of MakeTableFileName
+// TODO(yhchiang): could merge this function with ParseFileName()
+extern uint64_t TableFileNameToNumber(const std::string& name);
+
 // Return the name of the sstable with the specified number
 // in the db named by "dbname".  The result will be prefixed with
 // "dbname".
@ -134,4 +138,8 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname,
 // Make the IDENTITY file for the db
 extern Status SetIdentityFile(Env* env, const std::string& dbname);

+// Sync manifest file `file`.
+extern Status SyncManifest(Env* env, const DBOptions* db_options,
+                           WritableFile* file);
+
 }  // namespace rocksdb
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@ -0,0 +1,221 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "port/likely.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/block.h"
+#include "table/block_based_table_factory.h"
+#include "table/merger.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/file_util.h"
+#include "util/logging.h"
+#include "util/log_buffer.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+#include "util/iostats_context_imp.h"
+#include "util/stop_watch.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+                   const DBOptions& db_options,
+                   const MutableCFOptions& mutable_cf_options,
+                   const EnvOptions& env_options, VersionSet* versions,
+                   InstrumentedMutex* db_mutex,
+                   std::atomic<bool>* shutting_down,
+                   SequenceNumber newest_snapshot, JobContext* job_context,
+                   LogBuffer* log_buffer, Directory* db_directory,
+                   Directory* output_file_directory,
+                   CompressionType output_compression, Statistics* stats)
+    : dbname_(dbname),
+      cfd_(cfd),
+      db_options_(db_options),
+      mutable_cf_options_(mutable_cf_options),
+      env_options_(env_options),
+      versions_(versions),
+      db_mutex_(db_mutex),
+      shutting_down_(shutting_down),
+      newest_snapshot_(newest_snapshot),
+      job_context_(job_context),
+      log_buffer_(log_buffer),
+      db_directory_(db_directory),
+      output_file_directory_(output_file_directory),
+      output_compression_(output_compression),
+      stats_(stats) {}
+
+Status FlushJob::Run(uint64_t* file_number) {
+  // Save the contents of the earliest memtable as a new Table
+  uint64_t fn;
+  autovector<MemTable*> mems;
+  cfd_->imm()->PickMemtablesToFlush(&mems);
+  if (mems.empty()) {
+    LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush",
+                cfd_->GetName().c_str());
+    return Status::OK();
+  }
+
+  // entries mems are (implicitly) sorted in ascending order by their created
+  // time. We will use the first memtable's `edit` to keep the meta info for
+  // this flush.
+  MemTable* m = mems[0];
+  VersionEdit* edit = m->GetEdits();
+  edit->SetPrevLogNumber(0);
+  // SetLogNumber(log_num) indicates logs with number smaller than log_num
+  // will no longer be picked up for recovery.
+  edit->SetLogNumber(mems.back()->GetNextLogNumber());
+  edit->SetColumnFamily(cfd_->GetID());
+
+  // This will release and re-acquire the mutex.
+  Status s = WriteLevel0Table(mems, edit, &fn);
+
+  if (s.ok() &&
+      (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
+    s = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during flush");
+  }
+
+  if (!s.ok()) {
+    cfd_->imm()->RollbackMemtableFlush(mems, fn);
+  } else {
+    // Replace immutable memtable with the generated Table
+    s = cfd_->imm()->InstallMemtableFlushResults(
+        cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn,
+        &job_context_->memtables_to_free, db_directory_, log_buffer_);
+  }
+
+  if (s.ok() && file_number != nullptr) {
+    *file_number = fn;
+  }
+  return s;
+}
+
+Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
+                                  VersionEdit* edit, uint64_t* filenumber) {
+  db_mutex_->AssertHeld();
+  const uint64_t start_micros = db_options_.env->NowMicros();
+  FileMetaData meta;
+  // path 0 for level 0 file.
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+  *filenumber = meta.fd.GetNumber();
+
+  const SequenceNumber earliest_seqno_in_memtable =
+      mems[0]->GetFirstSequenceNumber();
+  Version* base = cfd_->current();
+  base->Ref();  // it is likely that we do not need this reference
+  Status s;
+  {
+    db_mutex_->Unlock();
+    if (log_buffer_) {
+      log_buffer_->FlushBufferToLog();
+    }
+    std::vector<Iterator*> memtables;
+    ReadOptions ro;
+    ro.total_order_seek = true;
+    Arena arena;
+    for (MemTable* m : mems) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[%s] Flushing memtable with next log file: %" PRIu64 "\n",
+          cfd_->GetName().c_str(), m->GetNextLogNumber());
+      memtables.push_back(m->NewIterator(ro, &arena));
+    }
+    {
+      ScopedArenaIterator iter(
+          NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
+                             static_cast<int>(memtables.size()), &arena));
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[%s] Level-0 flush table #%" PRIu64 ": started",
+          cfd_->GetName().c_str(), meta.fd.GetNumber());
+
+      s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
+                     cfd_->table_cache(), iter.get(), &meta,
+                     cfd_->internal_comparator(), newest_snapshot_,
+                     earliest_seqno_in_memtable, output_compression_,
+                     cfd_->ioptions()->compression_opts, Env::IO_HIGH);
+      LogFlush(db_options_.info_log);
+    }
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s",
+        cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
+        s.ToString().c_str());
+    if (!db_options_.disableDataSync && output_file_directory_ != nullptr) {
+      output_file_directory_->Fsync();
+    }
+    db_mutex_->Lock();
+  }
+  base->Unref();
+
+  // re-acquire the most current version
+  base = cfd_->current();
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.fd.GetFileSize() > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    // if we have more than 1 background thread, then we cannot
+    // insert files directly into higher levels because some other
+    // threads could be concurrently producing compacted files for
+    // that key range.
+    if (base != nullptr && db_options_.max_background_compactions <= 1 &&
+        db_options_.max_background_flushes == 0 &&
+        cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+      level = base->storage_info()->PickLevelForMemTableOutput(
+          mutable_cf_options_, min_user_key, max_user_key);
+      // If level does not match path id, reset level back to 0
+      uint32_t fdpath = LevelCompactionPicker::GetPathId(
+          *cfd_->ioptions(), mutable_cf_options_, level);
+      if (fdpath != 0) {
+        level = 0;
+      }
+    }
+    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                  meta.smallest_seqno, meta.largest_seqno);
+  }
+
+  InternalStats::CompactionStats stats(1);
+  stats.micros = db_options_.env->NowMicros() - start_micros;
+  stats.bytes_written = meta.fd.GetFileSize();
+  cfd_->internal_stats()->AddCompactionStats(level, stats);
+  cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
+                                     meta.fd.GetFileSize());
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+  return s;
+}
+
+}  // namespace rocksdb
--- a/db/flush_job.h
+++ b/db/flush_job.h
@ -0,0 +1,87 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <set>
+#include <utility>
+#include <vector>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "db/memtable_list.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "util/autovector.h"
+#include "util/instrumented_mutex.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
+#include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"
+#include "db/job_context.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+  // TODO(icanadi) make effort to reduce number of parameters here
+  // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+  FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+           const DBOptions& db_options,
+           const MutableCFOptions& mutable_cf_options,
+           const EnvOptions& env_options, VersionSet* versions,
+           InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+           SequenceNumber newest_snapshot, JobContext* job_context,
+           LogBuffer* log_buffer, Directory* db_directory,
+           Directory* output_file_directory, CompressionType output_compression,
+           Statistics* stats);
+  ~FlushJob() {}
+
+  Status Run(uint64_t* file_number = nullptr);
+
+ private:
+  Status WriteLevel0Table(const autovector<MemTable*>& mems, VersionEdit* edit,
+                          uint64_t* filenumber);
+  const std::string& dbname_;
+  ColumnFamilyData* cfd_;
+  const DBOptions& db_options_;
+  const MutableCFOptions& mutable_cf_options_;
+  const EnvOptions& env_options_;
+  VersionSet* versions_;
+  InstrumentedMutex* db_mutex_;
+  std::atomic<bool>* shutting_down_;
+  SequenceNumber newest_snapshot_;
+  JobContext* job_context_;
+  LogBuffer* log_buffer_;
+  Directory* db_directory_;
+  Directory* output_file_directory_;
+  CompressionType output_compression_;
+  Statistics* stats_;
+};
+
+}  // namespace rocksdb
--- a/db/flush_job_test.cc
+++ b/db/flush_job_test.cc
@ -0,0 +1,124 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <string>
+
+#include "db/flush_job.h"
+#include "db/column_family.h"
+#include "db/version_set.h"
+#include "db/writebuffer.h"
+#include "rocksdb/cache.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "table/mock_table.h"
+
+namespace rocksdb {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTest {
+ public:
+  FlushJobTest()
+      : env_(Env::Default()),
+        dbname_(test::TmpDir() + "/flush_job_test"),
+        table_cache_(NewLRUCache(50000, 16, 8)),
+        write_buffer_(db_options_.db_write_buffer_size),
+        versions_(new VersionSet(dbname_, &db_options_, env_options_,
+                                 table_cache_.get(), &write_buffer_,
+                                 &write_controller_)),
+        shutting_down_(false),
+        mock_table_factory_(new mock::MockTableFactory()) {
+    ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+    db_options_.db_paths.emplace_back(dbname_,
+                                      std::numeric_limits<uint64_t>::max());
+    // TODO(icanadi) Remove this once we mock out VersionSet
+    NewDB();
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    ASSERT_OK(versions_->Recover(column_families, false));
+  }
+
+  void NewDB() {
+    VersionEdit new_db;
+    new_db.SetLogNumber(0);
+    new_db.SetNextFile(2);
+    new_db.SetLastSequence(0);
+
+    const std::string manifest = DescriptorFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status s = env_->NewWritableFile(
+        manifest, &file, env_->OptimizeForManifestWrite(env_options_));
+    ASSERT_OK(s);
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      new_db.EncodeTo(&record);
+      s = log.AddRecord(record);
+    }
+    ASSERT_OK(s);
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1, nullptr);
+  }
+
+  Env* env_;
+  std::string dbname_;
+  EnvOptions env_options_;
+  std::shared_ptr<Cache> table_cache_;
+  WriteController write_controller_;
+  DBOptions db_options_;
+  WriteBuffer write_buffer_;
+  ColumnFamilyOptions cf_options_;
+  std::unique_ptr<VersionSet> versions_;
+  InstrumentedMutex mutex_;
+  std::atomic<bool> shutting_down_;
+  std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+};
+
+TEST(FlushJobTest, Empty) {
+  JobContext job_context;
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr);
+  ASSERT_OK(flush_job.Run());
+  job_context.Clean();
+}
+
+TEST(FlushJobTest, NonEmpty) {
+  JobContext job_context;
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions());
+  new_mem->Ref();
+  std::map<std::string, std::string> inserted_keys;
+  for (int i = 1; i < 10000; ++i) {
+    std::string key(ToString(i));
+    std::string value("value" + ToString(i));
+    new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
+    InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+    inserted_keys.insert({internal_key.Encode().ToString(), value});
+  }
+  cfd->imm()->Add(new_mem);
+
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr);
+  mutex_.Lock();
+  ASSERT_OK(flush_job.Run());
+  mutex_.Unlock();
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  job_context.Clean();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
--- a/db/flush_scheduler.cc
+++ b/db/flush_scheduler.cc
@ -0,0 +1,63 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+  assert(column_families_set_.find(cfd) == column_families_set_.end());
+  column_families_set_.insert(cfd);
+#endif  // NDEBUG
+  cfd->Ref();
+  column_families_.push_back(cfd);
+}
+
+ColumnFamilyData* FlushScheduler::GetNextColumnFamily() {
+  ColumnFamilyData* cfd = nullptr;
+  while (column_families_.size() > 0) {
+    cfd = column_families_.front();
+    column_families_.pop_front();
+    if (cfd->IsDropped()) {
+      if (cfd->Unref()) {
+        delete cfd;
+        cfd = nullptr;
+      }
+    } else {
+      break;
+    }
+  }
+#ifndef NDEBUG
+  if (cfd != nullptr) {
+    auto itr = column_families_set_.find(cfd);
+    assert(itr != column_families_set_.end());
+    column_families_set_.erase(itr);
+  }
+#endif  // NDEBUG
+  return cfd;
+}
+
+bool FlushScheduler::Empty() { return column_families_.empty(); }
+
+void FlushScheduler::Clear() {
+  for (auto cfd : column_families_) {
+#ifndef NDEBUG
+    auto itr = column_families_set_.find(cfd);
+    assert(itr != column_families_set_.end());
+    column_families_set_.erase(itr);
+#endif  // NDEBUG
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
+  column_families_.clear();
+}
+
+}  // namespace rocksdb
--- a/db/flush_scheduler.h
+++ b/db/flush_scheduler.h
@ -0,0 +1,40 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+#include <deque>
+#include <set>
+#include <vector>
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+
+// This class is thread-compatible. It's should only be accessed from single
+// write thread (between BeginWrite() and EndWrite())
+class FlushScheduler {
+ public:
+  FlushScheduler() = default;
+  ~FlushScheduler() = default;
+
+  void ScheduleFlush(ColumnFamilyData* cfd);
+  // Returns Ref()-ed column family. Client needs to Unref()
+  // REQUIRES: db mutex is held (exception is single-threaded recovery)
+  ColumnFamilyData* GetNextColumnFamily();
+
+  bool Empty();
+
+  void Clear();
+
+ private:
+  std::deque<ColumnFamilyData*> column_families_;
+#ifndef NDEBUG
+  std::set<ColumnFamilyData*> column_families_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace rocksdb
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@ -10,6 +10,7 @@
 #include <string>
 #include <utility>

+#include "db/job_context.h"
 #include "db/db_impl.h"
 #include "db/db_iter.h"
 #include "db/column_family.h"
@ -114,27 +115,36 @@ class LevelIterator : public Iterator {
 };

 ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
-                                 ColumnFamilyData* cfd)
+    ColumnFamilyData* cfd, SuperVersion* current_sv)
    : db_(db),
      read_options_(read_options),
      cfd_(cfd),
-      prefix_extractor_(cfd->options()->prefix_extractor.get()),
+      prefix_extractor_(cfd->ioptions()->prefix_extractor),
      user_comparator_(cfd->user_comparator()),
      immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
-      sv_(nullptr),
+      sv_(current_sv),
      mutable_iter_(nullptr),
      current_(nullptr),
+      status_(Status::OK()),
+      immutable_status_(Status::OK()),
      valid_(false),
-      is_prev_set_(false) {}
+      is_prev_set_(false),
+      is_prev_inclusive_(false) {
+  if (sv_) {
+    RebuildIterators(false);
+  }
+}

 ForwardIterator::~ForwardIterator() {
-  Cleanup();
+  Cleanup(true);
 }

-void ForwardIterator::Cleanup() {
-  delete mutable_iter_;
+void ForwardIterator::Cleanup(bool release_sv) {
+  if (mutable_iter_ != nullptr) {
+    mutable_iter_->~Iterator();
+  }
  for (auto* m : imm_iters_) {
-    delete m;
+    m->~Iterator();
  }
  imm_iters_.clear();
  for (auto* f : l0_iters_) {
@ -146,15 +156,17 @@ void ForwardIterator::Cleanup() {
  }
  level_iters_.clear();

+  if (release_sv) {
    if (sv_ != nullptr && sv_->Unref()) {
-    DBImpl::DeletionState deletion_state;
+      JobContext job_context;
      db_->mutex_.Lock();
      sv_->Cleanup();
-    db_->FindObsoleteFiles(deletion_state, false, true);
+      db_->FindObsoleteFiles(&job_context, false, true);
      db_->mutex_.Unlock();
      delete sv_;
-    if (deletion_state.HaveSomethingToDelete()) {
-      db_->PurgeObsoleteFiles(deletion_state);
+      if (job_context.HaveSomethingToDelete()) {
+        db_->PurgeObsoleteFiles(job_context);
+      }
    }
  }
 }
@ -166,8 +178,8 @@ bool ForwardIterator::Valid() const {
 void ForwardIterator::SeekToFirst() {
  if (sv_ == nullptr ||
      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
-    RebuildIterators();
-  } else if (status_.IsIncomplete()) {
+    RebuildIterators(true);
+  } else if (immutable_status_.IsIncomplete()) {
    ResetIncompleteIterators();
  }
  SeekInternal(Slice(), true);
@ -176,8 +188,8 @@ void ForwardIterator::SeekToFirst() {
 void ForwardIterator::Seek(const Slice& internal_key) {
  if (sv_ == nullptr ||
      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
-    RebuildIterators();
-  } else if (status_.IsIncomplete()) {
+    RebuildIterators(true);
+  } else if (immutable_status_.IsIncomplete()) {
    ResetIncompleteIterators();
  }
  SeekInternal(internal_key, false);
@ -185,6 +197,7 @@ void ForwardIterator::Seek(const Slice& internal_key) {

 void ForwardIterator::SeekInternal(const Slice& internal_key,
                                   bool seek_to_first) {
+  assert(mutable_iter_);
  // mutable
  seek_to_first ? mutable_iter_->SeekToFirst() :
                  mutable_iter_->Seek(internal_key);
@ -194,13 +207,16 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
  // if it turns to need to seek immutable often. We probably want to have
  // an option to turn it off.
  if (seek_to_first || NeedToSeekImmutable(internal_key)) {
+    immutable_status_ = Status::OK();
    {
      auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
      immutable_min_heap_.swap(tmp);
    }
    for (auto* m : imm_iters_) {
      seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
-      if (m->Valid()) {
+      if (!m->status().ok()) {
+        immutable_status_ = m->status();
+      } else if (m->Valid()) {
        immutable_min_heap_.push(m);
      }
    }
@ -209,27 +225,23 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
    if (!seek_to_first) {
      user_key = ExtractUserKey(internal_key);
    }
-    auto* files = sv_->current->files_;
-    for (uint32_t i = 0; i < files[0].size(); ++i) {
+    const VersionStorageInfo* vstorage = sv_->current->storage_info();
+    const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+    for (uint32_t i = 0; i < l0.size(); ++i) {
      if (seek_to_first) {
        l0_iters_[i]->SeekToFirst();
      } else {
        // If the target key passes over the larget key, we are sure Next()
        // won't go over this file.
        if (user_comparator_->Compare(user_key,
-              files[0][i]->largest.user_key()) > 0) {
+              l0[i]->largest.user_key()) > 0) {
          continue;
        }
        l0_iters_[i]->Seek(internal_key);
      }

-      if (l0_iters_[i]->status().IsIncomplete()) {
-        // if any of the immutable iterators is incomplete (no-io option was
-        // used), we are unable to reliably find the smallest key
-        assert(read_options_.read_tier == kBlockCacheTier);
-        status_ = l0_iters_[i]->status();
-        valid_ = false;
-        return;
+      if (!l0_iters_[i]->status().ok()) {
+        immutable_status_ = l0_iters_[i]->status();
      } else if (l0_iters_[i]->Valid()) {
        immutable_min_heap_.push(l0_iters_[i]);
      }
@ -237,86 +249,83 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,

    int32_t search_left_bound = 0;
    int32_t search_right_bound = FileIndexer::kLevelMaxIndex;
-    for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
-      if (files[level].empty()) {
+    for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+      const std::vector<FileMetaData*>& level_files =
+          vstorage->LevelFiles(level);
+      if (level_files.empty()) {
        search_left_bound = 0;
        search_right_bound = FileIndexer::kLevelMaxIndex;
        continue;
      }
      assert(level_iters_[level - 1] != nullptr);
      uint32_t f_idx = 0;
+      const auto& indexer = vstorage->file_indexer();
      if (!seek_to_first) {
-        // TODO(ljin): remove before committing
-        // f_idx = FindFileInRange(
-        //    files[level], internal_key, 0, files[level].size());
-
        if (search_left_bound == search_right_bound) {
          f_idx = search_left_bound;
        } else if (search_left_bound < search_right_bound) {
-          f_idx = FindFileInRange(
-              files[level], internal_key, search_left_bound,
-              search_right_bound == FileIndexer::kLevelMaxIndex ?
-                files[level].size() : search_right_bound);
+          f_idx =
+              FindFileInRange(level_files, internal_key, search_left_bound,
+                              search_right_bound == FileIndexer::kLevelMaxIndex
+                                  ? static_cast<uint32_t>(level_files.size())
+                                  : search_right_bound);
        } else {
          // search_left_bound > search_right_bound
          // There are only 2 cases this can happen:
          // (1) target key is smaller than left most file
          // (2) target key is larger than right most file
-          assert(search_left_bound == (int32_t)files[level].size() ||
+          assert(search_left_bound == (int32_t)level_files.size() ||
                 search_right_bound == -1);
          if (search_right_bound == -1) {
            assert(search_left_bound == 0);
            f_idx = 0;
          } else {
-            sv_->current->file_indexer_.GetNextLevelIndex(
-                level, files[level].size() - 1,
+            indexer.GetNextLevelIndex(
+                level, level_files.size() - 1,
                1, 1, &search_left_bound, &search_right_bound);
            continue;
          }
        }

        // Prepare hints for the next level
-        if (f_idx < files[level].size()) {
+        if (f_idx < level_files.size()) {
          int cmp_smallest = user_comparator_->Compare(
-              user_key, files[level][f_idx]->smallest.user_key());
+              user_key, level_files[f_idx]->smallest.user_key());
          int cmp_largest = -1;
          if (cmp_smallest >= 0) {
            cmp_smallest = user_comparator_->Compare(
-                user_key, files[level][f_idx]->smallest.user_key());
+                user_key, level_files[f_idx]->smallest.user_key());
          }
-          sv_->current->file_indexer_.GetNextLevelIndex(level, f_idx,
+          indexer.GetNextLevelIndex(level, f_idx,
              cmp_smallest, cmp_largest,
              &search_left_bound, &search_right_bound);
        } else {
-          sv_->current->file_indexer_.GetNextLevelIndex(
-              level, files[level].size() - 1,
+          indexer.GetNextLevelIndex(
+              level, level_files.size() - 1,
              1, 1, &search_left_bound, &search_right_bound);
        }
      }

      // Seek
-      if (f_idx < files[level].size()) {
+      if (f_idx < level_files.size()) {
        level_iters_[level - 1]->SetFileIndex(f_idx);
        seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
                        level_iters_[level - 1]->Seek(internal_key);

-        if (level_iters_[level - 1]->status().IsIncomplete()) {
-          // see above
-          assert(read_options_.read_tier == kBlockCacheTier);
-          status_ = level_iters_[level - 1]->status();
-          valid_ = false;
-          return;
+        if (!level_iters_[level - 1]->status().ok()) {
+          immutable_status_ = level_iters_[level - 1]->status();
        } else if (level_iters_[level - 1]->Valid()) {
          immutable_min_heap_.push(level_iters_[level - 1]);
        }
      }
    }

-    if (seek_to_first || immutable_min_heap_.empty()) {
+    if (seek_to_first) {
      is_prev_set_ = false;
    } else {
      prev_key_.SetKey(internal_key);
      is_prev_set_ = true;
+      is_prev_inclusive_ = true;
    }
  } else if (current_ && current_ != mutable_iter_) {
    // current_ is one of immutable iterators, push it back to the heap
@ -334,24 +343,33 @@ void ForwardIterator::Next() {
    std::string current_key = key().ToString();
    Slice old_key(current_key.data(), current_key.size());

-    RebuildIterators();
+    RebuildIterators(true);
    SeekInternal(old_key, false);
    if (!valid_ || key().compare(old_key) != 0) {
      return;
    }
  } else if (current_ != mutable_iter_) {
    // It is going to advance immutable iterator
+
+    bool update_prev_key = true;
+    if (is_prev_set_ && prefix_extractor_) {
+      // advance prev_key_ to current_ only if they share the same prefix
+      update_prev_key =
+        prefix_extractor_->Transform(prev_key_.GetKey()).compare(
+          prefix_extractor_->Transform(current_->key())) == 0;
+    }
+
+    if (update_prev_key) {
      prev_key_.SetKey(current_->key());
      is_prev_set_ = true;
+      is_prev_inclusive_ = false;
+    }
  }

  current_->Next();
  if (current_ != mutable_iter_) {
-    if (current_->status().IsIncomplete()) {
-      assert(read_options_.read_tier == kBlockCacheTier);
-      status_ = current_->status();
-      valid_ = false;
-      return;
+    if (!current_->status().ok()) {
+      immutable_status_ = current_->status();
    } else if (current_->Valid()) {
      immutable_min_heap_.push(current_);
    }
@ -377,45 +395,35 @@ Status ForwardIterator::status() const {
    return mutable_iter_->status();
  }

-  for (auto *it : imm_iters_) {
-    if (it && !it->status().ok()) {
-      return it->status();
-    }
-  }
-  for (auto *it : l0_iters_) {
-    if (it && !it->status().ok()) {
-      return it->status();
-    }
-  }
-  for (auto *it : level_iters_) {
-    if (it && !it->status().ok()) {
-      return it->status();
-    }
-  }
-
-  return Status::OK();
+  return immutable_status_;
 }

-void ForwardIterator::RebuildIterators() {
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
  // Clean up
-  Cleanup();
+  Cleanup(refresh_sv);
+  if (refresh_sv) {
    // New
    sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
-  mutable_iter_ = sv_->mem->NewIterator(read_options_);
-  sv_->imm->AddIterators(read_options_, &imm_iters_);
-  const auto& l0_files = sv_->current->files_[0];
+  }
+  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+
+  const auto* vstorage = sv_->current->storage_info();
+  const auto& l0_files = vstorage->LevelFiles(0);
  l0_iters_.reserve(l0_files.size());
  for (const auto* l0 : l0_files) {
    l0_iters_.push_back(cfd_->table_cache()->NewIterator(
        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd));
  }
-  level_iters_.reserve(sv_->current->NumberLevels() - 1);
-  for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) {
-    if (sv_->current->files_[level].empty()) {
+  level_iters_.reserve(vstorage->num_levels() - 1);
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    const auto& level_files = vstorage->LevelFiles(level);
+
+    if (level_files.empty()) {
      level_iters_.push_back(nullptr);
    } else {
-      level_iters_.push_back(new LevelIterator(cfd_, read_options_,
-          sv_->current->files_[level]));
+      level_iters_.push_back(
+          new LevelIterator(cfd_, read_options_, level_files));
    }
  }

@ -424,7 +432,7 @@ void ForwardIterator::RebuildIterators() {
 }

 void ForwardIterator::ResetIncompleteIterators() {
-  const auto& l0_files = sv_->current->files_[0];
+  const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
  for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
    assert(i < l0_files.size());
    if (!l0_iters_[i]->status().IsIncomplete()) {
@ -474,7 +482,14 @@ void ForwardIterator::UpdateCurrent() {
 }

 bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
-  if (!valid_ || !is_prev_set_) {
+  // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+  // such that there are no records with keys within that range in
+  // immutable_min_heap_. Since immutable structures (SST files and immutable
+  // memtables) can't change in this version, we don't need to do a seek if
+  // 'target' belongs to that interval (immutable_min_heap_.top() is already
+  // at the correct position).
+
+  if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
    return true;
  }
  Slice prev_key = prev_key_.GetKey();
@ -483,11 +498,15 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
    return true;
  }
  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
-        prev_key, target) >= 0) {
+        prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
    return true;
  }
-  if (immutable_min_heap_.empty() ||
-      cfd_->internal_comparator().InternalKeyComparator::Compare(
+
+  if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+    // Nothing to seek on.
+    return false;
+  }
+  if (cfd_->internal_comparator().InternalKeyComparator::Compare(
        target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
                                          : current_->key()) > 0) {
    return true;
--- a/db/forward_iterator.h
+++ b/db/forward_iterator.h
@ -14,6 +14,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "db/dbformat.h"
+#include "util/arena.h"

 namespace rocksdb {

@ -50,7 +51,7 @@ typedef std::priority_queue<Iterator*,
 class ForwardIterator : public Iterator {
 public:
  ForwardIterator(DBImpl* db, const ReadOptions& read_options,
-                  ColumnFamilyData* cfd);
+                  ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
  virtual ~ForwardIterator();

  void SeekToLast() override {
@ -71,8 +72,8 @@ class ForwardIterator : public Iterator {
  virtual Status status() const override;

 private:
-  void Cleanup();
-  void RebuildIterators();
+  void Cleanup(bool release_sv);
+  void RebuildIterators(bool refresh_sv);
  void ResetIncompleteIterators();
  void SeekInternal(const Slice& internal_key, bool seek_to_first);
  void UpdateCurrent();
@ -96,10 +97,13 @@ class ForwardIterator : public Iterator {
  Iterator* current_;
  // internal iterator status
  Status status_;
+  Status immutable_status_;
  bool valid_;

  IterKey prev_key_;
  bool is_prev_set_;
+  bool is_prev_inclusive_;
+  Arena arena_;
 };

 }  // namespace rocksdb
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@ -7,14 +7,21 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/internal_stats.h"
+
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <vector>
 #include "db/column_family.h"
+
 #include "db/db_impl.h"
+#include "util/string_util.h"

 namespace rocksdb {

+#ifndef ROCKSDB_LITE
 namespace {
 const double kMB = 1048576.0;
 const double kGB = kMB * 1024;
@ -24,18 +31,18 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
      buf, len,
      "\n** Compaction Stats [%s] **\n"
      "Level   Files   Size(MB) Score Read(GB)  Rn(GB) Rnp1(GB) "
-      "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s)  Rn(cnt) "
-      "Rnp1(cnt) Wnp1(cnt) Wnew(cnt)  Comp(sec) Comp(cnt) Avg(sec) "
-      "Stall(sec) Stall(cnt) Avg(ms)\n"
+      "Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) "
+      "Comp(sec) Comp(cnt) Avg(sec) "
+      "Stall(sec) Stall(cnt) Avg(ms)     RecordIn   RecordDrop\n"
      "--------------------------------------------------------------------"
      "--------------------------------------------------------------------"
-      "--------------------------------------------------------------------\n",
+      "----------------------------------------------------------\n",
      cf_name.c_str());
 }

 void PrintLevelStats(char* buf, size_t len, const std::string& name,
    int num_files, int being_compacted, double total_file_size, double score,
-    double rw_amp, double w_amp, double stall_us, uint64_t stalls,
+    double w_amp, double stall_us, uint64_t stalls,
    const InternalStats::CompactionStats& stats) {
  uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1;
  uint64_t bytes_new = stats.bytes_written - stats.bytes_readnp1;
@ -48,40 +55,31 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
           "%8.1f "                    /* Rnp1(GB) */
           "%9.1f "                    /* Write(GB) */
           "%8.1f "                    /* Wnew(GB) */
-    "%6.1f " /* RW-Amp */
+           "%9.1f "                    /* Moved(GB) */
           "%5.1f "                    /* W-Amp */
           "%8.1f "                    /* Rd(MB/s) */
           "%8.1f "                    /* Wr(MB/s) */
-    "%8d " /* Rn(cnt) */
-    "%9d " /* Rnp1(cnt) */
-    "%9d " /* Wnp1(cnt) */
-    "%9d " /* Wnew(cnt) */
-    "%10.0f " /* Comp(sec) */
+           "%9.0f "                   /* Comp(sec) */
           "%9d "                      /* Comp(cnt) */
           "%8.3f "                    /* Avg(sec) */
           "%10.2f "                   /* Stall(sec) */
-    "%10" PRIu64 " " /* Stall(cnt) */
-    "%7.2f\n" /* Avg(ms) */,
-    name.c_str(), num_files, being_compacted, total_file_size / kMB, score,
-    bytes_read / kGB,
-    stats.bytes_readn / kGB,
-    stats.bytes_readnp1 / kGB,
-    stats.bytes_written / kGB,
-    bytes_new / kGB,
-    rw_amp,
-    w_amp,
-    bytes_read / kMB / elapsed,
+           "%10" PRIu64
+           " "      /* Stall(cnt) */
+           "%7.2f " /* Avg(ms) */
+           "%12" PRIu64
+           " " /* input entries */
+           "%12" PRIu64 "\n" /* number of records reduced */,
+           name.c_str(), num_files, being_compacted, total_file_size / kMB,
+           score, bytes_read / kGB, stats.bytes_readn / kGB,
+           stats.bytes_readnp1 / kGB, stats.bytes_written / kGB,
+           bytes_new / kGB, stats.bytes_moved / kGB,
+           w_amp, bytes_read / kMB / elapsed,
           stats.bytes_written / kMB / elapsed,
-    stats.files_in_leveln,
-    stats.files_in_levelnp1,
-    stats.files_out_levelnp1,
-    stats.files_out_levelnp1 - stats.files_in_levelnp1,
-    stats.micros / 1000000.0,
-    stats.count,
+           stats.micros / 1000000.0, stats.count,
           stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count,
-    stall_us / 1000000.0,
-    stalls,
-    stalls == 0 ? 0 : stall_us / 1000.0 / stalls);
+           stall_us / 1000000.0, stalls,
+           stalls == 0 ? 0 : stall_us / 1000.0 / stalls,
+           stats.num_input_records, stats.num_dropped_records);
 }


@ -125,6 +123,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
    return kBackgroundErrors;
  } else if (in == "cur-size-active-mem-table") {
    return kCurSizeActiveMemTable;
+  } else if (in == "cur-size-all-mem-tables") {
+    return kCurSizeAllMemTables;
  } else if (in == "num-entries-active-mem-table") {
    return kNumEntriesInMutableMemtable;
  } else if (in == "num-entries-imm-mem-tables") {
@ -136,6 +136,10 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
    return kEstimatedUsageByTableReaders;
  } else if (in == "is-file-deletions-enabled") {
    return kIsFileDeletionEnabled;
+  } else if (in == "num-snapshots") {
+    return kNumSnapshots;
+  } else if (in == "oldest-snapshot-time") {
+    return kOldestSnapshotTime;
  }
  return kUnknown;
 }
@ -159,7 +163,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
                                      const Slice& property,
                                      std::string* value) {
  assert(value != nullptr);
-  Version* current = cfd_->current();
+  auto* current = cfd_->current();
+  const auto* vstorage = current->storage_info();
  Slice in = property;

  switch (property_type) {
@ -172,7 +177,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
      } else {
        char buf[100];
        snprintf(buf, sizeof(buf), "%d",
-                 current->NumLevelFiles(static_cast<int>(level)));
+                 vstorage->NumLevelFiles(static_cast<int>(level)));
        *value = buf;
        return true;
      }
@ -186,8 +191,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,

      for (int level = 0; level < number_levels_; level++) {
        snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
-                 current->NumLevelFiles(level),
-                 current->NumLevelBytes(level) / kMB);
+                 vstorage->NumLevelFiles(level),
+                 vstorage->NumLevelBytes(level) / kMB);
        value->append(buf);
      }
      return true;
@ -219,7 +224,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,

 bool InternalStats::GetIntProperty(DBPropertyType property_type,
                                   uint64_t* value, DBImpl* db) const {
-  Version* current = cfd_->current();
+  const auto* vstorage = cfd_->current()->storage_info();

  switch (property_type) {
    case kNumImmutableMemTable:
@ -232,7 +237,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
    case kCompactionPending:
      // 1 if the system already determines at least one compacdtion is needed.
      // 0 otherwise,
-      *value = (current->NeedsCompaction() ? 1 : 0);
+      *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
      return true;
    case kBackgroundErrors:
      // Accumulated number of  errors in background flushes or compactions.
@ -242,12 +247,17 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
      // Current size of the active memtable
      *value = cfd_->mem()->ApproximateMemoryUsage();
      return true;
+    case kCurSizeAllMemTables:
+      // Current size of the active memtable + immutable memtables
+      *value = cfd_->mem()->ApproximateMemoryUsage() +
+               cfd_->imm()->ApproximateMemoryUsage();
+      return true;
    case kNumEntriesInMutableMemtable:
-      // Current size of the active memtable
+      // Current number of entires in the active memtable
      *value = cfd_->mem()->GetNumEntries();
      return true;
    case kNumEntriesInImmutableMemtable:
-      // Current size of the active memtable
+      // Current number of entries in the immutable memtables
      *value = cfd_->imm()->current()->GetTotalNumEntries();
      return true;
    case kEstimatedNumKeys:
@ -255,11 +265,19 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
      // Use estimated entries in tables + total entries in memtables.
      *value = cfd_->mem()->GetNumEntries() +
               cfd_->imm()->current()->GetTotalNumEntries() +
-               current->GetEstimatedActiveKeys();
+               vstorage->GetEstimatedActiveKeys();
+      return true;
+    case kNumSnapshots:
+      *value = db->snapshots().count();
      return true;
+    case kOldestSnapshotTime:
+      *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+      return true;
+#ifndef ROCKSDB_LITE
    case kIsFileDeletionEnabled:
      *value = db->IsFileDeletionsEnabled();
      return true;
+#endif
    default:
      return false;
  }
@ -276,18 +294,29 @@ void InternalStats::DumpDBStats(std::string* value) {
  value->append(buf);
  // Cumulative
  uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN];
+  uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN];
  uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER];
  uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF];
  uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES];
  uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED];
  uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL];
+  uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS];
  // Data
+  // writes: total number of write requests.
+  // keys: total number of key updates issued by all the write requests
+  // batches: number of group commits issued to the DB. Each group can contain
+  //          one or more writes.
+  // so writes/keys is the average number of put in multi-put or put
+  // writes/batches is the average group commit size.
+  //
+  // The format is the same for interval stats.
  snprintf(buf, sizeof(buf),
-           "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, "
-           "%.1f writes per batch, %.2f GB user ingest\n",
-           write_other + write_self, write_self,
+           "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
+           " batches, %.1f writes per batch, %.2f GB user ingest, "
+           "stall micros: %" PRIu64 "\n",
+           write_other + write_self, num_keys_written, write_self,
           (write_other + write_self) / static_cast<double>(write_self + 1),
-           user_bytes_written / kGB);
+           user_bytes_written / kGB, write_stall_micros);
  value->append(buf);
  // WAL
  snprintf(buf, sizeof(buf),
@ -301,14 +330,18 @@ void InternalStats::DumpDBStats(std::string* value) {
  // Interval
  uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
  uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+  uint64_t interval_num_keys_written =
+      num_keys_written - db_stats_snapshot_.num_keys_written;
  snprintf(buf, sizeof(buf),
-           "Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, "
-           "%.1f writes per batch, %.1f MB user ingest\n",
+           "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64
+           " batches, %.1f writes per batch, %.1f MB user ingest, "
+           "stall micros: %" PRIu64 "\n",
           interval_write_other + interval_write_self,
-           interval_write_self,
+           interval_num_keys_written, interval_write_self,
           static_cast<double>(interval_write_other + interval_write_self) /
               (interval_write_self + 1),
-           (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB);
+           (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+           write_stall_micros - db_stats_snapshot_.write_stall_micros);
  value->append(buf);

  uint64_t interval_write_with_wal =
@ -330,30 +363,33 @@ void InternalStats::DumpDBStats(std::string* value) {
  db_stats_snapshot_.ingest_bytes = user_bytes_written;
  db_stats_snapshot_.write_other = write_other;
  db_stats_snapshot_.write_self = write_self;
+  db_stats_snapshot_.num_keys_written = num_keys_written;
  db_stats_snapshot_.wal_bytes = wal_bytes;
  db_stats_snapshot_.wal_synced = wal_synced;
  db_stats_snapshot_.write_with_wal = write_with_wal;
+  db_stats_snapshot_.write_stall_micros = write_stall_micros;
 }

 void InternalStats::DumpCFStats(std::string* value) {
-  Version* current = cfd_->current();
+  const VersionStorageInfo* vstorage = cfd_->current()->storage_info();

  int num_levels_to_check =
-      (cfd_->options()->compaction_style != kCompactionStyleUniversal &&
-       cfd_->options()->compaction_style != kCompactionStyleFIFO)
-          ? current->NumberLevels() - 1
+      (cfd_->ioptions()->compaction_style != kCompactionStyleUniversal &&
+       cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+          ? vstorage->num_levels() - 1
          : 1;
+
  // Compaction scores are sorted base on its value. Restore them to the
  // level order
  std::vector<double> compaction_score(number_levels_, 0);
  for (int i = 0; i < num_levels_to_check; ++i) {
-    compaction_score[current->compaction_level_[i]] =
-      current->compaction_score_[i];
+    compaction_score[vstorage->CompactionScoreLevel(i)] =
+        vstorage->CompactionScore(i);
  }
  // Count # of files being compacted for each level
  std::vector<int> files_being_compacted(number_levels_, 0);
  for (int level = 0; level < num_levels_to_check; ++level) {
-    for (auto* f : current->files_[level]) {
+    for (auto* f : vstorage->LevelFiles(level)) {
      if (f->being_compacted) {
        ++files_being_compacted[level];
      }
@ -376,7 +412,7 @@ void InternalStats::DumpCFStats(std::string* value) {
  uint64_t total_stall_count = 0;
  double total_stall_us = 0;
  for (int level = 0; level < number_levels_; level++) {
-    int files = current->NumLevelFiles(level);
+    int files = vstorage->NumLevelFiles(level);
    total_files += files;
    total_files_being_compacted += files_being_compacted[level];
    if (comp_stats_[level].micros > 0 || files > 0) {
@ -395,36 +431,29 @@ void InternalStats::DumpCFStats(std::string* value) {
            stall_leveln_slowdown_hard_[level]);

      stats_sum.Add(comp_stats_[level]);
-      total_file_size += current->NumLevelBytes(level);
+      total_file_size += vstorage->NumLevelBytes(level);
      total_stall_us += stall_us;
      total_stall_count += stalls;
      total_slowdown_soft += stall_leveln_slowdown_soft_[level];
      total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level];
      total_slowdown_hard += stall_leveln_slowdown_hard_[level];
      total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level];
-      int64_t bytes_read = comp_stats_[level].bytes_readn +
-                           comp_stats_[level].bytes_readnp1;
-      double rw_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
-          : (comp_stats_[level].bytes_written + bytes_read) /
-            static_cast<double>(comp_stats_[level].bytes_readn);
      double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
          : comp_stats_[level].bytes_written /
            static_cast<double>(comp_stats_[level].bytes_readn);
-      PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level),
-          files, files_being_compacted[level], current->NumLevelBytes(level),
-          compaction_score[level], rw_amp, w_amp, stall_us, stalls,
-          comp_stats_[level]);
+      PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files,
+                      files_being_compacted[level],
+                      vstorage->NumLevelBytes(level), compaction_score[level],
+                      w_amp, stall_us, stalls, comp_stats_[level]);
      value->append(buf);
    }
  }
  uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED];
  // Cumulative summary
-  double rw_amp = (stats_sum.bytes_written + stats_sum.bytes_readn +
-      stats_sum.bytes_readnp1) / static_cast<double>(curr_ingest + 1);
  double w_amp = stats_sum.bytes_written / static_cast<double>(curr_ingest + 1);
  // Stats summary across levels
  PrintLevelStats(buf, sizeof(buf), "Sum", total_files,
-      total_files_being_compacted, total_file_size, 0, rw_amp, w_amp,
+      total_files_being_compacted, total_file_size, 0, w_amp,
      total_stall_us, total_stall_count, stats_sum);
  value->append(buf);
  // Interval summary
@ -432,12 +461,9 @@ void InternalStats::DumpCFStats(std::string* value) {
      curr_ingest - cf_stats_snapshot_.ingest_bytes + 1;
  CompactionStats interval_stats(stats_sum);
  interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
-  rw_amp = (interval_stats.bytes_written +
-      interval_stats.bytes_readn + interval_stats.bytes_readnp1) /
-      static_cast<double>(interval_ingest);
  w_amp = interval_stats.bytes_written / static_cast<double>(interval_ingest);
  PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0,
-      rw_amp, w_amp, total_stall_us - cf_stats_snapshot_.stall_us,
+      w_amp, total_stall_us - cf_stats_snapshot_.stall_us,
      total_stall_count - cf_stats_snapshot_.stall_count, interval_stats);
  value->append(buf);

@ -473,4 +499,14 @@ void InternalStats::DumpCFStats(std::string* value) {
  cf_stats_snapshot_.stall_count = total_stall_count;
 }

+
+#else
+
+DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
+                               bool* need_out_of_mutex) {
+  return kUnknown;
+}
+
+#endif  // !ROCKSDB_LITE
+
 }  // namespace rocksdb
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@ -21,6 +21,8 @@ namespace rocksdb {
 class MemTableList;
 class DBImpl;

+// IMPORTANT: If you add a new property here, also add it to the list in
+//            include/rocksdb/db.h
 enum DBPropertyType : uint32_t {
  kUnknown,
  kNumFilesAtLevel,  // Number of files at a specific level
@ -36,6 +38,8 @@ enum DBPropertyType : uint32_t {
  kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
  kBackgroundErrors,       // Return accumulated background errors encountered.
  kCurSizeActiveMemTable,  // Return current size of the active memtable
+  kCurSizeAllMemTables,    // Return current size of all (active + immutable)
+                           // memtables
  kNumEntriesInMutableMemtable,    // Return number of entries in the mutable
                                   // memtable.
  kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
@ -44,12 +48,16 @@ enum DBPropertyType : uint32_t {
  kEstimatedUsageByTableReaders,  // Estimated memory by table readers.
  kIsFileDeletionEnabled,         // Equals disable_delete_obsolete_files_,
                                  // 0 means file deletions enabled
+  kNumSnapshots,                  // Number of snapshots in the system
+  kOldestSnapshotTime,            // Unix timestamp of the first snapshot
 };

 extern DBPropertyType GetPropertyType(const Slice& property,
                                      bool* is_int_property,
                                      bool* need_out_of_mutex);

+
+#ifndef ROCKSDB_LITE
 class InternalStats {
 public:
  enum InternalCFStatsType {
@ -65,9 +73,11 @@ class InternalStats {
    WAL_FILE_BYTES,
    WAL_FILE_SYNCED,
    BYTES_WRITTEN,
+    NUMBER_KEYS_WRITTEN,
    WRITE_DONE_BY_OTHER,
    WRITE_DONE_BY_SELF,
    WRITE_WITH_WAL,
+    WRITE_STALL_MICROS,
    INTERNAL_DB_STATS_ENUM_MAX,
  };

@ -114,6 +124,9 @@ class InternalStats {
    // Total bytes written during compaction between levels N and N+1
    uint64_t bytes_written;

+    // Total bytes moved to this level
+    uint64_t bytes_moved;
+
    // Files read from level N during compaction between levels N and N+1
    int files_in_leveln;

@ -123,27 +136,40 @@ class InternalStats {
    // Files written during compaction between levels N and N+1
    int files_out_levelnp1;

+    // Total incoming entries during compaction between levels N and N+1
+    uint64_t num_input_records;
+
+    // Accumulated diff number of entries
+    // (num input entries - num output entires) for compaction  levels N and N+1
+    uint64_t num_dropped_records;
+
    // Number of compactions done
    int count;

-    explicit CompactionStats(int count = 0)
+    explicit CompactionStats(int _count = 0)
        : micros(0),
          bytes_readn(0),
          bytes_readnp1(0),
          bytes_written(0),
+          bytes_moved(0),
          files_in_leveln(0),
          files_in_levelnp1(0),
          files_out_levelnp1(0),
-          count(count) {}
+          num_input_records(0),
+          num_dropped_records(0),
+          count(_count) {}

    explicit CompactionStats(const CompactionStats& c)
        : micros(c.micros),
          bytes_readn(c.bytes_readn),
          bytes_readnp1(c.bytes_readnp1),
          bytes_written(c.bytes_written),
+          bytes_moved(c.bytes_moved),
          files_in_leveln(c.files_in_leveln),
          files_in_levelnp1(c.files_in_levelnp1),
          files_out_levelnp1(c.files_out_levelnp1),
+          num_input_records(c.num_input_records),
+          num_dropped_records(c.num_dropped_records),
          count(c.count) {}

    void Add(const CompactionStats& c) {
@ -151,9 +177,12 @@ class InternalStats {
      this->bytes_readn += c.bytes_readn;
      this->bytes_readnp1 += c.bytes_readnp1;
      this->bytes_written += c.bytes_written;
+      this->bytes_moved += c.bytes_moved;
      this->files_in_leveln += c.files_in_leveln;
      this->files_in_levelnp1 += c.files_in_levelnp1;
      this->files_out_levelnp1 += c.files_out_levelnp1;
+      this->num_input_records += c.num_input_records;
+      this->num_dropped_records += c.num_dropped_records;
      this->count += c.count;
    }

@ -162,9 +191,12 @@ class InternalStats {
      this->bytes_readn -= c.bytes_readn;
      this->bytes_readnp1 -= c.bytes_readnp1;
      this->bytes_written -= c.bytes_written;
+      this->bytes_moved -= c.bytes_moved;
      this->files_in_leveln -= c.files_in_leveln;
      this->files_in_levelnp1 -= c.files_in_levelnp1;
      this->files_out_levelnp1 -= c.files_out_levelnp1;
+      this->num_input_records -= c.num_input_records;
+      this->num_dropped_records -= c.num_dropped_records;
      this->count -= c.count;
    }
  };
@ -173,6 +205,10 @@ class InternalStats {
    comp_stats_[level].Add(stats);
  }

+  void IncBytesMoved(int level, uint64_t amount) {
+    comp_stats_[level].bytes_moved += amount;
+  }
+
  void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {
    if (soft) {
      stall_leveln_slowdown_soft_[level] += micros;
@ -247,6 +283,13 @@ class InternalStats {
    // another thread.
    uint64_t write_other;
    uint64_t write_self;
+    // Total number of keys written. write_self and write_other measure number
+    // of write requests written, Each of the write request can contain updates
+    // to multiple keys. num_keys_written is total number of keys updated by all
+    // those writes.
+    uint64_t num_keys_written;
+    // Total time writes delayed by stalls.
+    uint64_t write_stall_micros;
    double seconds_up;

    DBStatsSnapshot()
@ -256,6 +299,8 @@ class InternalStats {
          write_with_wal(0),
          write_other(0),
          write_self(0),
+          num_keys_written(0),
+          write_stall_micros(0),
          seconds_up(0) {}
  } db_stats_snapshot_;

@ -272,4 +317,78 @@ class InternalStats {
  const uint64_t started_at_;
 };

+#else
+
+class InternalStats {
+ public:
+  enum InternalCFStatsType {
+    LEVEL0_SLOWDOWN,
+    MEMTABLE_COMPACTION,
+    LEVEL0_NUM_FILES,
+    WRITE_STALLS_ENUM_MAX,
+    BYTES_FLUSHED,
+    INTERNAL_CF_STATS_ENUM_MAX,
+  };
+
+  enum InternalDBStatsType {
+    WAL_FILE_BYTES,
+    WAL_FILE_SYNCED,
+    BYTES_WRITTEN,
+    NUMBER_KEYS_WRITTEN,
+    WRITE_DONE_BY_OTHER,
+    WRITE_DONE_BY_SELF,
+    WRITE_WITH_WAL,
+    WRITE_STALL_MICROS,
+    INTERNAL_DB_STATS_ENUM_MAX,
+  };
+
+  InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {}
+
+  struct CompactionStats {
+    uint64_t micros;
+    uint64_t bytes_readn;
+    uint64_t bytes_readnp1;
+    uint64_t bytes_written;
+    uint64_t bytes_moved;
+    int files_in_leveln;
+    int files_in_levelnp1;
+    int files_out_levelnp1;
+    uint64_t num_input_records;
+    uint64_t num_dropped_records;
+    int count;
+
+    explicit CompactionStats(int _count = 0) {}
+
+    explicit CompactionStats(const CompactionStats& c) {}
+
+    void Add(const CompactionStats& c) {}
+
+    void Subtract(const CompactionStats& c) {}
+  };
+
+  void AddCompactionStats(int level, const CompactionStats& stats) {}
+
+  void IncBytesMoved(int level, uint64_t amount) {}
+
+  void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {}
+
+  void AddCFStats(InternalCFStatsType type, uint64_t value) {}
+
+  void AddDBStats(InternalDBStatsType type, uint64_t value) {}
+
+  uint64_t GetBackgroundErrorCount() const { return 0; }
+
+  uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+  bool GetStringProperty(DBPropertyType property_type, const Slice& property,
+                         std::string* value) { return false; }
+
+  bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
+                      DBImpl* db) const { return false; }
+
+  bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
+                                uint64_t* value) const { return false; }
+};
+#endif  // !ROCKSDB_LITE
+
 }  // namespace rocksdb
--- a/db/job_context.h
+++ b/db/job_context.h
@ -0,0 +1,103 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+class MemTable;
+
+struct JobContext {
+  inline bool HaveSomethingToDelete() const {
+    return full_scan_candidate_files.size() || sst_delete_files.size() ||
+           log_delete_files.size() || new_superversion != nullptr ||
+           superversions_to_free.size() > 0 || memtables_to_free.size() > 0;
+  }
+
+  // Structure to store information for candidate files to delete.
+  struct CandidateFileInfo {
+    std::string file_name;
+    uint32_t path_id;
+    CandidateFileInfo(std::string name, uint32_t path)
+        : file_name(std::move(name)), path_id(path) {}
+    bool operator==(const CandidateFileInfo& other) const {
+      return file_name == other.file_name && path_id == other.path_id;
+    }
+  };
+
+  // a list of all files that we'll consider deleting
+  // (every once in a while this is filled up with all files
+  // in the DB directory)
+  // (filled only if we're doing full scan)
+  std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+  // the list of all live sst files that cannot be deleted
+  std::vector<FileDescriptor> sst_live;
+
+  // a list of sst files that we need to delete
+  std::vector<FileMetaData*> sst_delete_files;
+
+  // a list of log files that we need to delete
+  std::vector<uint64_t> log_delete_files;
+
+  // a list of memtables to be free
+  autovector<MemTable*> memtables_to_free;
+
+  autovector<SuperVersion*> superversions_to_free;
+
+  SuperVersion* new_superversion;  // if nullptr no new superversion
+
+  // the current manifest_file_number, log_number and prev_log_number
+  // that corresponds to the set of files in 'live'.
+  uint64_t manifest_file_number;
+  uint64_t pending_manifest_file_number;
+  uint64_t log_number;
+  uint64_t prev_log_number;
+
+  uint64_t min_pending_output = 0;
+
+  explicit JobContext(bool create_superversion = false) {
+    manifest_file_number = 0;
+    pending_manifest_file_number = 0;
+    log_number = 0;
+    prev_log_number = 0;
+    new_superversion = create_superversion ? new SuperVersion() : nullptr;
+  }
+
+  void Clean() {
+    // free pending memtables
+    for (auto m : memtables_to_free) {
+      delete m;
+    }
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    // if new_superversion was not used, it will be non-nullptr and needs
+    // to be freed here
+    delete new_superversion;
+
+    memtables_to_free.clear();
+    superversions_to_free.clear();
+    new_superversion = nullptr;
+  }
+
+  ~JobContext() {
+    assert(memtables_to_free.size() == 0);
+    assert(superversions_to_free.size() == 0);
+    assert(new_superversion == nullptr);
+  }
+};
+
+}  // namespace rocksdb
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@ -0,0 +1,401 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based_table_factory.h"
+#include "table/plain_table_factory.h"
+#include "util/hash.h"
+#include "util/hash_linklist_rep.h"
+#include "utilities/merge_operators.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/statistics.h"
+#include "util/testharness.h"
+#include "util/sync_point.h"
+#include "util/testutil.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace rocksdb {
+
+class EventListenerTest {
+ public:
+  EventListenerTest() {
+    dbname_ = test::TmpDir() + "/listener_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~EventListenerTest() {
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const ColumnFamilyOptions* options = nullptr) {
+    ColumnFamilyOptions cf_opts;
+    cf_opts = ColumnFamilyOptions(Options());
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options* options = nullptr) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options* options = nullptr) {
+    Close();
+    Options opts = (options == nullptr) ? Options() : *options;
+    std::vector<const Options*> v_opts(cfs.size(), &opts);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<const Options*>& options) {
+    Close();
+    ASSERT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i]));
+    }
+    DBOptions db_opts = DBOptions(*options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    Close();
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts.create_if_missing = true;
+    }
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options* options = nullptr) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Flush(int cf = 0) {
+    FlushOptions opt = FlushOptions();
+    opt.wait = true;
+    if (cf == 0) {
+      return db_->Flush(opt);
+    } else {
+      return db_->Flush(opt, handles_[cf]);
+    }
+  }
+
+  DB* db_;
+  std::string dbname_;
+  std::vector<ColumnFamilyHandle*> handles_;
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+  void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
+    compacted_dbs_.push_back(db);
+  }
+
+  std::vector<DB*> compacted_dbs_;
+};
+
+TEST(EventListenerTest, OnSingleDBCompactionTest) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  const int kNumL0Files = 4;
+
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+
+  TestCompactionListener* listener = new TestCompactionListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    const Slice kStart = "a";
+    const Slice kEnd = "z";
+    ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd));
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+
+  ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+  }
+}
+
+class TestFlushListener : public EventListener {
+ public:
+  void OnFlushCompleted(
+      DB* db, const std::string& name,
+      const std::string& file_path,
+      bool triggered_writes_slowdown,
+      bool triggered_writes_stop) override {
+    flushed_dbs_.push_back(db);
+    flushed_column_family_names_.push_back(name);
+    if (triggered_writes_slowdown) {
+      slowdown_count++;
+    }
+    if (triggered_writes_stop) {
+      stop_count++;
+    }
+  }
+
+  std::vector<std::string> flushed_column_family_names_;
+  std::vector<DB*> flushed_dbs_;
+  int slowdown_count;
+  int stop_count;
+};
+
+TEST(EventListenerTest, OnSingleDBFlushTest) {
+  Options options;
+  options.write_buffer_size = 100000;
+  TestFlushListener* listener = new TestFlushListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure call-back functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST(EventListenerTest, MultiCF) {
+  Options options;
+  options.write_buffer_size = 100000;
+  TestFlushListener* listener = new TestFlushListener();
+  options.listeners.emplace_back(listener);
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+  CreateAndReopenWithCF(cf_names, &options);
+
+  ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+  ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+  ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+  ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+  ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+  ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+  ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+  for (size_t i = 1; i < 8; ++i) {
+    ASSERT_OK(Flush(static_cast<int>(i)));
+    ASSERT_EQ(listener->flushed_dbs_.size(), i);
+    ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+  }
+
+  // make sure call-back functions are called in the right order
+  for (size_t i = 0; i < cf_names.size(); i++) {
+    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+  }
+}
+
+TEST(EventListenerTest, MultiDBMultiListeners) {
+  std::vector<TestFlushListener*> listeners;
+  const int kNumDBs = 5;
+  const int kNumListeners = 10;
+  for (int i = 0; i < kNumListeners; ++i) {
+    listeners.emplace_back(new TestFlushListener());
+  }
+
+  std::vector<std::string> cf_names = {
+      "pikachu", "ilya", "muromec", "dobrynia",
+      "nikitich", "alyosha", "popovich"};
+
+  Options options;
+  options.create_if_missing = true;
+  for (int i = 0; i < kNumListeners; ++i) {
+    options.listeners.emplace_back(listeners[i]);
+  }
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  std::vector<DB*> dbs;
+  std::vector<std::vector<ColumnFamilyHandle *>> vec_handles;
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    ASSERT_OK(DestroyDB(dbname_ + ToString(d), options));
+    DB* db;
+    std::vector<ColumnFamilyHandle*> handles;
+    ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db));
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ColumnFamilyHandle* handle;
+      db->CreateColumnFamily(cf_opts, cf_names[c], &handle);
+      handles.push_back(handle);
+    }
+
+    vec_handles.push_back(std::move(handles));
+    dbs.push_back(db);
+  }
+
+  for (int d = 0; d < kNumDBs; ++d) {
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c],
+                cf_names[c], cf_names[c]));
+    }
+  }
+
+  for (size_t c = 0; c < cf_names.size(); ++c) {
+    for (int d = 0; d < kNumDBs; ++d) {
+      ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+      reinterpret_cast<DBImpl*>(dbs[d])->TEST_WaitForFlushMemTable();
+    }
+  }
+
+  for (auto* listener : listeners) {
+    int pos = 0;
+    for (size_t c = 0; c < cf_names.size(); ++c) {
+      for (int d = 0; d < kNumDBs; ++d) {
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+        pos++;
+      }
+    }
+  }
+
+  for (auto handles : vec_handles) {
+    for (auto h : handles) {
+      delete h;
+    }
+    handles.clear();
+  }
+  vec_handles.clear();
+
+  for (auto db : dbs) {
+    delete db;
+  }
+}
+
+TEST(EventListenerTest, DisableBGCompaction) {
+  Options options;
+  TestFlushListener* listener = new TestFlushListener();
+  const int kSlowdownTrigger = 5;
+  const int kStopTrigger = 10;
+  options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+  options.level0_stop_writes_trigger = kStopTrigger;
+  options.listeners.emplace_back(listener);
+  // BG compaction is disabled.  Number of L0 files will simply keeps
+  // increasing in this test.
+  options.compaction_style = kCompactionStyleNone;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 100000;  // Small write buffer
+
+  CreateAndReopenWithCF({"pikachu"}, &options);
+  WriteOptions wopts;
+  wopts.timeout_hint_us = 100000;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  // keep writing until writes are forced to stop.
+  for (int i = 0; static_cast<int>(cf_meta.file_count) < kStopTrigger; ++i) {
+    Put(1, ToString(i), std::string(100000, 'x'), wopts);
+    db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  }
+  ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger);
+  ASSERT_GE(listener->stop_count, 1);
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
+
--- a/db/log_and_apply_bench.cc
+++ b/db/log_and_apply_bench.cc
@ -6,25 +6,33 @@

 #include <vector>

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include "util/testharness.h"
 #include "util/benchharness.h"
 #include "db/version_set.h"
+#include "db/write_controller.h"
+#include "db/writebuffer.h"
 #include "util/mutexlock.h"

 namespace rocksdb {

-std::string MakeKey(unsigned int num) {
+std::string MakeKey(uint64_t num) {
  char buf[30];
-  snprintf(buf, sizeof(buf), "%016u", num);
+  snprintf(buf, sizeof(buf), "%016" PRIu64, num);
  return std::string(buf);
 }

 void BM_LogAndApply(int iters, int num_base_files) {
  VersionSet* vset;
+  WriteController wc;
  ColumnFamilyData* default_cfd;
  uint64_t fnum = 1;
-  port::Mutex mu;
-  MutexLock l(&mu);
+  InstrumentedMutex mu;
+  InstrumentedMutexLock l(&mu);

  BENCHMARK_SUSPEND {
    std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark";
@ -45,9 +53,10 @@ void BM_LogAndApply(int iters, int num_base_files) {
    // Notice we are using the default options not through SanitizeOptions().
    // We might want to initialize some options manually if needed.
    options.db_paths.emplace_back(dbname, 0);
+    WriteBuffer wb(options.db_write_buffer_size);
    // The parameter of table cache is passed in as null, so any file I/O
    // operation is likely to fail.
-    vset = new VersionSet(dbname, &options, sopt, nullptr);
+    vset = new VersionSet(dbname, &options, sopt, nullptr, &wb, &wc);
    std::vector<ColumnFamilyDescriptor> dummy;
    dummy.push_back(ColumnFamilyDescriptor());
    ASSERT_OK(vset->Recover(dummy));
@ -58,7 +67,8 @@ void BM_LogAndApply(int iters, int num_base_files) {
      InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
      vbase.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
    }
-    ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu));
+    ASSERT_OK(vset->LogAndApply(default_cfd,
+          *default_cfd->GetLatestMutableCFOptions(), &vbase, &mu));
  }

  for (int i = 0; i < iters; i++) {
@ -67,8 +77,10 @@ void BM_LogAndApply(int iters, int num_base_files) {
    InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
    InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
    vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
-    vset->LogAndApply(default_cfd, &vedit, &mu);
+    vset->LogAndApply(default_cfd, *default_cfd->GetLatestMutableCFOptions(),
+                      &vedit, &mu);
  }
+  delete vset;
 }

 BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@ -20,9 +20,9 @@ namespace log {
 Reader::Reporter::~Reporter() {
 }

-Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+Reader::Reader(unique_ptr<SequentialFile>&& _file, Reporter* reporter,
               bool checksum, uint64_t initial_offset)
-    : file_(std::move(file)),
+    : file_(std::move(_file)),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),
@ -32,8 +32,7 @@ Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
      eof_offset_(0),
      last_record_offset_(0),
      end_of_buffer_offset_(0),
-      initial_offset_(initial_offset) {
-}
+      initial_offset_(initial_offset) {}

 Reader::~Reader() {
  delete[] backing_store_;
@ -55,7 +54,7 @@ bool Reader::SkipToInitialBlock() {
  if (block_start_location > 0) {
    Status skip_status = file_->Skip(block_start_location);
    if (!skip_status.ok()) {
-      ReportDrop(block_start_location, skip_status);
+      ReportDrop(static_cast<size_t>(block_start_location), skip_status);
      return false;
    }
  }
--- a/db/log_test.cc
+++ b/db/log_test.cc
@ -558,9 +558,9 @@ TEST(LogTest, ErrorJoinsRecords) {

  ASSERT_EQ("correct", Read());
  ASSERT_EQ("EOF", Read());
-  const unsigned int dropped = DroppedBytes();
-  ASSERT_LE(dropped, 2*kBlockSize + 100);
-  ASSERT_GE(dropped, 2*kBlockSize);
+  size_t dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2 * kBlockSize + 100);
+  ASSERT_GE(dropped, 2 * kBlockSize);
 }

 TEST(LogTest, ReadStart) {
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -15,6 +15,7 @@

 #include "db/dbformat.h"
 #include "db/merge_context.h"
+#include "db/writebuffer.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
@ -31,41 +32,63 @@

 namespace rocksdb {

-MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+MemTableOptions::MemTableOptions(
+    const ImmutableCFOptions& ioptions,
+    const MutableCFOptions& mutable_cf_options)
+  : write_buffer_size(mutable_cf_options.write_buffer_size),
+    arena_block_size(mutable_cf_options.arena_block_size),
+    memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits),
+    memtable_prefix_bloom_probes(
+        mutable_cf_options.memtable_prefix_bloom_probes),
+    memtable_prefix_bloom_huge_page_tlb_size(
+        mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
+    inplace_update_support(ioptions.inplace_update_support),
+    inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+    inplace_callback(ioptions.inplace_callback),
+    max_successive_merges(mutable_cf_options.max_successive_merges),
+    filter_deletes(mutable_cf_options.filter_deletes),
+    statistics(ioptions.statistics),
+    merge_operator(ioptions.merge_operator),
+    info_log(ioptions.info_log) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   const ImmutableCFOptions& ioptions,
+                   const MutableCFOptions& mutable_cf_options,
+                   WriteBuffer* write_buffer)
    : comparator_(cmp),
+      moptions_(ioptions, mutable_cf_options),
      refs_(0),
-      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
-      kWriteBufferSize(options.write_buffer_size),
-      arena_(options.arena_block_size),
-      table_(options.memtable_factory->CreateMemTableRep(
-          comparator_, &arena_, options.prefix_extractor.get(),
-          options.info_log.get())),
+      kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+      arena_(moptions_.arena_block_size),
+      allocator_(&arena_, write_buffer),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &allocator_, ioptions.prefix_extractor,
+          ioptions.info_log)),
      num_entries_(0),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
      first_seqno_(0),
      mem_next_logfile_number_(0),
-      locks_(options.inplace_update_support ? options.inplace_update_num_locks
-                                            : 0),
-      prefix_extractor_(options.prefix_extractor.get()),
-      should_flush_(ShouldFlushNow()) {
+      locks_(moptions_.inplace_update_support ?
+             moptions_.inplace_update_num_locks : 0),
+      prefix_extractor_(ioptions.prefix_extractor),
+      should_flush_(ShouldFlushNow()),
+      flush_scheduled_(false) {
  // if should_flush_ == true without an entry inserted, something must have
  // gone wrong already.
  assert(!should_flush_);
-  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+  if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) {
    prefix_bloom_.reset(new DynamicBloom(
-        &arena_,
-        options.memtable_prefix_bloom_bits, options.bloom_locality,
-        options.memtable_prefix_bloom_probes, nullptr,
-        options.memtable_prefix_bloom_huge_page_tlb_size,
-        options.info_log.get()));
+        &allocator_,
+        moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality,
+        moptions_.memtable_prefix_bloom_probes, nullptr,
+        moptions_.memtable_prefix_bloom_huge_page_tlb_size,
+        ioptions.info_log));
  }
 }

-MemTable::~MemTable() {
-  assert(refs_ == 0);
-}
+MemTable::~MemTable() { assert(refs_ == 0); }

 size_t MemTable::ApproximateMemoryUsage() {
  size_t arena_usage = arena_.ApproximateMemoryUsage();
@ -97,14 +120,16 @@ bool MemTable::ShouldFlushNow() const {
  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+      moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
    return false;
  }

-  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
-  // flush earlier even though we still have much available memory left.
-  if (allocated_memory >
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+  // if user keeps adding entries that exceeds moptions.write_buffer_size,
+  // we need to flush earlier even though we still have much available
+  // memory left.
+  if (allocated_memory > moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }

@ -158,7 +183,7 @@ Slice MemTableRep::UserKey(const char* key) const {
 }

 KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
-  *buf = arena_->Allocate(len);
+  *buf = allocator_->Allocate(len);
  return static_cast<KeyHandle>(*buf);
 }

@ -167,7 +192,7 @@ KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
 // into this scratch space.
 const char* EncodeKey(std::string* scratch, const Slice& target) {
  scratch->clear();
-  PutVarint32(scratch, target.size());
+  PutVarint32(scratch, static_cast<uint32_t>(target.size()));
  scratch->append(target.data(), target.size());
  return scratch->data();
 }
@ -175,12 +200,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
 class MemTableIterator: public Iterator {
 public:
  MemTableIterator(
-      const MemTable& mem, const ReadOptions& options, Arena* arena)
+      const MemTable& mem, const ReadOptions& read_options, Arena* arena)
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
        valid_(false),
        arena_mode_(arena != nullptr) {
-    if (prefix_extractor_ != nullptr && !options.total_order_seek) {
+    if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
      bloom_ = mem.prefix_bloom_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
@ -248,14 +273,10 @@ class MemTableIterator: public Iterator {
  void operator=(const MemTableIterator&);
 };

-Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) {
-  if (arena == nullptr) {
-    return new MemTableIterator(*this, options, nullptr);
-  } else {
+Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
+  assert(arena != nullptr);
  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
-    return new (mem)
-        MemTableIterator(*this, options, arena);
-  }
+  return new (mem) MemTableIterator(*this, read_options, arena);
 }

 port::RWMutex* MemTable::GetLock(const Slice& key) {
@ -271,12 +292,12 @@ void MemTable::Add(SequenceNumber s, ValueType type,
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
-  size_t key_size = key.size();
-  size_t val_size = value.size();
-  size_t internal_key_size = key_size + 8;
-  const size_t encoded_len =
-      VarintLength(internal_key_size) + internal_key_size +
-      VarintLength(val_size) + val_size;
+  uint32_t key_size = static_cast<uint32_t>(key.size());
+  uint32_t val_size = static_cast<uint32_t>(value.size());
+  uint32_t internal_key_size = key_size + 8;
+  const uint32_t encoded_len = VarintLength(internal_key_size) +
+                               internal_key_size + VarintLength(val_size) +
+                               val_size;
  char* buf = nullptr;
  KeyHandle handle = table_->Allocate(encoded_len, &buf);
  assert(buf != nullptr);
@ -399,7 +420,6 @@ static bool SaveValue(void* arg, const char* entry) {
          *(s->found_final_value) = true;
          return false;
        }
-        std::string merge_result;  // temporary area for merge results later
        Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
        *(s->merge_in_progress) = true;
        merge_context->PushOperand(v);
@ -416,13 +436,13 @@ static bool SaveValue(void* arg, const char* entry) {
 }

 bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
-                   MergeContext& merge_context, const Options& options) {
+                   MergeContext* merge_context) {
  // The sequence number is updated synchronously in version_set.h
-  if (first_seqno_ == 0) {
+  if (IsEmpty()) {
    // Avoiding recording stats for speed.
    return false;
  }
-  PERF_TIMER_AUTO(get_from_memtable_time);
+  PERF_TIMER_GUARD(get_from_memtable_time);

  Slice user_key = key.user_key();
  bool found_final_value = false;
@ -440,11 +460,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
    saver.value = value;
    saver.status = s;
    saver.mem = this;
-    saver.merge_context = &merge_context;
-    saver.merge_operator = options.merge_operator.get();
-    saver.logger = options.info_log.get();
-    saver.inplace_update_support = options.inplace_update_support;
-    saver.statistics = options.statistics.get();
+    saver.merge_context = merge_context;
+    saver.merge_operator = moptions_.merge_operator;
+    saver.logger = moptions_.info_log;
+    saver.inplace_update_support = moptions_.inplace_update_support;
+    saver.statistics = moptions_.statistics;
    table_->Get(key, &saver, SaveValue);
  }

@ -452,7 +472,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
  if (!found_final_value && merge_in_progress) {
    *s = Status::MergeInProgress("");
  }
-  PERF_TIMER_STOP(get_from_memtable_time);
  PERF_COUNTER_ADD(get_from_memtable_count, 1);
  return found_final_value;
 }
@ -487,8 +506,8 @@ void MemTable::Update(SequenceNumber seq,
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
-          uint32_t prev_size = prev_value.size();
-          uint32_t new_size = value.size();
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+          uint32_t new_size = static_cast<uint32_t>(value.size());

          // Update value, if new value size  <= previous value size
          if (new_size <= prev_size ) {
@ -517,8 +536,7 @@ void MemTable::Update(SequenceNumber seq,

 bool MemTable::UpdateCallback(SequenceNumber seq,
                              const Slice& key,
-                              const Slice& delta,
-                              const Options& options) {
+                              const Slice& delta) {
  LookupKey lkey(key, seq);
  Slice memkey = lkey.memtable_key();

@ -546,14 +564,14 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
      switch (static_cast<ValueType>(tag & 0xff)) {
        case kTypeValue: {
          Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
-          uint32_t  prev_size = prev_value.size();
+          uint32_t prev_size = static_cast<uint32_t>(prev_value.size());

          char* prev_buffer = const_cast<char*>(prev_value.data());
          uint32_t new_prev_size = prev_size;

          std::string str_value;
          WriteLock wl(GetLock(lkey.user_key()));
-          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
+          auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
                                                   delta, &str_value);
          if (status == UpdateStatus::UPDATED_INPLACE) {
            // Value already updated by callback.
@ -567,12 +585,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
                memcpy(p, prev_buffer, new_prev_size);
              }
            }
-            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATED) {
            Add(seq, kTypeValue, key, Slice(str_value));
-            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATE_FAILED) {
--- a/db/memtable.h
+++ b/db/memtable.h
@ -10,21 +10,48 @@
 #pragma once
 #include <string>
 #include <memory>
+#include <functional>
 #include <deque>
+#include <vector>
 #include "db/dbformat.h"
 #include "db/skiplist.h"
 #include "db/version_edit.h"
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/immutable_options.h"
+#include "db/memtable_allocator.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
+#include "util/mutable_cf_options.h"

 namespace rocksdb {

-class Arena;
 class Mutex;
 class MemTableIterator;
 class MergeContext;
+class WriteBuffer;
+
+struct MemTableOptions {
+  explicit MemTableOptions(
+      const ImmutableCFOptions& ioptions,
+      const MutableCFOptions& mutable_cf_options);
+  size_t write_buffer_size;
+  size_t arena_block_size;
+  uint32_t memtable_prefix_bloom_bits;
+  uint32_t memtable_prefix_bloom_probes;
+  size_t memtable_prefix_bloom_huge_page_tlb_size;
+  bool inplace_update_support;
+  size_t inplace_update_num_locks;
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+  size_t max_successive_merges;
+  bool filter_deletes;
+  Statistics* statistics;
+  MergeOperator* merge_operator;
+  Logger* info_log;
+};

 class MemTable {
 public:
@ -40,7 +67,9 @@ class MemTable {
  // MemTables are reference counted.  The initial reference count
  // is zero and the caller must call Ref() at least once.
  explicit MemTable(const InternalKeyComparator& comparator,
-                    const Options& options);
+                    const ImmutableCFOptions& ioptions,
+                    const MutableCFOptions& mutable_cf_options,
+                    WriteBuffer* write_buffer);

  ~MemTable();

@ -67,7 +96,11 @@ class MemTable {

  // This method heuristically determines if the memtable should continue to
  // host more data.
-  bool ShouldFlush() const { return should_flush_; }
+  bool ShouldScheduleFlush() const {
+    return flush_scheduled_ == false && should_flush_;
+  }
+
+  void MarkFlushScheduled() { flush_scheduled_ = true; }

  // Return an iterator that yields the contents of the memtable.
  //
@ -81,8 +114,7 @@ class MemTable {
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
-  Iterator* NewIterator(const ReadOptions& options,
-                        Arena* arena = nullptr);
+  Iterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
@ -100,7 +132,7 @@ class MemTable {
  //   store MergeInProgress in s, and return false.
  // Else, return false.
  bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);

  // Attempts to update the new_value inplace, else does normal Add
  // Pseudocode
@ -124,8 +156,7 @@ class MemTable {
  //   else return false
  bool UpdateCallback(SequenceNumber seq,
                      const Slice& key,
-                      const Slice& delta,
-                      const Options& options);
+                      const Slice& delta);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
@ -138,6 +169,9 @@ class MemTable {
  // Returns the edits area that is needed for flushing the memtable
  VersionEdit* GetEdits() { return &edit_; }

+  // Returns if there is no entry inserted to the mem table.
+  bool IsEmpty() const { return first_seqno_ == 0; }
+
  // Returns the sequence number of the first element that was inserted
  // into the memtable
  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
@ -151,7 +185,10 @@ class MemTable {
  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }

  // Notify the underlying storage that no more items will be added
-  void MarkImmutable() { table_->MarkReadOnly(); }
+  void MarkImmutable() {
+    table_->MarkReadOnly();
+    allocator_.DoneAllocating();
+  }

  // return true if the current MemTableRep supports merge operator.
  bool IsMergeOperatorSupported() const {
@ -159,7 +196,10 @@ class MemTable {
  }

  // return true if the current MemTableRep supports snapshots.
-  bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); }
+  // inplace update prevents snapshots,
+  bool IsSnapshotSupported() const {
+    return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+  }

  // Get the lock associated for the key
  port::RWMutex* GetLock(const Slice& key);
@ -168,10 +208,10 @@ class MemTable {
    return comparator_.comparator;
  }

-  const Arena& TEST_GetArena() const { return arena_; }
+  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }

 private:
-  // Dynamically check if we can add more incoming entries.
+  // Dynamically check if we can add more incoming entries
  bool ShouldFlushNow() const;

  friend class MemTableIterator;
@ -179,10 +219,11 @@ class MemTable {
  friend class MemTableList;

  KeyComparator comparator_;
+  const MemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
-  const size_t kWriteBufferSize;
  Arena arena_;
+  MemTableAllocator allocator_;
  unique_ptr<MemTableRep> table_;

  uint64_t num_entries_;
@ -214,6 +255,9 @@ class MemTable {

  // a flag indicating if a memtable has met the criteria to flush
  bool should_flush_;
+
+  // a flag indicating if flush has been scheduled
+  bool flush_scheduled_;
 };

 extern const char* EncodeKey(std::string* scratch, const Slice& target);
--- a/db/memtable_allocator.cc
+++ b/db/memtable_allocator.cc
@ -0,0 +1,52 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <assert.h>
+
+#include "db/memtable_allocator.h"
+#include "db/writebuffer.h"
+#include "util/arena.h"
+
+namespace rocksdb {
+
+MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer)
+    : arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) {
+}
+
+MemTableAllocator::~MemTableAllocator() {
+  DoneAllocating();
+}
+
+char* MemTableAllocator::Allocate(size_t bytes) {
+  assert(write_buffer_ != nullptr);
+  bytes_allocated_ += bytes;
+  write_buffer_->ReserveMem(bytes);
+  return arena_->Allocate(bytes);
+}
+
+char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size,
+                                         Logger* logger) {
+  assert(write_buffer_ != nullptr);
+  bytes_allocated_ += bytes;
+  write_buffer_->ReserveMem(bytes);
+  return arena_->AllocateAligned(bytes, huge_page_size, logger);
+}
+
+void MemTableAllocator::DoneAllocating() {
+  if (write_buffer_ != nullptr) {
+    write_buffer_->FreeMem(bytes_allocated_);
+    write_buffer_ = nullptr;
+  }
+}
+
+size_t MemTableAllocator::BlockSize() const {
+  return arena_->BlockSize();
+}
+
+}  // namespace rocksdb
--- a/db/memtable_allocator.h
+++ b/db/memtable_allocator.h
@ -0,0 +1,47 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This is used by the MemTable to allocate write buffer memory. It connects
+// to WriteBuffer so we can track and enforce overall write buffer limits.
+
+#pragma once
+#include "util/allocator.h"
+
+namespace rocksdb {
+
+class Arena;
+class Logger;
+class WriteBuffer;
+
+class MemTableAllocator : public Allocator {
+ public:
+  explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer);
+  ~MemTableAllocator();
+
+  // Allocator interface
+  char* Allocate(size_t bytes) override;
+  char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
+                        Logger* logger = nullptr) override;
+  size_t BlockSize() const override;
+
+  // Call when we're finished allocating memory so we can free it from
+  // the write buffer's limit.
+  void DoneAllocating();
+
+ private:
+  Arena* arena_;
+  WriteBuffer* write_buffer_;
+  size_t bytes_allocated_;
+
+  // No copying allowed
+  MemTableAllocator(const MemTableAllocator&);
+  void operator=(const MemTableAllocator&);
+};
+
+}  // namespace rocksdb
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@ -5,6 +5,11 @@
 //
 #include "db/memtable_list.h"

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <string>
 #include "rocksdb/db.h"
 #include "db/memtable.h"
@ -62,10 +67,9 @@ int MemTableList::size() const {
 // Return the most recent value found, if any.
 // Operands stores the list of merge operations to apply, so far.
 bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
-                              Status* s, MergeContext& merge_context,
-                              const Options& options) {
+                              Status* s, MergeContext* merge_context) {
  for (auto& memtable : memlist_) {
-    if (memtable->Get(key, value, s, merge_context, options)) {
+    if (memtable->Get(key, value, s, merge_context)) {
      return true;
    }
  }
@ -73,9 +77,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
 }

 void MemTableListVersion::AddIterators(const ReadOptions& options,
-                                       std::vector<Iterator*>* iterator_list) {
+                                       std::vector<Iterator*>* iterator_list,
+                                       Arena* arena) {
  for (auto& m : memlist_) {
-    iterator_list->push_back(m->NewIterator(options));
+    iterator_list->push_back(m->NewIterator(options, arena));
  }
 }

@ -114,7 +119,7 @@ void MemTableListVersion::Remove(MemTable* m) {
 bool MemTableList::IsFlushPending() const {
  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
-    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    assert(imm_flush_needed.load(std::memory_order_relaxed));
    return true;
  }
  return false;
@ -129,7 +134,7 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
      assert(!m->flush_completed_);
      num_flush_not_started_--;
      if (num_flush_not_started_ == 0) {
-        imm_flush_needed.Release_Store(nullptr);
+        imm_flush_needed.store(false, std::memory_order_release);
      }
      m->flush_in_progress_ = true;  // flushing will start very soon
      ret->push_back(m);
@ -139,8 +144,7 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
 }

 void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
-                                         uint64_t file_number,
-                                         FileNumToPathIdMap* pending_outputs) {
+                                         uint64_t file_number) {
  assert(!mems.empty());

  // If the flush was not successful, then just reset state.
@ -154,15 +158,14 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
    m->edit_.Clear();
    num_flush_not_started_++;
  }
-  pending_outputs->erase(file_number);
-  imm_flush_needed.Release_Store(reinterpret_cast<void *>(1));
+  imm_flush_needed.store(true, std::memory_order_release);
 }

 // Record a successful flush in the manifest file
 Status MemTableList::InstallMemtableFlushResults(
-    ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
-    port::Mutex* mu, Logger* info_log, uint64_t file_number,
-    FileNumToPathIdMap* pending_outputs, autovector<MemTable*>* to_delete,
+    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+    const autovector<MemTable*>& mems, VersionSet* vset, InstrumentedMutex* mu,
+    uint64_t file_number, autovector<MemTable*>* to_delete,
    Directory* db_directory, LogBuffer* log_buffer) {
  mu->AssertHeld();

@ -193,11 +196,11 @@ Status MemTableList::InstallMemtableFlushResults(
      break;
    }

-    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started",
-                cfd->GetName().c_str(), (unsigned long)m->file_number_);
+    LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started",
+                cfd->GetName().c_str(), m->file_number_);

    // this can release and reacquire the mutex.
-    s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
+    s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory);

    // we will be changing the version in the next code path,
    // so we better create a new one, since versions are immutable
@ -208,34 +211,26 @@ Status MemTableList::InstallMemtableFlushResults(
    uint64_t mem_id = 1;  // how many memtables has been flushed.
    do {
      if (s.ok()) { // commit new state
-        LogToBuffer(log_buffer,
-                    "[%s] Level-0 commit table #%lu: memtable #%lu done",
-                    cfd->GetName().c_str(), (unsigned long)m->file_number_,
-                    (unsigned long)mem_id);
+        LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
+                                ": memtable #%" PRIu64 " done",
+                    cfd->GetName().c_str(), m->file_number_, mem_id);
        current_->Remove(m);
        assert(m->file_number_ > 0);

-        // pending_outputs can be cleared only after the newly created file
-        // has been written to a committed version so that other concurrently
-        // executing compaction threads do not mistakenly assume that this
-        // file is not live.
-        pending_outputs->erase(m->file_number_);
        if (m->Unref() != nullptr) {
          to_delete->push_back(m);
        }
      } else {
        //commit failed. setup state so that we can flush again.
-        Log(info_log,
-            "Level-0 commit table #%lu: memtable #%lu failed",
-            (unsigned long)m->file_number_,
-            (unsigned long)mem_id);
+        LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
+                                ": memtable #%" PRIu64 " failed",
+                    m->file_number_, mem_id);
        m->flush_completed_ = false;
        m->flush_in_progress_ = false;
        m->edit_.Clear();
        num_flush_not_started_++;
-        pending_outputs->erase(m->file_number_);
        m->file_number_ = 0;
-        imm_flush_needed.Release_Store((void *)1);
+        imm_flush_needed.store(true, std::memory_order_release);
      }
      ++mem_id;
    } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
@ -258,17 +253,17 @@ void MemTableList::Add(MemTable* m) {
  m->MarkImmutable();
  num_flush_not_started_++;
  if (num_flush_not_started_ == 1) {
-    imm_flush_needed.Release_Store((void *)1);
+    imm_flush_needed.store(true, std::memory_order_release);
  }
 }

 // Returns an estimate of the number of bytes of data in use.
 size_t MemTableList::ApproximateMemoryUsage() {
-  size_t size = 0;
+  size_t total_size = 0;
  for (auto& memtable : current_->memlist_) {
-    size += memtable->ApproximateMemoryUsage();
+    total_size += memtable->ApproximateMemoryUsage();
  }
-  return size;
+  return total_size;
 }

 void MemTableList::InstallNewVersion() {
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@ -22,13 +22,14 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "util/autovector.h"
+#include "util/instrumented_mutex.h"
 #include "util/log_buffer.h"

 namespace rocksdb {

 class ColumnFamilyData;
 class InternalKeyComparator;
-class Mutex;
+class InstrumentedMutex;
 class MergeIteratorBuilder;

 // keeps a list of immutable memtables in a vector. the list is immutable
@ -46,10 +47,10 @@ class MemTableListVersion {
  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
  bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);

  void AddIterators(const ReadOptions& options,
-                    std::vector<Iterator*>* iterator_list);
+                    std::vector<Iterator*>* iterator_list, Arena* arena);

  void AddIterators(const ReadOptions& options,
                    MergeIteratorBuilder* merge_iter_builder);
@ -78,12 +79,12 @@ class MemTableList {
 public:
  // A list of memtables.
  explicit MemTableList(int min_write_buffer_number_to_merge)
-      : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+      : imm_flush_needed(false),
+        min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
        current_(new MemTableListVersion()),
        num_flush_not_started_(0),
        commit_in_progress_(false),
        flush_requested_(false) {
-    imm_flush_needed.Release_Store(nullptr);
    current_->Ref();
  }
  ~MemTableList() {}
@ -92,7 +93,7 @@ class MemTableList {

  // so that background threads can detect non-nullptr pointer to
  // determine whether there is anything more to start flushing.
-  port::AtomicPointer imm_flush_needed;
+  std::atomic<bool> imm_flush_needed;

  // Returns the total number of memtables in the list
  int size() const;
@ -108,14 +109,13 @@ class MemTableList {
  // Reset status of the given memtable list back to pending state so that
  // they can get picked up again on the next round of flush.
  void RollbackMemtableFlush(const autovector<MemTable*>& mems,
-                             uint64_t file_number,
-                             FileNumToPathIdMap* pending_outputs);
+                             uint64_t file_number);

  // Commit a successful flush in the manifest file
  Status InstallMemtableFlushResults(
-      ColumnFamilyData* cfd, const autovector<MemTable*>& m, VersionSet* vset,
-      port::Mutex* mu, Logger* info_log, uint64_t file_number,
-      FileNumToPathIdMap* pending_outputs, autovector<MemTable*>* to_delete,
+      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+      const autovector<MemTable*>& m, VersionSet* vset, InstrumentedMutex* mu,
+      uint64_t file_number, autovector<MemTable*>* to_delete,
      Directory* db_directory, LogBuffer* log_buffer);

  // New memtables are inserted at the front of the list.
--- a/db/memtablerep_bench.cc
+++ b/db/memtablerep_bench.cc
@ -0,0 +1,696 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#define __STDC_FORMAT_MACROS
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <gflags/gflags.h>
+
+#include <atomic>
+#include <iostream>
+#include <memory>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/writebuffer.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
+#include "util/arena.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/testutil.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::RegisterFlagValidator;
+using GFLAGS::SetUsageMessage;
+
+DEFINE_string(benchmarks, "fillrandom",
+              "Comma-separated list of benchmarks to run. Options:\n"
+              "\tfillrandom             -- write N random values\n"
+              "\tfillseq                -- write N values in sequential order\n"
+              "\treadrandom             -- read N values in random order\n"
+              "\treadseq                -- scan the DB\n"
+              "\treadwrite              -- 1 thread writes while N - 1 threads "
+              "do random\n"
+              "\t                          reads\n"
+              "\tseqreadwrite           -- 1 thread writes while N - 1 threads "
+              "do scans\n");
+
+DEFINE_string(memtablerep, "skiplist",
+              "Which implementation of memtablerep to use. See "
+              "include/memtablerep.h for\n"
+              "  more details. Options:\n"
+              "\tskiplist            -- backed by a skiplist\n"
+              "\tvector              -- backed by an std::vector\n"
+              "\thashskiplist        -- backed by a hash skip list\n"
+              "\thashlinklist        -- backed by a hash linked list\n"
+              "\tcuckoo              -- backed by a cuckoo hash table");
+
+DEFINE_int64(bucket_count, 1000000,
+             "bucket_count parameter to pass into NewHashSkiplistRepFactory or "
+             "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    hashskiplist_height, 4,
+    "skiplist_height parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    hashskiplist_branching_factor, 4,
+    "branching_factor parameter to pass into NewHashSkiplistRepFactory");
+
+DEFINE_int32(
+    huge_page_tlb_size, 0,
+    "huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int32(bucket_entries_logging_threshold, 4096,
+             "bucket_entries_logging_threshold parameter to pass into "
+             "NewHashLinkListRepFactory");
+
+DEFINE_bool(if_log_bucket_dist_when_flash, true,
+            "if_log_bucket_dist_when_flash parameter to pass into "
+            "NewHashLinkListRepFactory");
+
+DEFINE_int32(
+    threshold_use_skiplist, 256,
+    "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory");
+
+DEFINE_int64(
+    write_buffer_size, 256,
+    "write_buffer_size parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int64(
+    average_data_size, 64,
+    "average_data_size parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int64(
+    hash_function_count, 4,
+    "hash_function_count parameter to pass into NewHashCuckooRepFactory");
+
+DEFINE_int32(
+    num_threads, 1,
+    "Number of concurrent threads to run. If the benchmark includes writes,\n"
+    "then at most one thread will be a writer");
+
+DEFINE_int32(num_operations, 1000000,
+             "Number of operations to do for write and random read benchmarks");
+
+DEFINE_int32(num_scans, 10,
+             "Number of times for each thread to scan the memtablerep for "
+             "sequential read "
+             "benchmarks");
+
+DEFINE_int32(item_size, 100, "Number of bytes each item should be");
+
+DEFINE_int32(prefix_length, 8,
+             "Prefix length to pass into NewFixedPrefixTransform");
+
+/* VectorRep settings */
+DEFINE_int64(vectorrep_count, 0,
+             "Number of entries to reserve on VectorRep initialization");
+
+DEFINE_int64(seed, 0,
+             "Seed base for random number generators. "
+             "When 0 it is deterministic.");
+
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+namespace rocksdb {
+
+namespace {
+struct CallbackVerifyArgs {
+  bool found;
+  LookupKey* key;
+  MemTableRep* table;
+  InternalKeyComparator* comparator;
+};
+}  // namespace
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  unsigned int pos_;
+
+ public:
+  RandomGenerator() {
+    Random rnd(301);
+    auto size = (unsigned)std::max(1048576, FLAGS_item_size);
+    test::RandomString(&rnd, size, &data_);
+    pos_ = 0;
+  }
+
+  Slice Generate(unsigned int len) {
+    assert(len <= data_.size());
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM };
+
+class KeyGenerator {
+ public:
+  KeyGenerator(Random64* rand, WriteMode mode, uint64_t num)
+      : rand_(rand), mode_(mode), num_(num), next_(0) {
+    if (mode_ == UNIQUE_RANDOM) {
+      // NOTE: if memory consumption of this approach becomes a concern,
+      // we can either break it into pieces and only random shuffle a section
+      // each time. Alternatively, use a bit map implementation
+      // (https://reviews.facebook.net/differential/diff/54627/)
+      values_.resize(num_);
+      for (uint64_t i = 0; i < num_; ++i) {
+        values_[i] = i;
+      }
+      std::shuffle(
+          values_.begin(), values_.end(),
+          std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
+    }
+  }
+
+  uint64_t Next() {
+    switch (mode_) {
+      case SEQUENTIAL:
+        return next_++;
+      case RANDOM:
+        return rand_->Next() % num_;
+      case UNIQUE_RANDOM:
+        return values_[next_++];
+    }
+    assert(false);
+    return std::numeric_limits<uint64_t>::max();
+  }
+
+ private:
+  Random64* rand_;
+  WriteMode mode_;
+  const uint64_t num_;
+  uint64_t next_;
+  std::vector<uint64_t> values_;
+};
+
+class BenchmarkThread {
+ public:
+  explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                           uint64_t* bytes_written, uint64_t* bytes_read,
+                           uint64_t* sequence, uint64_t num_ops,
+                           uint64_t* read_hits)
+      : table_(table),
+        key_gen_(key_gen),
+        bytes_written_(bytes_written),
+        bytes_read_(bytes_read),
+        sequence_(sequence),
+        num_ops_(num_ops),
+        read_hits_(read_hits) {}
+
+  virtual void operator()() = 0;
+  virtual ~BenchmarkThread() {}
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* bytes_written_;
+  uint64_t* bytes_read_;
+  uint64_t* sequence_;
+  uint64_t num_ops_;
+  uint64_t* read_hits_;
+  RandomGenerator generator_;
+};
+
+class FillBenchmarkThread : public BenchmarkThread {
+ public:
+  FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void FillOne() {
+    char* buf = nullptr;
+    auto internal_key_size = 16;
+    auto encoded_len =
+        FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size;
+    KeyHandle handle = table_->Allocate(encoded_len, &buf);
+    assert(buf != nullptr);
+    char* p = EncodeVarint32(buf, internal_key_size);
+    auto key = key_gen_->Next();
+    EncodeFixed64(p, key);
+    p += 8;
+    EncodeFixed64(p, ++(*sequence_));
+    p += 8;
+    Slice bytes = generator_.Generate(FLAGS_item_size);
+    memcpy(p, bytes.data(), FLAGS_item_size);
+    p += FLAGS_item_size;
+    assert(p == buf + encoded_len);
+    table_->Insert(handle);
+    *bytes_written_ += encoded_len;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      FillOne();
+    }
+  }
+};
+
+class ConcurrentFillBenchmarkThread : public FillBenchmarkThread {
+ public:
+  ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    // # of read threads will be total threads - write threads (always 1). Loop
+    // while all reads complete.
+    while ((*threads_done_).load() < (FLAGS_num_threads - 1)) {
+      FillOne();
+    }
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class ReadBenchmarkThread : public BenchmarkThread {
+ public:
+  ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                      uint64_t* bytes_written, uint64_t* bytes_read,
+                      uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  static bool callback(void* arg, const char* entry) {
+    CallbackVerifyArgs* callback_args = static_cast<CallbackVerifyArgs*>(arg);
+    assert(callback_args != nullptr);
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if ((callback_args->comparator)->user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8), callback_args->key->user_key()) ==
+        0) {
+      callback_args->found = true;
+    }
+    return false;
+  }
+
+  void ReadOne() {
+    std::string user_key;
+    auto key = key_gen_->Next();
+    PutFixed64(&user_key, key);
+    LookupKey lookup_key(user_key, *sequence_);
+    InternalKeyComparator internal_key_comp(BytewiseComparator());
+    CallbackVerifyArgs verify_args;
+    verify_args.found = false;
+    verify_args.key = &lookup_key;
+    verify_args.table = table_;
+    verify_args.comparator = &internal_key_comp;
+    table_->Get(lookup_key, &verify_args, callback);
+    if (verify_args.found) {
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+      ++*read_hits_;
+    }
+  }
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+  }
+};
+
+class SeqReadBenchmarkThread : public BenchmarkThread {
+ public:
+  SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* bytes_written, uint64_t* bytes_read,
+                         uint64_t* sequence, uint64_t num_ops,
+                         uint64_t* read_hits)
+      : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                        num_ops, read_hits) {}
+
+  void ReadOneSeq() {
+    std::unique_ptr<MemTableRep::Iterator> iter(table_->GetIterator());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      // pretend to read the value
+      *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size;
+    }
+    ++*read_hits_;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      { ReadOneSeq(); }
+    }
+  }
+};
+
+class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread {
+ public:
+  ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                uint64_t* bytes_written, uint64_t* bytes_read,
+                                uint64_t* sequence, uint64_t num_ops,
+                                uint64_t* read_hits,
+                                std::atomic_int* threads_done)
+      : ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence,
+                            num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOne();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread {
+ public:
+  SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen,
+                                   uint64_t* bytes_written,
+                                   uint64_t* bytes_read, uint64_t* sequence,
+                                   uint64_t num_ops, uint64_t* read_hits,
+                                   std::atomic_int* threads_done)
+      : SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read,
+                               sequence, num_ops, read_hits) {
+    threads_done_ = threads_done;
+  }
+
+  void operator()() override {
+    for (unsigned int i = 0; i < num_ops_; ++i) {
+      ReadOneSeq();
+    }
+    ++*threads_done_;
+  }
+
+ private:
+  std::atomic_int* threads_done_;
+};
+
+class Benchmark {
+ public:
+  explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen,
+                     uint64_t* sequence, uint32_t num_threads)
+      : table_(table),
+        key_gen_(key_gen),
+        sequence_(sequence),
+        num_threads_(num_threads) {}
+
+  virtual ~Benchmark() {}
+  virtual void Run() {
+    std::cout << "Number of threads: " << num_threads_ << std::endl;
+    std::vector<std::thread> threads;
+    uint64_t bytes_written = 0;
+    uint64_t bytes_read = 0;
+    uint64_t read_hits = 0;
+    StopWatchNano timer(Env::Default(), true);
+    RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits);
+    auto elapsed_time = static_cast<double>(timer.ElapsedNanos() / 1000);
+    std::cout << "Elapsed time: " << static_cast<int>(elapsed_time) << " us"
+              << std::endl;
+
+    if (bytes_written > 0) {
+      auto MiB_written = static_cast<double>(bytes_written) / (1 << 20);
+      auto write_throughput = MiB_written / (elapsed_time / 1000000);
+      std::cout << "Total bytes written: " << MiB_written << " MiB"
+                << std::endl;
+      std::cout << "Write throughput: " << write_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_write_ops_per_thread_;
+      std::cout << "write us/op: " << us_per_op << std::endl;
+    }
+    if (bytes_read > 0) {
+      auto MiB_read = static_cast<double>(bytes_read) / (1 << 20);
+      auto read_throughput = MiB_read / (elapsed_time / 1000000);
+      std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl;
+      std::cout << "Read throughput: " << read_throughput << " MiB/s"
+                << std::endl;
+      auto us_per_op = elapsed_time / num_read_ops_per_thread_;
+      std::cout << "read us/op: " << us_per_op << std::endl;
+    }
+  }
+
+  virtual void RunThreads(std::vector<std::thread>* threads,
+                          uint64_t* bytes_written, uint64_t* bytes_read,
+                          bool write, uint64_t* read_hits) = 0;
+
+ protected:
+  MemTableRep* table_;
+  KeyGenerator* key_gen_;
+  uint64_t* sequence_;
+  uint64_t num_write_ops_per_thread_;
+  uint64_t num_read_ops_per_thread_;
+  const uint32_t num_threads_;
+};
+
+class FillBenchmark : public Benchmark {
+ public:
+  explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, 1) {
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                        num_write_ops_per_thread_, read_hits)();
+  }
+};
+
+class ReadBenchmark : public Benchmark {
+ public:
+  explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                         uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read,
+                              sequence_, num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+    std::cout << "read hit%: "
+              << (static_cast<double>(*read_hits) / FLAGS_num_operations) * 100
+              << std::endl;
+  }
+};
+
+class SeqReadBenchmark : public Benchmark {
+ public:
+  explicit SeqReadBenchmark(MemTableRep* table, uint64_t* sequence)
+      : Benchmark(table, nullptr, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ = FLAGS_num_scans;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(SeqReadBenchmarkThread(
+          table_, key_gen_, bytes_written, bytes_read, sequence_,
+          num_read_ops_per_thread_, read_hits));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+template <class ReadThreadType>
+class ReadWriteBenchmark : public Benchmark {
+ public:
+  explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen,
+                              uint64_t* sequence)
+      : Benchmark(table, key_gen, sequence, FLAGS_num_threads) {
+    num_read_ops_per_thread_ =
+        FLAGS_num_threads <= 1
+            ? 0
+            : (FLAGS_num_operations / (FLAGS_num_threads - 1));
+    num_write_ops_per_thread_ = FLAGS_num_operations;
+  }
+
+  void RunThreads(std::vector<std::thread>* threads, uint64_t* bytes_written,
+                  uint64_t* bytes_read, bool write,
+                  uint64_t* read_hits) override {
+    std::atomic_int threads_done;
+    threads_done.store(0);
+    threads->emplace_back(ConcurrentFillBenchmarkThread(
+        table_, key_gen_, bytes_written, bytes_read, sequence_,
+        num_write_ops_per_thread_, read_hits, &threads_done));
+    for (int i = 1; i < FLAGS_num_threads; ++i) {
+      threads->emplace_back(
+          ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_,
+                         num_read_ops_per_thread_, read_hits, &threads_done));
+    }
+    for (auto& thread : *threads) {
+      thread.join();
+    }
+  }
+};
+
+}  // namespace rocksdb
+
+void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+  fprintf(stdout,
+          "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
+#endif
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+}
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  PrintWarnings();
+
+  rocksdb::Options options;
+
+  std::unique_ptr<rocksdb::MemTableRepFactory> factory;
+  if (FLAGS_memtablerep == "skiplist") {
+    factory.reset(new rocksdb::SkipListFactory);
+  } else if (FLAGS_memtablerep == "vector") {
+    factory.reset(new rocksdb::VectorRepFactory);
+  } else if (FLAGS_memtablerep == "hashskiplist") {
+    factory.reset(rocksdb::NewHashSkipListRepFactory(
+        FLAGS_bucket_count, FLAGS_hashskiplist_height,
+        FLAGS_hashskiplist_branching_factor));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else if (FLAGS_memtablerep == "hashlinklist") {
+    factory.reset(rocksdb::NewHashLinkListRepFactory(
+        FLAGS_bucket_count, FLAGS_huge_page_tlb_size,
+        FLAGS_bucket_entries_logging_threshold,
+        FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else if (FLAGS_memtablerep == "cuckoo") {
+    factory.reset(rocksdb::NewHashCuckooRepFactory(
+        FLAGS_write_buffer_size, FLAGS_average_data_size,
+        static_cast<uint32_t>(FLAGS_hash_function_count)));
+    options.prefix_extractor.reset(
+        rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length));
+  } else {
+    fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str());
+    exit(1);
+  }
+
+  rocksdb::InternalKeyComparator internal_key_comp(
+      rocksdb::BytewiseComparator());
+  rocksdb::MemTable::KeyComparator key_comp(internal_key_comp);
+  rocksdb::Arena arena;
+  rocksdb::WriteBuffer wb(FLAGS_write_buffer_size);
+  rocksdb::MemTableAllocator memtable_allocator(&arena, &wb);
+  uint64_t sequence;
+  auto createMemtableRep = [&] {
+    sequence = 0;
+    return factory->CreateMemTableRep(key_comp, &memtable_allocator,
+                                      options.prefix_extractor.get(),
+                                      options.info_log.get());
+  };
+  std::unique_ptr<rocksdb::MemTableRep> memtablerep;
+  rocksdb::Random64 rng(FLAGS_seed);
+  const char* benchmarks = FLAGS_benchmarks.c_str();
+  while (benchmarks != nullptr) {
+    std::unique_ptr<rocksdb::KeyGenerator> key_gen;
+    const char* sep = strchr(benchmarks, ',');
+    rocksdb::Slice name;
+    if (sep == nullptr) {
+      name = benchmarks;
+      benchmarks = nullptr;
+    } else {
+      name = rocksdb::Slice(benchmarks, sep - benchmarks);
+      benchmarks = sep + 1;
+    }
+    std::unique_ptr<rocksdb::Benchmark> benchmark;
+    if (name == rocksdb::Slice("fillseq")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("fillrandom")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::UNIQUE_RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("readrandom")) {
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadBenchmark(memtablerep.get(),
+                                                 key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("readseq")) {
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL,
+                                              FLAGS_num_operations));
+      benchmark.reset(
+          new rocksdb::SeqReadBenchmark(memtablerep.get(), &sequence));
+    } else if (name == rocksdb::Slice("readwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadWriteBenchmark<
+          rocksdb::ConcurrentReadBenchmarkThread>(memtablerep.get(),
+                                                  key_gen.get(), &sequence));
+    } else if (name == rocksdb::Slice("seqreadwrite")) {
+      memtablerep.reset(createMemtableRep());
+      key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM,
+                                              FLAGS_num_operations));
+      benchmark.reset(new rocksdb::ReadWriteBenchmark<
+          rocksdb::SeqConcurrentReadBenchmarkThread>(memtablerep.get(),
+                                                     key_gen.get(), &sequence));
+    } else {
+      std::cout << "WARNING: skipping unknown benchmark '" << name.ToString()
+                << std::endl;
+      continue;
+    }
+    std::cout << "Running " << name.ToString() << std::endl;
+    benchmark->Run();
+  }
+
+  return 0;
+}
+
+#endif  // GFLAGS
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@ -85,9 +85,10 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
-        std::string& key = keys_.back();  // The original key encountered
+        std::string& original_key =
+            keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&key[0], key.size(),
+        UpdateInternalKey(&original_key[0], original_key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
@ -108,17 +109,17 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Success!
-      const Slice value = iter->value();
-      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
-                                                 operands_, &merge_result,
-                                                 logger_);
+      const Slice val = iter->value();
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, &val, operands_,
+                                                 &merge_result, logger_);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (success_) {
-        std::string& key = keys_.back();  // The original key encountered
+        std::string& original_key =
+            keys_.back();  // The original key encountered
        orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&key[0], key.size(),
+        UpdateInternalKey(&original_key[0], original_key.size(),
                          orig_ikey.sequence, orig_ikey.type);
        swap(operands_.back(), merge_result);
      } else {
@ -177,9 +178,9 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
                                               logger_);

    if (success_) {
-      std::string& key = keys_.back();  // The original key encountered
+      std::string& original_key = keys_.back();  // The original key encountered
      orig_ikey.type = kTypeValue;
-      UpdateInternalKey(&key[0], key.size(),
+      UpdateInternalKey(&original_key[0], original_key.size(),
                        orig_ikey.sequence, orig_ikey.type);

      // The final value() is always stored in operands_.back()
--- a/db/merge_test.cc
+++ b/db/merge_test.cc
@ -23,15 +23,11 @@ using namespace std;
 using namespace rocksdb;

 namespace {
-  int numMergeOperatorCalls;
-  void resetNumMergeOperatorCalls() {
-    numMergeOperatorCalls = 0;
-  }
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }

-  int num_partial_merge_calls;
-  void resetNumPartialMergeCalls() {
-    num_partial_merge_calls = 0;
-  }
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
 }

 class CountMergeOperator : public AssociativeMergeOperator {
@ -45,7 +41,7 @@ class CountMergeOperator : public AssociativeMergeOperator {
                     const Slice& value,
                     std::string* new_value,
                     Logger* logger) const override {
-    ++numMergeOperatorCalls;
+    ++num_merge_operator_calls;
    if (existing_value == nullptr) {
      new_value->assign(value.data(), value.size());
      return true;
@ -307,31 +303,31 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
  }
 }

-void testSuccessiveMerge(
-    Counters& counters, int max_num_merges, int num_merges) {
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+                         size_t num_merges) {

  counters.assert_remove("z");
  uint64_t sum = 0;

-  for (int i = 1; i <= num_merges; ++i) {
+  for (size_t i = 1; i <= num_merges; ++i) {
    resetNumMergeOperatorCalls();
    counters.assert_add("z", i);
    sum += i;

    if (i % (max_num_merges + 1) == 0) {
-      assert(numMergeOperatorCalls == max_num_merges + 1);
+      assert(num_merge_operator_calls == max_num_merges + 1);
    } else {
-      assert(numMergeOperatorCalls == 0);
+      assert(num_merge_operator_calls == 0);
    }

    resetNumMergeOperatorCalls();
    assert(counters.assert_get("z") == sum);
-    assert(numMergeOperatorCalls == i % (max_num_merges + 1));
+    assert(num_merge_operator_calls == i % (max_num_merges + 1));
  }
 }

-void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
-                      int count) {
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+                      size_t min_merge, size_t count) {
  FlushOptions o;
  o.wait = true;

@ -339,7 +335,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
  //              operands exceeds the threshold.
  uint64_t tmp_sum = 0;
  resetNumPartialMergeCalls();
-  for (int i = 1; i <= count; i++) {
+  for (size_t i = 1; i <= count; i++) {
    counters->assert_add("b", i);
    tmp_sum += i;
  }
@ -348,7 +344,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
  ASSERT_EQ(tmp_sum, counters->assert_get("b"));
  if (count > max_merge) {
    // in this case, FullMerge should be called instead.
-    ASSERT_EQ(num_partial_merge_calls, 0);
+    ASSERT_EQ(num_partial_merge_calls, 0U);
  } else {
    // if count >= min_merge, then partial merge should be called once.
    ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
@ -358,20 +354,18 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge,
  resetNumPartialMergeCalls();
  tmp_sum = 0;
  db->Put(rocksdb::WriteOptions(), "c", "10");
-  for (int i = 1; i <= count; i++) {
+  for (size_t i = 1; i <= count; i++) {
    counters->assert_add("c", i);
    tmp_sum += i;
  }
  db->Flush(o);
  db->CompactRange(nullptr, nullptr);
  ASSERT_EQ(tmp_sum, counters->assert_get("c"));
-  ASSERT_EQ(num_partial_merge_calls, 0);
+  ASSERT_EQ(num_partial_merge_calls, 0U);
 }

-void testSingleBatchSuccessiveMerge(
-    DB* db,
-    int max_num_merges,
-    int num_merges) {
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+                                    size_t num_merges) {
  assert(num_merges > max_num_merges);

  Slice key("BatchSuccessiveMerge");
@ -380,7 +374,7 @@ void testSingleBatchSuccessiveMerge(

  // Create the batch
  WriteBatch batch;
-  for (int i = 0; i < num_merges; ++i) {
+  for (size_t i = 0; i < num_merges; ++i) {
    batch.Merge(key, merge_value_slice);
  }

@ -390,8 +384,9 @@ void testSingleBatchSuccessiveMerge(
    Status s = db->Write(WriteOptions(), &batch);
    assert(s.ok());
  }
-  assert(numMergeOperatorCalls ==
-      num_merges - (num_merges % (max_num_merges + 1)));
+  ASSERT_EQ(
+      num_merge_operator_calls,
+      static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));

  // Get the value
  resetNumMergeOperatorCalls();
@ -403,10 +398,18 @@ void testSingleBatchSuccessiveMerge(
  assert(get_value_str.size() == sizeof(uint64_t));
  uint64_t get_value = DecodeFixed64(&get_value_str[0]);
  ASSERT_EQ(get_value, num_merges * merge_value);
-  ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1)));
+  ASSERT_EQ(num_merge_operator_calls,
+            static_cast<size_t>((num_merges % (max_num_merges + 1))));
 }

 void runTest(int argc, const string& dbname, const bool use_ttl = false) {
+  bool compact = false;
+  if (argc > 1) {
+    compact = true;
+    cout << "Turn on Compaction\n";
+  }
+
+  {
    auto db = OpenDb(dbname, use_ttl);

    {
@ -415,20 +418,14 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
      testCounters(counters, db.get(), true);
    }

-  bool compact = false;
-  if (argc > 1) {
-    compact = true;
-    cout << "Turn on Compaction\n";
-  }
-
    {
      cout << "Test merge-based counters... \n";
      MergeBasedCounters counters(db, 0);
      testCounters(counters, db.get(), compact);
    }
+  }

  DestroyDB(dbname, Options());
-  db.reset();

  {
    cout << "Test merge in memtable... \n";
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@ -6,7 +6,6 @@
 #include <algorithm>
 #include <iostream>
 #include <vector>
-#include "/usr/include/valgrind/callgrind.h"

 #include "rocksdb/db.h"
 #include "rocksdb/perf_context.h"
@ -15,6 +14,8 @@
 #include "util/histogram.h"
 #include "util/stop_watch.h"
 #include "util/testharness.h"
+#include "util/thread_status_util.h"
+#include "util/string_util.h"


 bool FLAGS_random_key = false;
@ -29,7 +30,7 @@ const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";

 namespace rocksdb {

-std::shared_ptr<DB> OpenDb() {
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
    DB* db;
    Options options;
    options.create_if_missing = true;
@ -39,12 +40,21 @@ std::shared_ptr<DB> OpenDb() {
      FLAGS_min_write_buffer_number_to_merge;

    if (FLAGS_use_set_based_memetable) {
-      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
-      options.memtable_factory.reset(
-          NewHashSkipListRepFactory(prefix_extractor));
+#ifndef ROCKSDB_LITE
+      options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0));
+      options.memtable_factory.reset(NewHashSkipListRepFactory());
+#else
+      fprintf(stderr, "Prefix hash is not supported in lite mode\n");
+      exit(1);
+#endif  // ROCKSDB_LITE
    }

-    Status s = DB::Open(options, kDbName,  &db);
+    Status s;
+    if (!read_only) {
+      s = DB::Open(options, kDbName, &db);
+    } else {
+      s = DB::OpenForReadOnly(options, kDbName, &db);
+    }
    ASSERT_OK(s);
    return std::shared_ptr<DB>(db);
 }
@ -58,25 +68,26 @@ TEST(PerfContextTest, SeekIntoDeletion) {
  ReadOptions read_options;

  for (int i = 0; i < FLAGS_total_keys; ++i) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);

    db->Put(write_options, key, value);
  }

  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);
    db->Delete(write_options, key);
  }

  HistogramImpl hist_get;
  HistogramImpl hist_get_time;
  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);
    std::string value;

    perf_context.Reset();
-    StopWatchNano timer(Env::Default(), true);
+    StopWatchNano timer(Env::Default());
+    timer.Start();
    auto status = db->Get(read_options, key, &value);
    auto elapsed_nanos = timer.ElapsedNanos();
    ASSERT_TRUE(status.IsNotFound());
@ -84,9 +95,10 @@ TEST(PerfContextTest, SeekIntoDeletion) {
    hist_get_time.Add(elapsed_nanos);
  }

-  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
+  std::cout << "Get user key comparison: \n" << hist_get.ToString()
            << "Get time: \n" << hist_get_time.ToString();

+  {
    HistogramImpl hist_seek_to_first;
    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));

@ -96,15 +108,19 @@ TEST(PerfContextTest, SeekIntoDeletion) {
    hist_seek_to_first.Add(perf_context.user_key_comparison_count);
    auto elapsed_nanos = timer.ElapsedNanos();

-  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
-            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
-            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+    std::cout << "SeekToFirst uesr key comparison: \n"
+              << hist_seek_to_first.ToString()
+              << "ikey skipped: " << perf_context.internal_key_skipped_count
+              << "\n"
+              << "idelete skipped: "
+              << perf_context.internal_delete_skipped_count << "\n"
              << "elapsed: " << elapsed_nanos << "\n";
+  }

  HistogramImpl hist_seek;
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
-    std::string key = "k" + std::to_string(i);
+    std::string key = "k" + ToString(i);

    perf_context.Reset();
    StopWatchNano timer(Env::Default(), true);
@ -149,11 +165,12 @@ TEST(PerfContextTest, StopWatchNanoOverhead) {
 TEST(PerfContextTest, StopWatchOverhead) {
  // profile the timer cost by itself!
  const int kTotalIterations = 1000000;
+  uint64_t elapsed = 0;
  std::vector<uint64_t> timings(kTotalIterations);

-  StopWatch timer(Env::Default());
+  StopWatch timer(Env::Default(), nullptr, 0, &elapsed);
  for (auto& timing : timings) {
-    timing = timer.ElapsedMicros();
+    timing = elapsed;
  }

  HistogramImpl histogram;
@ -166,7 +183,7 @@ TEST(PerfContextTest, StopWatchOverhead) {
  std::cout << histogram.ToString();
 }

-void ProfileKeyComparison() {
+void ProfileQueries(bool enabled_time = false) {
  DestroyDB(kDbName, Options());    // Start this test with a fresh DB

  auto db = OpenDb();
@ -175,74 +192,248 @@ void ProfileKeyComparison() {
  ReadOptions read_options;

  HistogramImpl hist_put;
+
  HistogramImpl hist_get;
  HistogramImpl hist_get_snapshot;
  HistogramImpl hist_get_memtable;
+  HistogramImpl hist_get_files;
  HistogramImpl hist_get_post_process;
  HistogramImpl hist_num_memtable_checked;
+
+  HistogramImpl hist_mget;
+  HistogramImpl hist_mget_snapshot;
+  HistogramImpl hist_mget_memtable;
+  HistogramImpl hist_mget_files;
+  HistogramImpl hist_mget_post_process;
+  HistogramImpl hist_mget_num_memtable_checked;
+
  HistogramImpl hist_write_pre_post;
  HistogramImpl hist_write_wal_time;
  HistogramImpl hist_write_memtable_time;

+  uint64_t total_db_mutex_nanos = 0;
+
  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";

  std::vector<int> keys;
+  const int kFlushFlag = -1;
  for (int i = 0; i < FLAGS_total_keys; ++i) {
    keys.push_back(i);
+    if (i == FLAGS_total_keys / 2) {
+      // Issuing a flush in the middle.
+      keys.push_back(kFlushFlag);
+    }
  }

  if (FLAGS_random_key) {
    std::random_shuffle(keys.begin(), keys.end());
  }
-
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+  int num_mutex_waited = 0;
  for (const int i : keys) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    if (i == kFlushFlag) {
+      FlushOptions fo;
+      db->Flush(fo);
+      continue;
+    }
+
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<std::string> values;

    perf_context.Reset();
    db->Put(write_options, key, value);
+    if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+      ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+    }
    hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time);
    hist_write_wal_time.Add(perf_context.write_wal_time);
    hist_write_memtable_time.Add(perf_context.write_memtable_time);
    hist_put.Add(perf_context.user_key_comparison_count);
+    total_db_mutex_nanos += perf_context.db_mutex_lock_nanos;
+  }
+#ifndef NDEBUG
+  ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+  for (const int i : keys) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;

    perf_context.Reset();
    db->Get(read_options, key, &value);
    hist_get_snapshot.Add(perf_context.get_snapshot_time);
    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_get_files.Add(perf_context.get_from_output_files_time);
    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
    hist_get_post_process.Add(perf_context.get_post_process_time);
    hist_get.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(perf_context.get_snapshot_time);
+    hist_mget_memtable.Add(perf_context.get_from_memtable_time);
+    hist_mget_files.Add(perf_context.get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_mget_post_process.Add(perf_context.get_post_process_time);
+    hist_mget.Add(perf_context.user_key_comparison_count);
  }

  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
-            << "Get uesr key comparison: \n" << hist_get.ToString();
+            << "Get uesr key comparison: \n" << hist_get.ToString()
+            << "MultiGet uesr key comparison: \n" << hist_get.ToString();
  std::cout << "Put(): Pre and Post Process Time: \n"
            << hist_write_pre_post.ToString()
            << " Writing WAL time: \n"
            << hist_write_wal_time.ToString() << "\n"
            << " Writing Mem Table time: \n"
-            << hist_write_memtable_time.ToString() << "\n";
+            << hist_write_memtable_time.ToString() << "\n"
+            << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n";
+
+  std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_get_memtable.ToString() << "\n"
+            << " Time to get value from output files: \n"
+            << hist_get_files.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n" << hist_get_post_process.ToString()
+            << "\n";

-  std::cout << "Get(): Time to get snapshot: \n"
+  std::cout << "MultiGet(): Time to get snapshot: \n"
+            << hist_mget_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_mget_memtable.ToString() << "\n"
+            << " Time to get value from output files: \n"
+            << hist_mget_files.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_mget_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n" << hist_mget_post_process.ToString()
+            << "\n";
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_snapshot.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_get_post_process.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+#ifndef NDEBUG
+    ASSERT_GT(total_db_mutex_nanos, 2000U);
+#endif
+  }
+
+  db.reset();
+  db = OpenDb(true);
+
+  hist_get.Clear();
+  hist_get_snapshot.Clear();
+  hist_get_memtable.Clear();
+  hist_get_files.Clear();
+  hist_get_post_process.Clear();
+  hist_num_memtable_checked.Clear();
+
+  hist_mget.Clear();
+  hist_mget_snapshot.Clear();
+  hist_mget_memtable.Clear();
+  hist_mget_files.Clear();
+  hist_mget_post_process.Clear();
+  hist_mget_num_memtable_checked.Clear();
+
+  for (const int i : keys) {
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);
+
+    std::vector<Slice> multiget_keys = {Slice(key)};
+    std::vector<std::string> values;
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get_snapshot.Add(perf_context.get_snapshot_time);
+    hist_get_memtable.Add(perf_context.get_from_memtable_time);
+    hist_get_files.Add(perf_context.get_from_output_files_time);
+    hist_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_get_post_process.Add(perf_context.get_post_process_time);
+    hist_get.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->MultiGet(read_options, multiget_keys, &values);
+    hist_mget_snapshot.Add(perf_context.get_snapshot_time);
+    hist_mget_memtable.Add(perf_context.get_from_memtable_time);
+    hist_mget_files.Add(perf_context.get_from_output_files_time);
+    hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count);
+    hist_mget_post_process.Add(perf_context.get_post_process_time);
+    hist_mget.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString()
+            << "ReadOnly MultiGet uesr key comparison: \n"
+            << hist_mget.ToString();
+
+  std::cout << "ReadOnly Get(): Time to get snapshot: \n"
            << hist_get_snapshot.ToString()
            << " Time to get value from memtables: \n"
            << hist_get_memtable.ToString() << "\n"
+            << " Time to get value from output files: \n"
+            << hist_get_files.ToString() << "\n"
            << " Number of memtables checked: \n"
            << hist_num_memtable_checked.ToString() << "\n"
-            << " Time to post process: \n"
-            << hist_get_post_process.ToString() << "\n";
+            << " Time to post process: \n" << hist_get_post_process.ToString()
+            << "\n";
+
+  std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+            << hist_mget_snapshot.ToString()
+            << " Time to get value from memtables: \n"
+            << hist_mget_memtable.ToString() << "\n"
+            << " Time to get value from output files: \n"
+            << hist_mget_files.ToString() << "\n"
+            << " Number of memtables checked: \n"
+            << hist_mget_num_memtable_checked.ToString() << "\n"
+            << " Time to post process: \n" << hist_mget_post_process.ToString()
+            << "\n";
+
+  if (enabled_time) {
+    ASSERT_GT(hist_get.Average(), 0);
+    ASSERT_GT(hist_get_memtable.Average(), 0);
+    ASSERT_GT(hist_get_files.Average(), 0);
+    ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+    // In read-only mode Get(), no super version operation is needed
+    ASSERT_EQ(hist_get_post_process.Average(), 0);
+    ASSERT_EQ(hist_get_snapshot.Average(), 0);
+
+    ASSERT_GT(hist_mget.Average(), 0);
+    ASSERT_GT(hist_mget_snapshot.Average(), 0);
+    ASSERT_GT(hist_mget_memtable.Average(), 0);
+    ASSERT_GT(hist_mget_files.Average(), 0);
+    ASSERT_GT(hist_mget_post_process.Average(), 0);
+    ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+  }
 }

 TEST(PerfContextTest, KeyComparisonCount) {
  SetPerfLevel(kEnableCount);
-  ProfileKeyComparison();
+  ProfileQueries();

  SetPerfLevel(kDisable);
-  ProfileKeyComparison();
+  ProfileQueries();

  SetPerfLevel(kEnableTime);
-  ProfileKeyComparison();
+  ProfileQueries(true);
 }

 // make perf_context_test
@ -281,8 +472,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
  SetPerfLevel(kEnableTime);
  StopWatchNano timer(Env::Default());
  for (const int i : keys) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);

    perf_context.Reset();
    timer.Start();
@ -301,8 +492,8 @@ TEST(PerfContextTest, SeekKeyComparison) {
  HistogramImpl hist_next;

  for (int i = 0; i < FLAGS_total_keys; ++i) {
-    std::string key = "k" + std::to_string(i);
-    std::string value = "v" + std::to_string(i);
+    std::string key = "k" + ToString(i);
+    std::string value = "v" + ToString(i);

    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
    perf_context.Reset();
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@ -158,7 +158,7 @@ class PlainTableDBTest {
  // Return spread of files per level
  std::string FilesPerLevel() {
    std::string result;
-    int last_non_zero_offset = 0;
+    size_t last_non_zero_offset = 0;
    for (int level = 0; level < db_->NumberLevels(); level++) {
      int f = NumTableFilesAtLevel(level);
      char buf[100];
@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber;

 class TestPlainTableReader : public PlainTableReader {
 public:
-  TestPlainTableReader(const EnvOptions& storage_options,
+  TestPlainTableReader(const EnvOptions& env_options,
                       const InternalKeyComparator& icomparator,
                       EncodingType encoding_type, uint64_t file_size,
                       int bloom_bits_per_key, double hash_table_ratio,
                       size_t index_sparseness,
                       const TableProperties* table_properties,
                       unique_ptr<RandomAccessFile>&& file,
-                       const Options& options, bool* expect_bloom_not_match,
+                       const ImmutableCFOptions& ioptions,
+                       bool* expect_bloom_not_match,
                       bool store_index_in_file)
-      : PlainTableReader(options, std::move(file), storage_options, icomparator,
+      : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
                         encoding_type, file_size, table_properties),
        expect_bloom_not_match_(expect_bloom_not_match) {
    Status s = MmapDataFile();
@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader {
          PlainTablePropertyNames::kBloomVersion);
      ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
      ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
-      if (options.bloom_locality > 0) {
+      if (ioptions.bloom_locality > 0) {
        auto num_blocks_ptr = props->user_collected_properties.find(
            PlainTablePropertyNames::kNumBloomBlocks);
        ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory {
        store_index_in_file_(options.store_index_in_file),
        expect_bloom_not_match_(expect_bloom_not_match) {}

-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& env_options,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override {
    TableProperties* props = nullptr;
    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
-                                 options.env, options.info_log.get(), &props);
+                                 ioptions.env, ioptions.info_log, &props);
    ASSERT_TRUE(s.ok());

    if (store_index_in_file_) {
      BlockHandle bloom_block_handle;
      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
-                        options.env, BloomBlockBuilder::kBloomBlock,
+                        ioptions.env, BloomBlockBuilder::kBloomBlock,
                        &bloom_block_handle);
      ASSERT_TRUE(s.ok());

      BlockHandle index_block_handle;
      s = FindMetaBlock(
-          file.get(), file_size, kPlainTableMagicNumber, options.env,
+          file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
          PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
      ASSERT_TRUE(s.ok());
    }
@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory {
        DecodeFixed32(encoding_type_prop->second.c_str()));

    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
-        soptions, internal_comparator, encoding_type, file_size,
+        env_options, internal_comparator, encoding_type, file_size,
        bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
-        std::move(file), options, expect_bloom_not_match_,
+        std::move(file), ioptions, expect_bloom_not_match_,
        store_index_in_file_));

    *table = std::move(new_reader);
@ -626,7 +628,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
  };

  for (size_t i = 0; i < 7; i++) {
-    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+    ASSERT_OK(Put(key_list[i], ToString(i)));
  }

  dbfull()->TEST_FlushMemTable();
@ -637,7 +639,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) {
  for (size_t i = 0; i < 7; i++) {
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(key_list[i], iter->key().ToString());
-    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    ASSERT_EQ(ToString(i), iter->value().ToString());
    iter->Next();
  }

@ -674,7 +676,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
      MakeLongKeyWithPrefix(26, '6')};

  for (size_t i = 0; i < 7; i++) {
-    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+    ASSERT_OK(Put(key_list[i], ToString(i)));
  }

  dbfull()->TEST_FlushMemTable();
@ -685,7 +687,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
  for (size_t i = 0; i < 7; i++) {
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(key_list[i], iter->key().ToString());
-    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    ASSERT_EQ(ToString(i), iter->value().ToString());
    iter->Next();
  }

@ -694,40 +696,12 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
  delete iter;
 }

-// A test comparator which compare two strings in this way:
-// (1) first compare prefix of 8 bytes in alphabet order,
-// (2) if two strings share the same prefix, sort the other part of the string
-//     in the reverse alphabet order.
-class SimpleSuffixReverseComparator : public Comparator {
- public:
-  SimpleSuffixReverseComparator() {}
-
-  virtual const char* Name() const { return "SimpleSuffixReverseComparator"; }
-
-  virtual int Compare(const Slice& a, const Slice& b) const {
-    Slice prefix_a = Slice(a.data(), 8);
-    Slice prefix_b = Slice(b.data(), 8);
-    int prefix_comp = prefix_a.compare(prefix_b);
-    if (prefix_comp != 0) {
-      return prefix_comp;
-    } else {
-      Slice suffix_a = Slice(a.data() + 8, a.size() - 8);
-      Slice suffix_b = Slice(b.data() + 8, b.size() - 8);
-      return -(suffix_a.compare(suffix_b));
-    }
-  }
-  virtual void FindShortestSeparator(std::string* start,
-                                     const Slice& limit) const {}
-
-  virtual void FindShortSuccessor(std::string* key) const {}
-};
-
 TEST(PlainTableDBTest, IteratorReverseSuffixComparator) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  // Set only one bucket to force bucket conflict.
  // Test index interval for the same prefix to be 1, 2 and 4
-  SimpleSuffixReverseComparator comp;
+  test::SimpleSuffixReverseComparator comp;
  options.comparator = &comp;
  DestroyAndReopen(&options);

@ -890,7 +864,7 @@ TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
    for (unsigned char i = 1; i <= 3; i++) {
      Options options = CurrentOptions();
      options.create_if_missing = true;
-      SimpleSuffixReverseComparator comp;
+      test::SimpleSuffixReverseComparator comp;
      options.comparator = &comp;
      // Set only one bucket to force bucket conflict.
      // Test index interval for the same prefix to be 1, 2 and 4
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@ -29,14 +29,14 @@ using GFLAGS::ParseCommandLineFlags;

 DEFINE_bool(trigger_deadlock, false,
            "issue delete in range scan to trigger PrefixHashMap deadlock");
-DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
 DEFINE_uint64(num_locks, 10001, "number of locks");
 DEFINE_bool(random_prefix, false, "randomize prefix");
 DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
 DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
 DEFINE_int64(write_buffer_size, 33554432, "");
-DEFINE_int64(max_write_buffer_number, 2, "");
-DEFINE_int64(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
 DEFINE_int32(skiplist_height, 4, "");
 DEFINE_int32(memtable_prefix_bloom_bits, 10000000, "");
 DEFINE_int32(memtable_prefix_bloom_probes, 10, "");
@ -52,7 +52,8 @@ struct TestKey {
  uint64_t prefix;
  uint64_t sorted;

-  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+  TestKey(uint64_t _prefix, uint64_t _sorted)
+      : prefix(_prefix), sorted(_sorted) {}
 };

 // return a slice backed by test_key
@ -441,7 +442,7 @@ TEST(PrefixTest, DynamicPrefixIterator) {
    for (auto prefix : prefixes) {
      TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
      Slice key = TestKeyToSlice(test_key);
-      std::string value = "v" + std::to_string(0);
+      std::string value = "v" + ToString(0);

      perf_context.Reset();
      StopWatchNano timer(Env::Default(), true);
--- a/db/repair.cc
+++ b/db/repair.cc
@ -31,7 +31,10 @@

 #ifndef ROCKSDB_LITE

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include "db/builder.h"
 #include "db/db_impl.h"
@ -42,10 +45,14 @@
 #include "db/memtable.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
+#include "db/writebuffer.h"
 #include "db/write_batch_internal.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+#include "util/scoped_arena_iterator.h"

 namespace rocksdb {

@ -58,6 +65,7 @@ class Repairer {
        env_(options.env),
        icmp_(options.comparator),
        options_(SanitizeOptions(dbname, &icmp_, options)),
+        ioptions_(options_),
        raw_table_cache_(
            // TableCache can be small since we expect each table to be opened
            // once.
@ -65,7 +73,7 @@ class Repairer {
                        options_.table_cache_remove_scan_count_limit)),
        next_file_number_(1) {
    table_cache_ =
-        new TableCache(&options_, storage_options_, raw_table_cache_.get());
+        new TableCache(ioptions_, env_options_, raw_table_cache_.get());
    edit_ = new VersionEdit();
  }

@ -87,7 +95,7 @@ class Repairer {
      for (size_t i = 0; i < tables_.size(); i++) {
        bytes += tables_[i].meta.fd.GetFileSize();
      }
-      Log(options_.info_log,
+      Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
          "**** Repaired rocksdb %s; "
          "recovered %zu files; %" PRIu64
          "bytes. "
@ -107,8 +115,9 @@ class Repairer {

  std::string const dbname_;
  Env* const env_;
-  InternalKeyComparator const icmp_;
-  Options const options_;
+  const InternalKeyComparator icmp_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
  std::shared_ptr<Cache> raw_table_cache_;
  TableCache* table_cache_;
  VersionEdit* edit_;
@ -118,7 +127,7 @@ class Repairer {
  std::vector<uint64_t> logs_;
  std::vector<TableInfo> tables_;
  uint64_t next_file_number_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;

  Status FindFiles() {
    std::vector<std::string> filenames;
@ -167,7 +176,7 @@ class Repairer {
      std::string logname = LogFileName(dbname_, logs_[i]);
      Status status = ConvertLogToTable(logs_[i]);
      if (!status.ok()) {
-        Log(options_.info_log,
+        Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
            "Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i],
            status.ToString().c_str());
      }
@ -182,7 +191,8 @@ class Repairer {
      uint64_t lognum;
      virtual void Corruption(size_t bytes, const Status& s) {
        // We print error messages for corruption, but continue repairing.
-        Log(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
+        Log(InfoLogLevel::ERROR_LEVEL, info_log,
+            "Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
            static_cast<int>(bytes), s.ToString().c_str());
      }
    };
@ -190,7 +200,7 @@ class Repairer {
    // Open the log file
    std::string logname = LogFileName(dbname_, log);
    unique_ptr<SequentialFile> lfile;
-    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
    if (!status.ok()) {
      return status;
    }
@ -211,8 +221,10 @@ class Repairer {
    std::string scratch;
    Slice record;
    WriteBatch batch;
-    MemTable* mem = new MemTable(icmp_, options_);
-    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
+    WriteBuffer wb(options_.db_write_buffer_size);
+    MemTable* mem = new MemTable(icmp_, ioptions_,
+                                 MutableCFOptions(options_, ioptions_), &wb);
+    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
    mem->Ref();
    int counter = 0;
    while (reader.ReadRecord(&record, &scratch)) {
@ -226,7 +238,8 @@ class Repairer {
      if (status.ok()) {
        counter += WriteBatchInternal::Count(&batch);
      } else {
-        Log(options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
+        Log(InfoLogLevel::WARN_LEVEL,
+            options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
            status.ToString().c_str());
        status = Status::OK();  // Keep going with rest of file
      }
@ -236,12 +249,15 @@ class Repairer {
    // since ExtractMetaData() will also generate edits.
    FileMetaData meta;
    meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+    {
      ReadOptions ro;
      ro.total_order_seek = true;
-    Iterator* iter = mem->NewIterator(ro);
-    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
-                        iter, &meta, icmp_, 0, 0, kNoCompression);
-    delete iter;
+      Arena arena;
+      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+      status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
+                          iter.get(), &meta, icmp_, 0, 0, kNoCompression,
+                          CompressionOptions());
+    }
    delete mem->Unref();
    delete cf_mems_default;
    mem = nullptr;
@ -250,9 +266,9 @@ class Repairer {
        table_fds_.push_back(meta.fd);
      }
    }
-    Log(options_.info_log,
-        "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter,
-        meta.fd.GetNumber(), status.ToString().c_str());
+    Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
+        "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+        log, counter, meta.fd.GetNumber(), status.ToString().c_str());
    return status;
  }

@ -267,7 +283,8 @@ class Repairer {
        char file_num_buf[kFormatFileNumberBufSize];
        FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
                         file_num_buf, sizeof(file_num_buf));
-        Log(options_.info_log, "Table #%s: ignoring %s", file_num_buf,
+        Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
+            "Table #%s: ignoring %s", file_num_buf,
            status.ToString().c_str());
        ArchiveFile(fname);
      } else {
@ -286,7 +303,7 @@ class Repairer {
                                file_size);
    if (status.ok()) {
      Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), storage_options_, icmp_, t->meta.fd);
+          ReadOptions(), env_options_, icmp_, t->meta.fd);
      bool empty = true;
      ParsedInternalKey parsed;
      t->min_sequence = 0;
@ -294,7 +311,8 @@ class Repairer {
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        Slice key = iter->key();
        if (!ParseInternalKey(key, &parsed)) {
-          Log(options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
+          Log(InfoLogLevel::ERROR_LEVEL,
+              options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
              t->meta.fd.GetNumber(), EscapeString(key).c_str());
          continue;
        }
@ -317,7 +335,8 @@ class Repairer {
      }
      delete iter;
    }
-    Log(options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+    Log(InfoLogLevel::INFO_LEVEL,
+        options_.info_log, "Table #%" PRIu64 ": %d entries %s",
        t->meta.fd.GetNumber(), counter, status.ToString().c_str());
    return status;
  }
@ -326,7 +345,7 @@ class Repairer {
    std::string tmp = TempFileName(dbname_, 1);
    unique_ptr<WritableFile> file;
    Status status = env_->NewWritableFile(
-        tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
+        tmp, &file, env_->OptimizeForManifestWrite(env_options_));
    if (!status.ok()) {
      return status;
    }
@ -394,7 +413,8 @@ class Repairer {
    new_file.append("/");
    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
    Status s = env_->RenameFile(fname, new_file);
-    Log(options_.info_log, "Archiving %s: %s\n",
+    Log(InfoLogLevel::INFO_LEVEL,
+        options_.info_log, "Archiving %s: %s\n",
        fname.c_str(), s.ToString().c_str());
  }
 };
--- a/db/simple_table_db_test.cc
+++ b/db/simple_table_db_test.cc
@ -1,810 +0,0 @@
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#include <algorithm>
-#include <set>
-
-#include "rocksdb/db.h"
-#include "rocksdb/filter_policy.h"
-#include "db/db_impl.h"
-#include "db/filename.h"
-#include "db/version_set.h"
-#include "db/write_batch_internal.h"
-#include "rocksdb/statistics.h"
-#include "rocksdb/cache.h"
-#include "rocksdb/compaction_filter.h"
-#include "rocksdb/env.h"
-#include "rocksdb/table.h"
-#include "rocksdb/table_properties.h"
-#include "table/table_builder.h"
-#include "util/hash.h"
-#include "util/logging.h"
-#include "util/mutexlock.h"
-#include "util/testharness.h"
-#include "util/testutil.h"
-#include "utilities/merge_operators.h"
-
-using std::unique_ptr;
-
-// IS THIS FILE STILL NEEDED?
-namespace rocksdb {
-
-// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
-// as production quality.
-// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
-// be longer than 150000 bytes and stored data on disk in this format:
-// +--------------------------------------------+  <= key1 offset
-// | key1            | value_size (4 bytes) |   |
-// +----------------------------------------+   |
-// | value1                                     |
-// |                                            |
-// +----------------------------------------+---+  <= key2 offset
-// | key2            | value_size (4 bytes) |   |
-// +----------------------------------------+   |
-// | value2                                     |
-// |                                            |
-// |        ......                              |
-// +-----------------+--------------------------+   <= index_block_offset
-// | key1            | key1 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// | key2            | key2 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// | key3            | key3 offset (8 bytes)    |
-// +-----------------+--------------------------+
-// |        ......                              |
-// +-----------------+------------+-------------+
-// | index_block_offset (8 bytes) |
-// +------------------------------+
-
-// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
-// as production quality.
-class SimpleTableReader: public TableReader {
-public:
-  // Attempt to open the table that is stored in bytes [0..file_size)
-  // of "file", and read the metadata entries necessary to allow
-  // retrieving data from the table.
-  //
-  // If successful, returns ok and sets "*table" to the newly opened
-  // table.  The client should delete "*table" when no longer needed.
-  // If there was an error while initializing the table, sets "*table"
-  // to nullptr and returns a non-ok status.  Does not take ownership of
-  // "*source", but the client must ensure that "source" remains live
-  // for the duration of the returned table's lifetime.
-  //
-  // *file must remain live while this Table is in use.
-  static Status Open(const Options& options, const EnvOptions& soptions,
-                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
-                     unique_ptr<TableReader>* table_reader);
-
-  Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
-
-  Status Get(const ReadOptions&, const Slice& key, void* arg,
-             bool (*handle_result)(void* arg, const ParsedInternalKey& k,
-                                   const Slice& v),
-             void (*mark_key_may_exist)(void*) = nullptr) override;
-
-  uint64_t ApproximateOffsetOf(const Slice& key) override;
-
-  virtual size_t ApproximateMemoryUsage() const override { return 0; }
-
-  void SetupForCompaction() override;
-
-  std::shared_ptr<const TableProperties> GetTableProperties() const override;
-
-  ~SimpleTableReader();
-
-private:
-  struct Rep;
-  Rep* rep_;
-
-  explicit SimpleTableReader(Rep* rep) {
-    rep_ = rep;
-  }
-  friend class TableCache;
-  friend class SimpleTableIterator;
-
-  Status GetOffset(const Slice& target, uint64_t* offset);
-
-  // No copying allowed
-  explicit SimpleTableReader(const TableReader&) = delete;
-  void operator=(const TableReader&) = delete;
-};
-
-// Iterator to iterate SimpleTable
-class SimpleTableIterator: public Iterator {
-public:
-  explicit SimpleTableIterator(SimpleTableReader* table);
-  ~SimpleTableIterator();
-
-  bool Valid() const;
-
-  void SeekToFirst();
-
-  void SeekToLast();
-
-  void Seek(const Slice& target);
-
-  void Next();
-
-  void Prev();
-
-  Slice key() const;
-
-  Slice value() const;
-
-  Status status() const;
-
-private:
-  SimpleTableReader* table_;
-  uint64_t offset_;
-  uint64_t next_offset_;
-  Slice key_;
-  Slice value_;
-  char tmp_str_[4];
-  char* key_str_;
-  char* value_str_;
-  int value_str_len_;
-  Status status_;
-  // No copying allowed
-  SimpleTableIterator(const SimpleTableIterator&) = delete;
-  void operator=(const Iterator&) = delete;
-};
-
-struct SimpleTableReader::Rep {
-  ~Rep() {
-  }
-  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
-      int num_entries) :
-      soptions(storage_options), index_start_offset(index_start_offset),
-      num_entries(num_entries) {
-  }
-
-  Options options;
-  const EnvOptions& soptions;
-  Status status;
-  unique_ptr<RandomAccessFile> file;
-  uint64_t index_start_offset;
-  int num_entries;
-  std::shared_ptr<TableProperties> table_properties;
-
-  const static int user_key_size = 16;
-  const static int offset_length = 8;
-  const static int key_footer_len = 8;
-
-  static int GetInternalKeyLength() {
-    return user_key_size + key_footer_len;
-  }
-};
-
-SimpleTableReader::~SimpleTableReader() {
-  delete rep_;
-}
-
-Status SimpleTableReader::Open(const Options& options,
-                               const EnvOptions& soptions,
-                               unique_ptr<RandomAccessFile> && file,
-                               uint64_t size,
-                               unique_ptr<TableReader>* table_reader) {
-  char footer_space[Rep::offset_length];
-  Slice footer_input;
-  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
-                        &footer_input, footer_space);
-  if (s.ok()) {
-    uint64_t index_start_offset = DecodeFixed64(footer_space);
-
-    int num_entries = (size - Rep::offset_length - index_start_offset)
-        / (Rep::GetInternalKeyLength() + Rep::offset_length);
-    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
-                                                             index_start_offset,
-                                                             num_entries);
-
-    rep->file = std::move(file);
-    rep->options = options;
-    table_reader->reset(new SimpleTableReader(rep));
-  }
-  return s;
-}
-
-void SimpleTableReader::SetupForCompaction() {
-}
-
-std::shared_ptr<const TableProperties> SimpleTableReader::GetTableProperties()
-    const {
-  return rep_->table_properties;
-}
-
-Iterator* SimpleTableReader::NewIterator(const ReadOptions& options,
-                                         Arena* arena) {
-  if (arena == nullptr) {
-    return new SimpleTableIterator(this);
-  } else {
-    auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator));
-    return new (mem) SimpleTableIterator(this);
-  }
-}
-
-Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
-  uint32_t left = 0;
-  uint32_t right = rep_->num_entries - 1;
-  char key_chars[Rep::GetInternalKeyLength()];
-  Slice tmp_slice;
-
-  uint32_t target_offset = 0;
-  while (left <= right) {
-    uint32_t mid = (left + right + 1) / 2;
-
-    uint64_t offset_to_read = rep_->index_start_offset
-        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
-    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
-                                &tmp_slice, key_chars);
-    if (!s.ok()) {
-      return s;
-    }
-
-    InternalKeyComparator ikc(rep_->options.comparator);
-    int compare_result = ikc.Compare(tmp_slice, target);
-
-    if (compare_result < 0) {
-      if (left == right) {
-        target_offset = right + 1;
-        break;
-      }
-      left = mid;
-    } else {
-      if (left == right) {
-        target_offset = left;
-        break;
-      }
-      right = mid - 1;
-    }
-  }
-
-  if (target_offset >= (uint32_t) rep_->num_entries) {
-    *offset = rep_->index_start_offset;
-    return Status::OK();
-  }
-
-  char value_offset_chars[Rep::offset_length];
-
-  int64_t offset_for_value_offset = rep_->index_start_offset
-      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
-      + Rep::GetInternalKeyLength();
-  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
-                              &tmp_slice, value_offset_chars);
-  if (s.ok()) {
-    *offset = DecodeFixed64(value_offset_chars);
-  }
-  return s;
-}
-
-Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k,
-                              void* arg,
-                              bool (*saver)(void*, const ParsedInternalKey&,
-                                            const Slice&),
-                              void (*mark_key_may_exist)(void*)) {
-  Status s;
-  SimpleTableIterator* iter = new SimpleTableIterator(this);
-  for (iter->Seek(k); iter->Valid(); iter->Next()) {
-    ParsedInternalKey parsed_key;
-    if (!ParseInternalKey(iter->key(), &parsed_key)) {
-      return Status::Corruption(Slice());
-    }
-
-    if (!(*saver)(arg, parsed_key, iter->value())) {
-      break;
-    }
-  }
-  s = iter->status();
-  delete iter;
-  return s;
-}
-
-uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
-  return 0;
-}
-
-SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
-    table_(table) {
-  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
-  value_str_len_ = -1;
-  SeekToFirst();
-}
-
-SimpleTableIterator::~SimpleTableIterator() {
- delete[] key_str_;
- if (value_str_len_ >= 0) {
-   delete[] value_str_;
- }
-}
-
-bool SimpleTableIterator::Valid() const {
-  return offset_ < table_->rep_->index_start_offset;
-}
-
-void SimpleTableIterator::SeekToFirst() {
-  next_offset_ = 0;
-  Next();
-}
-
-void SimpleTableIterator::SeekToLast() {
-  assert(false);
-}
-
-void SimpleTableIterator::Seek(const Slice& target) {
-  Status s = table_->GetOffset(target, &next_offset_);
-  if (!s.ok()) {
-    status_ = s;
-  }
-  Next();
-}
-
-void SimpleTableIterator::Next() {
-  offset_ = next_offset_;
-  if (offset_ >= table_->rep_->index_start_offset) {
-    return;
-  }
-  Slice result;
-  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
-
-  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
-                                      key_str_);
-  next_offset_ += internal_key_size;
-  key_ = result;
-
-  Slice value_size_slice;
-  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
-  next_offset_ += 4;
-  uint32_t value_size = DecodeFixed32(tmp_str_);
-
-  Slice value_slice;
-  if ((int) value_size > value_str_len_) {
-    if (value_str_len_ >= 0) {
-      delete[] value_str_;
-    }
-    value_str_ = new char[value_size];
-    value_str_len_ = value_size;
-  }
-  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
-                               value_str_);
-  next_offset_ += value_size;
-  value_ = value_slice;
-}
-
-void SimpleTableIterator::Prev() {
-  assert(false);
-}
-
-Slice SimpleTableIterator::key() const {
-  Log(table_->rep_->options.info_log, "key!!!!");
-  return key_;
-}
-
-Slice SimpleTableIterator::value() const {
-  return value_;
-}
-
-Status SimpleTableIterator::status() const {
-  return status_;
-}
-
-class SimpleTableBuilder: public TableBuilder {
-public:
-  // Create a builder that will store the contents of the table it is
-  // building in *file.  Does not close the file.  It is up to the
-  // caller to close the file after calling Finish(). The output file
-  // will be part of level specified by 'level'.  A value of -1 means
-  // that the caller does not know which level the output file will reside.
-  SimpleTableBuilder(const Options& options, WritableFile* file,
-                     CompressionType compression_type);
-
-  // REQUIRES: Either Finish() or Abandon() has been called.
-  ~SimpleTableBuilder();
-
-  // Add key,value to the table being constructed.
-  // REQUIRES: key is after any previously added key according to comparator.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Add(const Slice& key, const Slice& value) override;
-
-  // Return non-ok iff some error has been detected.
-  Status status() const override;
-
-  // Finish building the table.  Stops using the file passed to the
-  // constructor after this function returns.
-  // REQUIRES: Finish(), Abandon() have not been called
-  Status Finish() override;
-
-  // Indicate that the contents of this builder should be abandoned.  Stops
-  // using the file passed to the constructor after this function returns.
-  // If the caller is not going to call Finish(), it must call Abandon()
-  // before destroying this builder.
-  // REQUIRES: Finish(), Abandon() have not been called
-  void Abandon() override;
-
-  // Number of calls to Add() so far.
-  uint64_t NumEntries() const override;
-
-  // Size of the file generated so far.  If invoked after a successful
-  // Finish() call, returns the size of the final generated file.
-  uint64_t FileSize() const override;
-
-private:
-  struct Rep;
-  Rep* rep_;
-
-  // No copying allowed
-  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
-  void operator=(const SimpleTableBuilder&) = delete;
-};
-
-struct SimpleTableBuilder::Rep {
-  Options options;
-  WritableFile* file;
-  uint64_t offset = 0;
-  Status status;
-
-  uint64_t num_entries = 0;
-
-  bool closed = false;  // Either Finish() or Abandon() has been called.
-
-  const static int user_key_size = 16;
-  const static int offset_length = 8;
-  const static int key_footer_len = 8;
-
-  static int GetInternalKeyLength() {
-    return user_key_size + key_footer_len;
-  }
-
-  std::string index;
-
-  Rep(const Options& opt, WritableFile* f) :
-      options(opt), file(f) {
-  }
-  ~Rep() {
-  }
-};
-
-SimpleTableBuilder::SimpleTableBuilder(const Options& options,
-                                       WritableFile* file,
-                                       CompressionType compression_type) :
-    rep_(new SimpleTableBuilder::Rep(options, file)) {
-}
-
-SimpleTableBuilder::~SimpleTableBuilder() {
-  delete (rep_);
-}
-
-void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
-  assert((int ) key.size() == Rep::GetInternalKeyLength());
-
-  // Update index
-  rep_->index.append(key.data(), key.size());
-  PutFixed64(&(rep_->index), rep_->offset);
-
-  // Write key-value pair
-  rep_->file->Append(key);
-  rep_->offset += Rep::GetInternalKeyLength();
-
-  std::string size;
-  int value_size = value.size();
-  PutFixed32(&size, value_size);
-  Slice sizeSlice(size);
-  rep_->file->Append(sizeSlice);
-  rep_->file->Append(value);
-  rep_->offset += value_size + 4;
-
-  rep_->num_entries++;
-}
-
-Status SimpleTableBuilder::status() const {
-  return Status::OK();
-}
-
-Status SimpleTableBuilder::Finish() {
-  Rep* r = rep_;
-  assert(!r->closed);
-  r->closed = true;
-
-  uint64_t index_offset = rep_->offset;
-  Slice index_slice(rep_->index);
-  rep_->file->Append(index_slice);
-  rep_->offset += index_slice.size();
-
-  std::string index_offset_str;
-  PutFixed64(&index_offset_str, index_offset);
-  Slice foot_slice(index_offset_str);
-  rep_->file->Append(foot_slice);
-  rep_->offset += foot_slice.size();
-
-  return Status::OK();
-}
-
-void SimpleTableBuilder::Abandon() {
-  rep_->closed = true;
-}
-
-uint64_t SimpleTableBuilder::NumEntries() const {
-  return rep_->num_entries;
-}
-
-uint64_t SimpleTableBuilder::FileSize() const {
-  return rep_->offset;
-}
-
-class SimpleTableFactory: public TableFactory {
-public:
-  ~SimpleTableFactory() {
-  }
-  SimpleTableFactory() {
-  }
-  const char* Name() const override {
-    return "SimpleTable";
-  }
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_key,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table_reader) const;
-
-  TableBuilder* NewTableBuilder(const Options& options,
-                                const InternalKeyComparator& internal_key,
-                                WritableFile* file,
-                                CompressionType compression_type) const;
-
-  virtual Status SanitizeDBOptions(DBOptions* db_opts) const override {
-    return Status::OK();
-  }
-
-  virtual std::string GetPrintableTableOptions() const override {
-    return std::string();
-  }
-};
-
-Status SimpleTableFactory::NewTableReader(
-    const Options& options, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_key,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader) const {
-
-  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
-                                 table_reader);
-}
-
-TableBuilder* SimpleTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_key,
-    WritableFile* file, CompressionType compression_type) const {
-  return new SimpleTableBuilder(options, file, compression_type);
-}
-
-class SimpleTableDBTest {
-protected:
-public:
-  std::string dbname_;
-  Env* env_;
-  DB* db_;
-
-  Options last_options_;
-
-  SimpleTableDBTest() :
-      env_(Env::Default()) {
-    dbname_ = test::TmpDir() + "/simple_table_db_test";
-    ASSERT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
-    Reopen();
-  }
-
-  ~SimpleTableDBTest() {
-    delete db_;
-    ASSERT_OK(DestroyDB(dbname_, Options()));
-  }
-
-  // Return the current option configuration.
-  Options CurrentOptions() {
-    Options options;
-    options.table_factory.reset(new SimpleTableFactory());
-    return options;
-  }
-
-  DBImpl* dbfull() {
-    return reinterpret_cast<DBImpl*>(db_);
-  }
-
-  void Reopen(Options* options = nullptr) {
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Close() {
-    delete db_;
-    db_ = nullptr;
-  }
-
-  void DestroyAndReopen(Options* options = nullptr) {
-    //Destroy using last options
-    Destroy(&last_options_);
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Destroy(Options* options) {
-    delete db_;
-    db_ = nullptr;
-    ASSERT_OK(DestroyDB(dbname_, *options));
-  }
-
-  Status PureReopen(Options* options, DB** db) {
-    return DB::Open(*options, dbname_, db);
-  }
-
-  Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
-    Options opts;
-    if (options != nullptr) {
-      opts = *options;
-    } else {
-      opts = CurrentOptions();
-      opts.create_if_missing = true;
-    }
-    last_options_ = opts;
-
-    return DB::Open(opts, dbname_, &db_);
-  }
-
-  Status Put(const Slice& k, const Slice& v) {
-    return db_->Put(WriteOptions(), k, v);
-  }
-
-  Status Delete(const std::string& k) {
-    return db_->Delete(WriteOptions(), k);
-  }
-
-  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
-    ReadOptions options;
-    options.snapshot = snapshot;
-    std::string result;
-    Status s = db_->Get(options, k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
-
-
-  int NumTableFilesAtLevel(int level) {
-    std::string property;
-    ASSERT_TRUE(
-        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
-                         &property));
-    return atoi(property.c_str());
-  }
-
-  // Return spread of files per level
-  std::string FilesPerLevel() {
-    std::string result;
-    int last_non_zero_offset = 0;
-    for (int level = 0; level < db_->NumberLevels(); level++) {
-      int f = NumTableFilesAtLevel(level);
-      char buf[100];
-      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
-      result += buf;
-      if (f > 0) {
-        last_non_zero_offset = result.size();
-      }
-    }
-    result.resize(last_non_zero_offset);
-    return result;
-  }
-
-  std::string IterStatus(Iterator* iter) {
-    std::string result;
-    if (iter->Valid()) {
-      result = iter->key().ToString() + "->" + iter->value().ToString();
-    } else {
-      result = "(invalid)";
-    }
-    return result;
-  }
-};
-
-TEST(SimpleTableDBTest, Empty) {
-  ASSERT_TRUE(db_ != nullptr);
-  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
-}
-
-TEST(SimpleTableDBTest, ReadWrite) {
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  ASSERT_EQ("v1", Get("0000000000000foo"));
-  ASSERT_OK(Put("0000000000000bar", "v2"));
-  ASSERT_OK(Put("0000000000000foo", "v3"));
-  ASSERT_EQ("v3", Get("0000000000000foo"));
-  ASSERT_EQ("v2", Get("0000000000000bar"));
-}
-
-TEST(SimpleTableDBTest, Flush) {
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  ASSERT_OK(Put("0000000000000bar", "v2"));
-  ASSERT_OK(Put("0000000000000foo", "v3"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v3", Get("0000000000000foo"));
-  ASSERT_EQ("v2", Get("0000000000000bar"));
-}
-
-TEST(SimpleTableDBTest, Flush2) {
-  ASSERT_OK(Put("0000000000000bar", "b"));
-  ASSERT_OK(Put("0000000000000foo", "v1"));
-  dbfull()->TEST_FlushMemTable();
-
-  ASSERT_OK(Put("0000000000000foo", "v2"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v2", Get("0000000000000foo"));
-
-  ASSERT_OK(Put("0000000000000eee", "v3"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v3", Get("0000000000000eee"));
-
-  ASSERT_OK(Delete("0000000000000bar"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
-
-  ASSERT_OK(Put("0000000000000eee", "v5"));
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_EQ("v5", Get("0000000000000eee"));
-}
-
-static std::string Key(int i) {
-  char buf[100];
-  snprintf(buf, sizeof(buf), "key_______%06d", i);
-  return std::string(buf);
-}
-
-static std::string RandomString(Random* rnd, int len) {
-  std::string r;
-  test::RandomString(rnd, len, &r);
-  return r;
-}
-
-TEST(SimpleTableDBTest, CompactionTrigger) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 100 << 10; //100KB
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  options.level0_file_num_compaction_trigger = 3;
-  Reopen(&options);
-
-  Random rnd(301);
-
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-      num++) {
-    std::vector<std::string> values;
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      values.push_back(RandomString(&rnd, 10000));
-      ASSERT_OK(Put(Key(i), values[i]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
-  }
-
-  //generate one more file in level-0, and should trigger level-0 compaction
-  std::vector<std::string> values;
-  for (int i = 0; i < 12; i++) {
-    values.push_back(RandomString(&rnd, 10000));
-    ASSERT_OK(Put(Key(i), values[i]));
-  }
-  dbfull()->TEST_WaitForCompact();
-
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
-}
-
-}  // namespace rocksdb
-
-int main(int argc, char** argv) {
-  return rocksdb::test::RunAllTests();
-}
--- a/db/skiplist.h
+++ b/db/skiplist.h
@ -32,10 +32,10 @@

 #pragma once
 #include <assert.h>
+#include <atomic>
 #include <stdlib.h>
-#include "util/arena.h"
 #include "port/port.h"
-#include "util/arena.h"
+#include "util/allocator.h"
 #include "util/random.h"

 namespace rocksdb {
@ -47,9 +47,9 @@ class SkipList {

 public:
  // Create a new SkipList object that will use "cmp" for comparing keys,
-  // and will allocate memory using "*arena".  Objects allocated in the arena
-  // must remain allocated for the lifetime of the skiplist object.
-  explicit SkipList(Comparator cmp, Arena* arena,
+  // and will allocate memory using "*allocator".  Objects allocated in the
+  // allocator must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Allocator* allocator,
                    int32_t max_height = 12, int32_t branching_factor = 4);

  // Insert key into the list.
@ -109,21 +109,20 @@ class SkipList {

  // Immutable after construction
  Comparator const compare_;
-  Arena* const arena_;    // Arena used for allocations of nodes
+  Allocator* const allocator_;    // Allocator used for allocations of nodes

  Node* const head_;

  // Modified only by Insert().  Read racily by readers, but stale
  // values are ok.
-  port::AtomicPointer max_height_;   // Height of the entire list
+  std::atomic<int> max_height_;  // Height of the entire list

  // Used for optimizing sequential insert patterns
  Node** prev_;
  int32_t prev_height_;

  inline int GetMaxHeight() const {
-    return static_cast<int>(
-        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+    return max_height_.load(std::memory_order_relaxed);
  }

  // Read/written only by Insert().
@ -169,35 +168,35 @@ struct SkipList<Key, Comparator>::Node {
    assert(n >= 0);
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
-    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+    return (next_[n].load(std::memory_order_acquire));
  }
  void SetNext(int n, Node* x) {
    assert(n >= 0);
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
-    next_[n].Release_Store(x);
+    next_[n].store(x, std::memory_order_release);
  }

  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next(int n) {
    assert(n >= 0);
-    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+    return next_[n].load(std::memory_order_relaxed);
  }
  void NoBarrier_SetNext(int n, Node* x) {
    assert(n >= 0);
-    next_[n].NoBarrier_Store(x);
+    next_[n].store(x, std::memory_order_relaxed);
  }

 private:
  // Array of length equal to the node height.  next_[0] is lowest level link.
-  port::AtomicPointer next_[1];
+  std::atomic<Node*> next_[1];
 };

 template<typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node*
 SkipList<Key, Comparator>::NewNode(const Key& key, int height) {
-  char* mem = arena_->AllocateAligned(
-      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  char* mem = allocator_->AllocateAligned(
+      sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
  return new (mem) Node(key);
 }

@ -356,23 +355,24 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
 }

 template<typename Key, class Comparator>
-SkipList<Key, Comparator>::SkipList(const Comparator cmp, Arena* arena,
+SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
                                   int32_t max_height,
                                   int32_t branching_factor)
    : kMaxHeight_(max_height),
      kBranching_(branching_factor),
      compare_(cmp),
-      arena_(arena),
+      allocator_(allocator),
      head_(NewNode(0 /* any key will do */, max_height)),
-      max_height_(reinterpret_cast<void*>(1)),
+      max_height_(1),
      prev_height_(1),
      rnd_(0xdeadbeef) {
  assert(kMaxHeight_ > 0);
  assert(kBranching_ > 0);
-  // Allocate the prev_ Node* array, directly from the passed-in arena.
+  // Allocate the prev_ Node* array, directly from the passed-in allocator.
  // prev_ does not need to be freed, as its life cycle is tied up with
-  // the arena as a whole.
-  prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_);
+  // the allocator as a whole.
+  prev_ = reinterpret_cast<Node**>(
+            allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_));
  for (int i = 0; i < kMaxHeight_; i++) {
    head_->SetNext(i, nullptr);
    prev_[i] = head_;
@ -402,7 +402,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
    // the loop below.  In the former case the reader will
    // immediately drop to the next level since nullptr sorts after all
    // keys.  In the latter case the reader will use the new node.
-    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+    max_height_.store(height, std::memory_order_relaxed);
  }

  x = NewNode(key, height);
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@ -191,13 +191,11 @@ class ConcurrentTest {

  // Per-key generation
  struct State {
-    port::AtomicPointer generation[K];
-    void Set(int k, intptr_t v) {
-      generation[k].Release_Store(reinterpret_cast<void*>(v));
-    }
-    intptr_t Get(int k) {
-      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    std::atomic<int> generation[K];
+    void Set(int k, int v) {
+      generation[k].store(v, std::memory_order_release);
    }
+    int Get(int k) { return generation[k].load(std::memory_order_acquire); }

    State() {
      for (unsigned int k = 0; k < K; k++) {
@ -221,9 +219,9 @@ class ConcurrentTest {
  // REQUIRES: External synchronization
  void WriteStep(Random* rnd) {
    const uint32_t k = rnd->Next() % K;
-    const intptr_t g = current_.Get(k) + 1;
-    const Key key = MakeKey(k, g);
-    list_.Insert(key);
+    const int g = current_.Get(k) + 1;
+    const Key new_key = MakeKey(k, g);
+    list_.Insert(new_key);
    current_.Set(k, g);
  }

@ -255,11 +253,10 @@ class ConcurrentTest {
        // Note that generation 0 is never inserted, so it is ok if
        // <*,0,*> is missing.
        ASSERT_TRUE((gen(pos) == 0U) ||
-                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
-                    ) << "key: " << key(pos)
-                      << "; gen: " << gen(pos)
-                      << "; initgen: "
-                      << initial_state.Get(key(pos));
+                    (gen(pos) > static_cast<uint64_t>(initial_state.Get(
+                                    static_cast<int>(key(pos))))))
+            << "key: " << key(pos) << "; gen: " << gen(pos)
+            << "; initgen: " << initial_state.Get(static_cast<int>(key(pos)));

        // Advance to next key in the valid key space
        if (key(pos) < key(current)) {
@ -303,7 +300,7 @@ class TestState {
 public:
  ConcurrentTest t_;
  int seed_;
-  port::AtomicPointer quit_flag_;
+  std::atomic<bool> quit_flag_;

  enum ReaderState {
    STARTING,
@ -312,10 +309,7 @@ class TestState {
  };

  explicit TestState(int s)
-      : seed_(s),
-        quit_flag_(nullptr),
-        state_(STARTING),
-        state_cv_(&mu_) {}
+      : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {}

  void Wait(ReaderState s) {
    mu_.Lock();
@ -343,7 +337,7 @@ static void ConcurrentReader(void* arg) {
  Random rnd(state->seed_);
  int64_t reads = 0;
  state->Change(TestState::RUNNING);
-  while (!state->quit_flag_.Acquire_Load()) {
+  while (!state->quit_flag_.load(std::memory_order_acquire)) {
    state->t_.ReadStep(&rnd);
    ++reads;
  }
@ -362,10 +356,10 @@ static void RunConcurrent(int run) {
    TestState state(seed + 1);
    Env::Default()->Schedule(ConcurrentReader, &state);
    state.Wait(TestState::RUNNING);
-    for (int i = 0; i < kSize; i++) {
+    for (int k = 0; k < kSize; k++) {
      state.t_.WriteStep(&rnd);
    }
-    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.quit_flag_.store(true, std::memory_order_release);
    state.Wait(TestState::DONE);
  }
 }
--- a/db/snapshot.h
+++ b/db/snapshot.h
@ -20,6 +20,8 @@ class SnapshotImpl : public Snapshot {
 public:
  SequenceNumber number_;  // const after creation

+  virtual SequenceNumber GetSequenceNumber() const { return number_; }
+
 private:
  friend class SnapshotList;

@ -28,6 +30,8 @@ class SnapshotImpl : public Snapshot {
  SnapshotImpl* next_;

  SnapshotList* list_;                 // just for sanity checks
+
+  int64_t unix_time_;
 };

 class SnapshotList {
@ -36,20 +40,23 @@ class SnapshotList {
    list_.prev_ = &list_;
    list_.next_ = &list_;
    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+    count_ = 0;
  }

  bool empty() const { return list_.next_ == &list_; }
  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }

-  const SnapshotImpl* New(SequenceNumber seq) {
+  const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) {
    SnapshotImpl* s = new SnapshotImpl;
    s->number_ = seq;
+    s->unix_time_ = unix_time;
    s->list_ = this;
    s->next_ = &list_;
    s->prev_ = list_.prev_;
    s->prev_->next_ = s;
    s->next_->prev_ = s;
+    count_++;
    return s;
  }

@ -57,6 +64,7 @@ class SnapshotList {
    assert(s->list_ == this);
    s->prev_->next_ = s->next_;
    s->next_->prev_ = s->prev_;
+    count_--;
    delete s;
  }

@ -71,16 +79,27 @@ class SnapshotList {
  }

  // get the sequence number of the most recent snapshot
-  const SequenceNumber GetNewest() {
+  SequenceNumber GetNewest() {
    if (empty()) {
      return 0;
    }
    return newest()->number_;
  }

+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
 private:
  // Dummy head of doubly-linked list of snapshots
  SnapshotImpl list_;
+  uint64_t count_;
 };

 }  // namespace rocksdb
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -15,6 +15,7 @@
 #include "rocksdb/statistics.h"
 #include "table/iterator_wrapper.h"
 #include "table/table_reader.h"
+#include "table/get_context.h"
 #include "util/coding.h"
 #include "util/stop_watch.h"

@ -36,12 +37,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) {
               sizeof(*file_number));
 }

-TableCache::TableCache(const Options* options,
-                       const EnvOptions& storage_options, Cache* const cache)
-    : env_(options->env),
-      db_paths_(options->db_paths),
-      options_(options),
-      storage_options_(storage_options),
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+                       const EnvOptions& env_options, Cache* const cache)
+    : ioptions_(ioptions),
+      env_options_(env_options),
      cache_(cache) {}

 TableCache::~TableCache() {
@ -55,7 +54,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
  cache_->Release(handle);
 }

-Status TableCache::FindTable(const EnvOptions& toptions,
+Status TableCache::FindTable(const EnvOptions& env_options,
                             const InternalKeyComparator& internal_comparator,
                             const FileDescriptor& fd, Cache::Handle** handle,
                             const bool no_io) {
@ -68,28 +67,27 @@ Status TableCache::FindTable(const EnvOptions& toptions,
      return Status::Incomplete("Table not found in table_cache, no_io is set");
    }
    std::string fname =
-        TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId());
+        TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
    unique_ptr<RandomAccessFile> file;
    unique_ptr<TableReader> table_reader;
-    s = env_->NewRandomAccessFile(fname, &file, toptions);
-    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
+    s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
+    RecordTick(ioptions_.statistics, NO_FILE_OPENS);
    if (s.ok()) {
-      if (options_->advise_random_on_open) {
+      if (ioptions_.advise_random_on_open) {
        file->Hint(RandomAccessFile::RANDOM);
      }
-      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
-      s = options_->table_factory->NewTableReader(
-          *options_, toptions, internal_comparator, std::move(file),
+      StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+      s = ioptions_.table_factory->NewTableReader(
+          ioptions_, env_options, internal_comparator, std::move(file),
          fd.GetFileSize(), &table_reader);
    }

    if (!s.ok()) {
      assert(table_reader == nullptr);
-      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
+      RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
    } else {
-      assert(file.get() == nullptr);
      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
    }
  }
@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions,
 }

 Iterator* TableCache::NewIterator(const ReadOptions& options,
-                                  const EnvOptions& toptions,
+                                  const EnvOptions& env_options,
                                  const InternalKeyComparator& icomparator,
                                  const FileDescriptor& fd,
                                  TableReader** table_reader_ptr,
@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  Cache::Handle* handle = nullptr;
  Status s;
  if (table_reader == nullptr) {
-    s = FindTable(toptions, icomparator, fd, &handle,
+    s = FindTable(env_options, icomparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (!s.ok()) {
      return NewErrorIterator(s, arena);
@ -134,34 +132,33 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,

 Status TableCache::Get(const ReadOptions& options,
                       const InternalKeyComparator& internal_comparator,
-                       const FileDescriptor& fd, const Slice& k, void* arg,
-                       bool (*saver)(void*, const ParsedInternalKey&,
-                                     const Slice&),
-                       void (*mark_key_may_exist)(void*)) {
+                       const FileDescriptor& fd, const Slice& k,
+                       GetContext* get_context) {
  TableReader* t = fd.table_reader;
  Status s;
  Cache::Handle* handle = nullptr;
  if (!t) {
-    s = FindTable(storage_options_, internal_comparator, fd, &handle,
+    s = FindTable(env_options_, internal_comparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
    }
  }
  if (s.ok()) {
-    s = t->Get(options, k, arg, saver, mark_key_may_exist);
+    s = t->Get(options, k, get_context);
    if (handle != nullptr) {
      ReleaseHandle(handle);
    }
  } else if (options.read_tier && s.IsIncomplete()) {
    // Couldnt find Table in cache but treat as kFound if no_io set
-    (*mark_key_may_exist)(arg);
+    get_context->MarkKeyMayExist();
    return Status::OK();
  }
  return s;
 }
+
 Status TableCache::GetTableProperties(
-    const EnvOptions& toptions,
+    const EnvOptions& env_options,
    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status s;
@ -174,7 +171,7 @@ Status TableCache::GetTableProperties(
  }

  Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io);
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io);
  if (!s.ok()) {
    return s;
  }
@ -186,7 +183,7 @@ Status TableCache::GetTableProperties(
 }

 size_t TableCache::GetMemoryUsageByTableReader(
-    const EnvOptions& toptions,
+    const EnvOptions& env_options,
    const InternalKeyComparator& internal_comparator,
    const FileDescriptor& fd) {
  Status s;
@ -197,7 +194,7 @@ size_t TableCache::GetMemoryUsageByTableReader(
  }

  Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, fd, &table_handle, true);
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, true);
  if (!s.ok()) {
    return 0;
  }
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -19,6 +19,7 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/options.h"
 #include "table/table_reader.h"

 namespace rocksdb {
@ -26,11 +27,12 @@ namespace rocksdb {
 class Env;
 class Arena;
 struct FileDescriptor;
+class GetContext;

 class TableCache {
 public:
-  TableCache(const Options* options, const EnvOptions& storage_options,
-             Cache* cache);
+  TableCache(const ImmutableCFOptions& ioptions,
+             const EnvOptions& storage_options, Cache* cache);
  ~TableCache();

  // Return an iterator for the specified file number (the corresponding
@ -51,10 +53,8 @@ class TableCache {
  // it returns false.
  Status Get(const ReadOptions& options,
             const InternalKeyComparator& internal_comparator,
-             const FileDescriptor& file_fd, const Slice& k, void* arg,
-             bool (*handle_result)(void*, const ParsedInternalKey&,
-                                   const Slice&),
-             void (*mark_key_may_exist)(void*) = nullptr);
+             const FileDescriptor& file_fd, const Slice& k,
+             GetContext* get_context);

  // Evict any entry for the specified file number
  static void Evict(Cache* cache, uint64_t file_number);
@ -91,10 +91,8 @@ class TableCache {
  void ReleaseHandle(Cache::Handle* handle);

 private:
-  Env* const env_;
-  const std::vector<DbPath> db_paths_;
-  const Options* options_;
-  const EnvOptions& storage_options_;
+  const ImmutableCFOptions& ioptions_;
+  const EnvOptions& env_options_;
  Cache* const cache_;
 };

--- a/db/table_properties_collector.cc
+++ b/db/table_properties_collector.cc
@ -7,6 +7,7 @@

 #include "db/dbformat.h"
 #include "util/coding.h"
+#include "util/string_util.h"

 namespace rocksdb {

@ -40,7 +41,7 @@ Status InternalKeyPropertiesCollector::Finish(
 UserCollectedProperties
 InternalKeyPropertiesCollector::GetReadableProperties() const {
  return {
-    { "kDeletedKeys", std::to_string(deleted_keys_) }
+    { "kDeletedKeys", ToString(deleted_keys_) }
  };
 }

--- a/db/table_properties_collector_test.cc
+++ b/db/table_properties_collector_test.cc
@ -11,6 +11,7 @@
 #include "db/dbformat.h"
 #include "db/table_properties_collector.h"
 #include "rocksdb/table.h"
+#include "rocksdb/immutable_options.h"
 #include "table/block_based_table_factory.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
@ -78,6 +79,7 @@ class FakeRandomeAccessFile : public RandomAccessFile {

 class DumbLogger : public Logger {
 public:
+  using Logger::Logv;
  virtual void Logv(const char* format, va_list ap) { }
  virtual size_t GetLogFileSize() const { return 0; }
 };
@ -85,12 +87,14 @@ class DumbLogger : public Logger {
 // Utilities test functions
 namespace {
 void MakeBuilder(const Options& options,
+                 const ImmutableCFOptions& ioptions,
                 const InternalKeyComparator& internal_comparator,
                 std::unique_ptr<FakeWritableFile>* writable,
                 std::unique_ptr<TableBuilder>* builder) {
  writable->reset(new FakeWritableFile);
-  builder->reset(options.table_factory->NewTableBuilder(
-      options, internal_comparator, writable->get(), options.compression));
+  builder->reset(ioptions.table_factory->NewTableBuilder(
+      ioptions, internal_comparator, writable->get(),
+      options.compression, options.compression_opts));
 }
 }  // namespace

@ -153,7 +157,8 @@ void TestCustomizedTablePropertiesCollector(
  // -- Step 1: build table
  std::unique_ptr<TableBuilder> builder;
  std::unique_ptr<FakeWritableFile> writable;
-  MakeBuilder(options, internal_comparator, &writable, &builder);
+  const ImmutableCFOptions ioptions(options);
+  MakeBuilder(options, ioptions, internal_comparator, &writable, &builder);

  for (const auto& kv : kvs) {
    if (encode_as_internal) {
@ -264,9 +269,10 @@ void TestInternalKeyPropertiesCollector(
    options.table_properties_collector_factories = {
        std::make_shared<InternalKeyPropertiesCollectorFactory>()};
  }
+  const ImmutableCFOptions ioptions(options);

  for (int iter = 0; iter < 2; ++iter) {
-    MakeBuilder(options, pikc, &writable, &builder);
+    MakeBuilder(options, ioptions, pikc, &writable, &builder);
    for (const auto& k : keys) {
      builder->Add(k.Encode(), "val");
    }
--- a/db/transaction_log_impl.cc
+++ b/db/transaction_log_impl.cc
@ -4,6 +4,11 @@
 //  of patent rights can be found in the PATENTS file in the same directory.

 #ifndef ROCKSDB_LITE
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include "db/transaction_log_impl.h"
 #include "db/write_batch_internal.h"

@ -13,7 +18,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
    const std::string& dir, const DBOptions* options,
    const TransactionLogIterator::ReadOptions& read_options,
    const EnvOptions& soptions, const SequenceNumber seq,
-    std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
+    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions)
    : dir_(dir),
      options_(options),
      read_options_(read_options),
@ -25,9 +30,9 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
      currentFileIndex_(0),
      currentBatchSeq_(0),
      currentLastSeq_(0),
-      dbimpl_(dbimpl) {
+      versions_(versions) {
  assert(files_ != nullptr);
-  assert(dbimpl_ != nullptr);
+  assert(versions_ != nullptr);

  reporter_.env = options_->env;
  reporter_.info_log = options_->info_log.get();
@ -43,14 +48,14 @@ Status TransactionLogIteratorImpl::OpenLogFile(
    return env->NewSequentialFile(fname, file, soptions_);
  } else {
    std::string fname = LogFileName(dir_, logFile->LogNumber());
-    Status status = env->NewSequentialFile(fname, file, soptions_);
-    if (!status.ok()) {
+    Status s = env->NewSequentialFile(fname, file, soptions_);
+    if (!s.ok()) {
      //  If cannot open file in DB directory.
      //  Try the archive dir, as it could have moved in the meanwhile.
      fname = ArchivedLogFileName(dir_, logFile->LogNumber());
-      status = env->NewSequentialFile(fname, file, soptions_);
+      s = env->NewSequentialFile(fname, file, soptions_);
    }
-    return status;
+    return s;
  }
 }

@ -74,7 +79,7 @@ bool TransactionLogIteratorImpl::RestrictedRead(
    Slice* record,
    std::string* scratch) {
  // Don't read if no more complete entries to read from logs
-  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+  if (currentLastSeq_ >= versions_->LastSequence()) {
    return false;
  }
  return currentLogReader_->ReadRecord(record, scratch);
@ -177,15 +182,15 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) {
    // Open the next file
    if (currentFileIndex_ < files_->size() - 1) {
      ++currentFileIndex_;
-      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
-      if (!status.ok()) {
+      Status s = OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!s.ok()) {
        isValid_ = false;
-        currentStatus_ = status;
+        currentStatus_ = s;
        return;
      }
    } else {
      isValid_ = false;
-      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+      if (currentLastSeq_ == versions_->LastSequence()) {
        currentStatus_ = Status::OK();
      } else {
        currentStatus_ = Status::Corruption("NO MORE DATA LEFT");
@ -203,12 +208,10 @@ bool TransactionLogIteratorImpl::IsBatchExpected(
  if (batchSeq != expectedSeq) {
    char buf[200];
    snprintf(buf, sizeof(buf),
-             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
-             "Last flushed seq=%lu.Log iterator will reseek the correct "
-             "batch.",
-             (unsigned long)batchSeq,
-             (unsigned long)expectedSeq,
-             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+             "Discontinuity in log records. Got seq=%" PRIu64
+             ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+             ".Log iterator will reseek the correct batch.",
+             batchSeq, expectedSeq, versions_->LastSequence());
    reporter_.Info(buf);
    return false;
  }
@ -240,7 +243,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
  currentLastSeq_ = currentBatchSeq_ +
                    WriteBatchInternal::Count(batch.get()) - 1;
  // currentBatchSeq_ can only change here
-  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+  assert(currentLastSeq_ <= versions_->LastSequence());

  currentBatch_ = move(batch);
  isValid_ = true;
@ -249,9 +252,9 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {

 Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
  unique_ptr<SequentialFile> file;
-  Status status = OpenLogFile(logFile, &file);
-  if (!status.ok()) {
-    return status;
+  Status s = OpenLogFile(logFile, &file);
+  if (!s.ok()) {
+    return s;
  }
  assert(file);
  currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
--- a/db/transaction_log_impl.h
+++ b/db/transaction_log_impl.h
@ -11,23 +11,12 @@
 #include "rocksdb/options.h"
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
-#include "db/db_impl.h"
+#include "db/version_set.h"
 #include "db/log_reader.h"
 #include "db/filename.h"

 namespace rocksdb {

-struct LogReporter : public log::Reader::Reporter {
-  Env* env;
-  Logger* info_log;
-  virtual void Corruption(size_t bytes, const Status& s) {
-    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
-  }
-  virtual void Info(const char* s) {
-    Log(info_log, "%s", s);
-  }
-};
-
 class LogFileImpl : public LogFile {
 public:
  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
@ -71,7 +60,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
      const std::string& dir, const DBOptions* options,
      const TransactionLogIterator::ReadOptions& read_options,
      const EnvOptions& soptions, const SequenceNumber seqNum,
-      std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
+      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);

  virtual bool Valid();

@ -95,10 +84,24 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
  std::unique_ptr<WriteBatch> currentBatch_;
  unique_ptr<log::Reader> currentLogReader_;
  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
-  LogReporter reporter_;
+
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes,
+          s.ToString().c_str());
+    }
+    virtual void Info(const char* s) {
+      Log(InfoLogLevel::INFO_LEVEL, info_log, "%s", s);
+    }
+  } reporter_;
+
  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
  SequenceNumber currentLastSeq_; // last sequence in the current batch
-  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+  // Used only to get latest seq. num
+  // TODO(icanadi) can this be just a callback?
+  VersionSet const* const versions_;

  // Reads from transaction log only if the writebatch record has been written
  bool RestrictedRead(Slice* record, std::string* scratch);
--- a/db/version_builder.cc
+++ b/db/version_builder.cc
@ -0,0 +1,330 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "table/table_reader.h"
+
+namespace rocksdb {
+
+bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
+  if (a->smallest_seqno != b->smallest_seqno) {
+    return a->smallest_seqno > b->smallest_seqno;
+  }
+  if (a->largest_seqno != b->largest_seqno) {
+    return a->largest_seqno > b->largest_seqno;
+  }
+  // Break ties by file number
+  return a->fd.GetNumber() > b->fd.GetNumber();
+}
+
+namespace {
+bool BySmallestKey(FileMetaData* a, FileMetaData* b,
+                   const InternalKeyComparator* cmp) {
+  int r = cmp->Compare(a->smallest, b->smallest);
+  if (r != 0) {
+    return (r < 0);
+  }
+  // Break ties by file number
+  return (a->fd.GetNumber() < b->fd.GetNumber());
+}
+}  // namespace
+
+class VersionBuilder::Rep {
+ private:
+  // Helper to sort files_ in v
+  // kLevel0 -- NewestFirstBySeqNo
+  // kLevelNon0 -- BySmallestKey
+  struct FileComparator {
+    enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method;
+    const InternalKeyComparator* internal_comparator;
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      switch (sort_method) {
+        case kLevel0:
+          return NewestFirstBySeqNo(f1, f2);
+        case kLevelNon0:
+          return BySmallestKey(f1, f2, internal_comparator);
+      }
+      assert(false);
+      return false;
+    }
+  };
+
+  struct LevelState {
+    std::unordered_set<uint64_t> deleted_files;
+    // Map from file number to file meta data.
+    std::unordered_map<uint64_t, FileMetaData*> added_files;
+  };
+
+  const EnvOptions& env_options_;
+  TableCache* table_cache_;
+  VersionStorageInfo* base_vstorage_;
+  LevelState* levels_;
+  FileComparator level_zero_cmp_;
+  FileComparator level_nonzero_cmp_;
+
+ public:
+  Rep(const EnvOptions& env_options, TableCache* table_cache,
+      VersionStorageInfo* base_vstorage)
+      : env_options_(env_options),
+        table_cache_(table_cache),
+        base_vstorage_(base_vstorage) {
+    levels_ = new LevelState[base_vstorage_->num_levels()];
+    level_zero_cmp_.sort_method = FileComparator::kLevel0;
+    level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0;
+    level_nonzero_cmp_.internal_comparator =
+        base_vstorage_->InternalComparator();
+  }
+
+  ~Rep() {
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      const auto& added = levels_[level].added_files;
+      for (auto& pair : added) {
+        UnrefFile(pair.second);
+      }
+    }
+
+    delete[] levels_;
+  }
+
+  void UnrefFile(FileMetaData* f) {
+    f->refs--;
+    if (f->refs <= 0) {
+      if (f->table_reader_handle) {
+        assert(table_cache_ != nullptr);
+        table_cache_->ReleaseHandle(f->table_reader_handle);
+        f->table_reader_handle = nullptr;
+      }
+      delete f;
+    }
+  }
+
+  void CheckConsistency(VersionStorageInfo* vstorage) {
+#ifndef NDEBUG
+    // make sure the files are sorted correctly
+    for (int level = 0; level < vstorage->num_levels(); level++) {
+      auto& level_files = vstorage->LevelFiles(level);
+      for (size_t i = 1; i < level_files.size(); i++) {
+        auto f1 = level_files[i - 1];
+        auto f2 = level_files[i];
+        if (level == 0) {
+          assert(level_zero_cmp_(f1, f2));
+          assert(f1->largest_seqno > f2->largest_seqno);
+        } else {
+          assert(level_nonzero_cmp_(f1, f2));
+
+          // Make sure there is no overlap in levels > 0
+          if (vstorage->InternalComparator()->Compare(f1->largest,
+                                                      f2->smallest) >= 0) {
+            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+                    (f1->largest).DebugString().c_str(),
+                    (f2->smallest).DebugString().c_str());
+            abort();
+          }
+        }
+      }
+    }
+#endif
+  }
+
+  void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+                                  int level) {
+#ifndef NDEBUG
+    // a file to be deleted better exist in the previous version
+    bool found = false;
+    for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) {
+      const std::vector<FileMetaData*>& base_files =
+          base_vstorage_->LevelFiles(l);
+      for (unsigned int i = 0; i < base_files.size(); i++) {
+        FileMetaData* f = base_files[i];
+        if (f->fd.GetNumber() == number) {
+          found = true;
+          break;
+        }
+      }
+    }
+    // if the file did not exist in the previous version, then it
+    // is possibly moved from lower level to higher level in current
+    // version
+    for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) {
+      auto& level_added = levels_[l].added_files;
+      auto got = level_added.find(number);
+      if (got != level_added.end()) {
+        found = true;
+        break;
+      }
+    }
+
+    // maybe this file was added in a previous edit that was Applied
+    if (!found) {
+      auto& level_added = levels_[level].added_files;
+      auto got = level_added.find(number);
+      if (got != level_added.end()) {
+        found = true;
+      }
+    }
+    if (!found) {
+      fprintf(stderr, "not found %" PRIu64 "\n", number);
+    }
+    assert(found);
+#endif
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    CheckConsistency(base_vstorage_);
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles();
+    for (const auto& del_file : del) {
+      const auto level = del_file.first;
+      const auto number = del_file.second;
+      levels_[level].deleted_files.insert(number);
+      CheckConsistencyForDeletes(edit, number, level);
+
+      auto exising = levels_[level].added_files.find(number);
+      if (exising != levels_[level].added_files.end()) {
+        UnrefFile(exising->second);
+        levels_[level].added_files.erase(number);
+      }
+    }
+
+    // Add new files
+    for (const auto& new_file : edit->GetNewFiles()) {
+      const int level = new_file.first;
+      FileMetaData* f = new FileMetaData(new_file.second);
+      f->refs = 1;
+
+      assert(levels_[level].added_files.find(f->fd.GetNumber()) ==
+             levels_[level].added_files.end());
+      levels_[level].deleted_files.erase(f->fd.GetNumber());
+      levels_[level].added_files[f->fd.GetNumber()] = f;
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(VersionStorageInfo* vstorage) {
+    CheckConsistency(base_vstorage_);
+    CheckConsistency(vstorage);
+
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_;
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files.  Store the result in *v.
+      const auto& base_files = base_vstorage_->LevelFiles(level);
+      auto base_iter = base_files.begin();
+      auto base_end = base_files.end();
+      const auto& unordered_added_files = levels_[level].added_files;
+      vstorage->Reserve(level,
+                        base_files.size() + unordered_added_files.size());
+
+      // Sort added files for the level.
+      std::vector<FileMetaData*> added_files;
+      added_files.reserve(unordered_added_files.size());
+      for (const auto& pair : unordered_added_files) {
+        added_files.push_back(pair.second);
+      }
+      std::sort(added_files.begin(), added_files.end(), cmp);
+
+#ifndef NDEBUG
+      FileMetaData* prev_file = nullptr;
+#endif
+
+      for (const auto& added : added_files) {
+#ifndef NDEBUG
+        if (level > 0 && prev_file != nullptr) {
+          assert(base_vstorage_->InternalComparator()->Compare(
+                     prev_file->smallest, added->smallest) <= 0);
+        }
+        prev_file = added;
+#endif
+
+        // Add all smaller files listed in base_
+        for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp);
+             base_iter != bpos; ++base_iter) {
+          MaybeAddFile(vstorage, level, *base_iter);
+        }
+
+        MaybeAddFile(vstorage, level, added);
+      }
+
+      // Add remaining base files
+      for (; base_iter != base_end; ++base_iter) {
+        MaybeAddFile(vstorage, level, *base_iter);
+      }
+    }
+
+    CheckConsistency(vstorage);
+  }
+
+  void LoadTableHandlers() {
+    assert(table_cache_ != nullptr);
+    for (int level = 0; level < base_vstorage_->num_levels(); level++) {
+      for (auto& file_meta_pair : levels_[level].added_files) {
+        auto* file_meta = file_meta_pair.second;
+        assert(!file_meta->table_reader_handle);
+        table_cache_->FindTable(
+            env_options_, *(base_vstorage_->InternalComparator()),
+            file_meta->fd, &file_meta->table_reader_handle, false);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+              file_meta->table_reader_handle);
+        }
+      }
+    }
+  }
+
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
+      // File is deleted: do nothing
+    } else {
+      vstorage->AddFile(level, f);
+    }
+  }
+};
+
+VersionBuilder::VersionBuilder(const EnvOptions& env_options,
+                               TableCache* table_cache,
+                               VersionStorageInfo* base_vstorage)
+    : rep_(new Rep(env_options, table_cache, base_vstorage)) {}
+VersionBuilder::~VersionBuilder() { delete rep_; }
+void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
+  rep_->CheckConsistency(vstorage);
+}
+void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit,
+                                                uint64_t number, int level) {
+  rep_->CheckConsistencyForDeletes(edit, number, level);
+}
+void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); }
+void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
+  rep_->SaveTo(vstorage);
+}
+void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); }
+void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
+                                  FileMetaData* f) {
+  rep_->MaybeAddFile(vstorage, level, f);
+}
+
+}  // namespace rocksdb
--- a/db/version_builder.h
+++ b/db/version_builder.h
@ -0,0 +1,42 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionBuilder {
+ public:
+  VersionBuilder(const EnvOptions& env_options, TableCache* table_cache,
+                 VersionStorageInfo* base_vstorage);
+  ~VersionBuilder();
+  void CheckConsistency(VersionStorageInfo* vstorage);
+  void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,
+                                  int level);
+  void Apply(VersionEdit* edit);
+  void SaveTo(VersionStorageInfo* vstorage);
+  void LoadTableHandlers();
+  void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
+
+ private:
+  class Rep;
+  Rep* rep_;
+};
+
+extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b);
+}  // namespace rocksdb
--- a/db/version_builder_test.cc
+++ b/db/version_builder_test.cc
@ -0,0 +1,229 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <string>
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class VersionBuilderTest {
+ public:
+  const Comparator* ucmp_;
+  InternalKeyComparator icmp_;
+  Options options_;
+  ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;
+  VersionStorageInfo vstorage_;
+  uint32_t file_num_;
+  CompactionOptionsFIFO fifo_options_;
+  std::vector<uint64_t> size_being_compacted_;
+
+  VersionBuilderTest()
+      : ucmp_(BytewiseComparator()),
+        icmp_(ucmp_),
+        ioptions_(options_),
+        mutable_cf_options_(options_, ioptions_),
+        vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+                 nullptr),
+        file_num_(1) {
+    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+    size_being_compacted_.resize(options_.num_levels);
+  }
+
+  ~VersionBuilderTest() {
+    for (int i = 0; i < vstorage_.num_levels(); i++) {
+      for (auto* f : vstorage_.LevelFiles(i)) {
+        if (--f->refs == 0) {
+          delete f;
+        }
+      }
+    }
+  }
+
+  InternalKey GetInternalKey(const char* ukey,
+                             SequenceNumber smallest_seq = 100) {
+    return InternalKey(ukey, smallest_seq, kTypeValue);
+  }
+
+  void Add(int level, uint32_t file_number, const char* smallest,
+           const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100,
+           uint64_t num_entries = 0, uint64_t num_deletions = 0,
+           bool sampled = false) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, path_id, file_size);
+    f->smallest = GetInternalKey(smallest, smallest_seq);
+    f->largest = GetInternalKey(largest, largest_seq);
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    f->num_entries = num_entries;
+    f->num_deletions = num_deletions;
+    vstorage_.AddFile(level, f);
+    if (sampled) {
+      f->init_stats_from_file = true;
+      vstorage_.UpdateAccumulatedStats(f);
+    }
+  }
+
+  void UpdateVersionStorageInfo() {
+    vstorage_.UpdateFilesBySize();
+    vstorage_.UpdateNumNonEmptyLevels();
+    vstorage_.GenerateFileIndexer();
+    vstorage_.GenerateLevelFilesBrief();
+    vstorage_.SetFinalized();
+  }
+};
+
+TEST(VersionBuilderTest, ApplyAndSaveTo) {
+  Add(0, 1U, "150", "200", 100U);
+  // Level 1 score 1.2
+  Add(1, 66U, "150", "200", 100U);
+  Add(1, 88U, "201", "300", 100U);
+  // Level 2 score 1.8. File 7 is the largest. Should be picked
+  Add(2, 6U, "150", "179", 100U);
+  Add(2, 7U, "180", "220", 100U);
+  Add(2, 8U, "221", "300", 100U);
+  // Level 3 score slightly larger than 1
+  Add(3, 26U, "150", "170", 100U);
+  Add(3, 27U, "171", "179", 100U);
+  Add(3, 28U, "191", "220", 100U);
+  Add(3, 29U, "221", "300", 100U);
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.DeleteFile(3, 27U);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+  for (int i = 0; i < new_vstorage.num_levels(); i++) {
+    for (auto* f : new_vstorage.LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200);
+
+  EnvOptions env_options;
+
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+  version_builder.Apply(&version_edit);
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+  for (int i = 0; i < new_vstorage.num_levels(); i++) {
+    for (auto* f : new_vstorage.LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+  UpdateVersionStorageInfo();
+
+  EnvOptions env_options;
+  VersionBuilder version_builder(env_options, nullptr, &vstorage_);
+  VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+                                  kCompactionStyleLevel, nullptr);
+
+  VersionEdit version_edit;
+  version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
+                       GetInternalKey("350"), 200, 200);
+  version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
+                       GetInternalKey("450"), 200, 200);
+  version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
+                       GetInternalKey("650"), 200, 200);
+  version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
+                       GetInternalKey("550"), 200, 200);
+  version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
+                       GetInternalKey("750"), 200, 200);
+  version_builder.Apply(&version_edit);
+
+  VersionEdit version_edit2;
+  version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
+                       GetInternalKey("950"), 200, 200);
+  version_edit2.DeleteFile(2, 616);
+  version_edit2.DeleteFile(2, 636);
+  version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
+                       GetInternalKey("850"), 200, 200);
+  version_builder.Apply(&version_edit2);
+
+  version_builder.SaveTo(&new_vstorage);
+
+  ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+  for (int i = 0; i < new_vstorage.num_levels(); i++) {
+    for (auto* f : new_vstorage.LevelFiles(i)) {
+      if (--f->refs == 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+TEST(VersionBuilderTest, EstimatedActiveKeys) {
+  const uint32_t kTotalSamples = 20;
+  const uint32_t kNumLevels = 5;
+  const uint32_t kFilesPerLevel = 8;
+  const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+  const uint32_t kEntriesPerFile = 1000;
+  const uint32_t kDeletionsPerFile = 100;
+  for (uint32_t i = 0; i < kNumFiles; ++i) {
+    Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+        ToString((i + 100) * 1000).c_str(),
+        ToString((i + 100) * 1000 + 999).c_str(),
+        100U,  0, 100, 100,
+        kEntriesPerFile, kDeletionsPerFile,
+        (i < kTotalSamples));
+  }
+  // minus 2X for the number of deletion entries because:
+  // 1x for deletion entry does not count as a data entry.
+  // 1x for each deletion entry will actually remove one data entry.
+  ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+            (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@ -64,7 +64,7 @@ void VersionEdit::Clear() {
  column_family_name_.clear();
 }

-void VersionEdit::EncodeTo(std::string* dst) const {
+bool VersionEdit::EncodeTo(std::string* dst) const {
  if (has_comparator_) {
    PutVarint32(dst, kComparator);
    PutLengthPrefixedSlice(dst, comparator_);
@ -98,6 +98,9 @@ void VersionEdit::EncodeTo(std::string* dst) const {

  for (size_t i = 0; i < new_files_.size(); i++) {
    const FileMetaData& f = new_files_[i].second;
+    if (!f.smallest.Valid() || !f.largest.Valid()) {
+      return false;
+    }
    if (f.fd.GetPathId() == 0) {
      // Use older format to make sure user can roll back the build if they
      // don't config multiple DB paths.
@ -131,6 +134,7 @@ void VersionEdit::EncodeTo(std::string* dst) const {
  if (is_column_family_drop_) {
    PutVarint32(dst, kColumnFamilyDrop);
  }
+  return true;
 }

 static bool GetInternalKey(Slice* input, InternalKey* dst) {
@ -164,7 +168,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) {

  // Temporary storage for parsing
  int level;
-  uint64_t number;
  FileMetaData f;
  Slice str;
  InternalKey key;
@ -233,9 +236,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
        }
        break;

-      case kDeletedFile:
-        if (GetLevel(&input, &level, &msg) &&
-            GetVarint64(&input, &number)) {
+      case kDeletedFile: {
+        uint64_t number;
+        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
          deleted_files_.insert(std::make_pair(level, number));
        } else {
          if (!msg) {
@ -243,6 +246,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
          }
        }
        break;
+      }

      case kNewFile: {
        uint64_t number;
@ -293,7 +297,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
-            msg = "new-file2 entry";
+            msg = "new-file3 entry";
          }
        }
        break;
--- a/db/version_edit.h
+++ b/db/version_edit.h
@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #pragma once
+#include <algorithm>
 #include <set>
 #include <utility>
 #include <vector>
@ -38,10 +39,10 @@ struct FileDescriptor {

  FileDescriptor() : FileDescriptor(0, 0, 0) {}

-  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t file_size)
+  FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
      : table_reader(nullptr),
        packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
-        file_size(file_size) {}
+        file_size(_file_size) {}

  FileDescriptor& operator=(const FileDescriptor& fd) {
    table_reader = fd.table_reader;
@ -74,9 +75,11 @@ struct FileMetaData {
  // Stats for compensating deletion entries during compaction

  // File size compensated by deletion entry.
-  // This is updated in Version::UpdateTemporaryStats() first time when the
-  // file is created or loaded.  After it is updated, it is immutable.
+  // This is updated in Version::UpdateAccumulatedStats() first time when the
+  // file is created or loaded.  After it is updated (!= 0), it is immutable.
  uint64_t compensated_file_size;
+  // These values can mutate, but they can only be read or written from
+  // single-threaded LogAndApply thread
  uint64_t num_entries;            // the number of entries.
  uint64_t num_deletions;          // the number of deletion entries.
  uint64_t raw_key_size;           // total uncompressed key size.
@ -109,20 +112,16 @@ struct FdWithKeyRange {
        largest_key() {
  }

-  FdWithKeyRange(FileDescriptor fd,
-      Slice smallest_key, Slice largest_key)
-      : fd(fd),
-        smallest_key(smallest_key),
-        largest_key(largest_key) {
-  }
+  FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
+      : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
 };

 // Data structure to store an array of FdWithKeyRange in one level
 // Actual data is guaranteed to be stored closely
-struct FileLevel {
+struct LevelFilesBrief {
  size_t num_files;
  FdWithKeyRange* files;
-  FileLevel() {
+  LevelFilesBrief() {
    num_files = 0;
    files = nullptr;
  }
@ -163,13 +162,13 @@ class VersionEdit {
  // Add the specified file at the specified number.
  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
-  void AddFile(int level, uint64_t file, uint64_t file_size,
-               uint64_t file_path_id, const InternalKey& smallest,
+  void AddFile(int level, uint64_t file, uint32_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
               const InternalKey& largest, const SequenceNumber& smallest_seqno,
               const SequenceNumber& largest_seqno) {
    assert(smallest_seqno <= largest_seqno);
    FileMetaData f;
-    f.fd = FileDescriptor(file, file_size, file_path_id);
+    f.fd = FileDescriptor(file, file_path_id, file_size);
    f.smallest = smallest;
    f.largest = largest;
    f.smallest_seqno = smallest_seqno;
@ -183,9 +182,7 @@ class VersionEdit {
  }

  // Number of edits
-  int NumEntries() {
-    return new_files_.size() + deleted_files_.size();
-  }
+  size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }

  bool IsColumnFamilyManipulation() {
    return is_column_family_add_ || is_column_family_drop_;
@ -212,17 +209,23 @@ class VersionEdit {
    is_column_family_drop_ = true;
  }

-  void EncodeTo(std::string* dst) const;
+  // return true on success.
+  bool EncodeTo(std::string* dst) const;
  Status DecodeFrom(const Slice& src);

+  typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
+
+  const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
+  const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
+    return new_files_;
+  }
+
  std::string DebugString(bool hex_key = false) const;

 private:
  friend class VersionSet;
  friend class Version;

-  typedef std::set< std::pair<int, uint64_t>> DeletedFileSet;
-
  bool GetLevel(Slice* input, int* level, const char** msg);

  int max_level_;
--- a/Show More
+++ b/Show More