Merge branch 'master' of github.com:facebook/rocksdb into HEAD

main
Yueh-Hsuan Chiang 11 years ago
commit c65448f95a
  1. 6
      .gitignore
  2. 13
      HISTORY.md
  3. 3
      INSTALL.md
  4. 34
      Makefile
  5. 9
      build_tools/build_detect_platform
  6. 82
      db/c.cc
  7. 1
      db/c_test.c
  8. 489
      db/column_family.cc
  9. 408
      db/column_family.h
  10. 857
      db/column_family_test.cc
  11. 32
      db/compaction.cc
  12. 8
      db/compaction.h
  13. 16
      db/compaction_picker.cc
  14. 2
      db/compaction_picker.h
  15. 633
      db/db_bench.cc
  16. 74
      db/db_filesnapshot.cc
  17. 1775
      db/db_impl.cc
  18. 287
      db/db_impl.h
  19. 51
      db/db_impl_readonly.cc
  20. 116
      db/db_impl_readonly.h
  21. 91
      db/db_iter.cc
  22. 2
      db/db_stats_logger.cc
  23. 2385
      db/db_test.cc
  24. 4
      db/dbformat.cc
  25. 73
      db/dbformat.h
  26. 18
      db/internal_stats.cc
  27. 4
      db/internal_stats.h
  28. 22
      db/memtable.cc
  29. 16
      db/memtable.h
  30. 22
      db/memtable_list.cc
  31. 16
      db/memtable_list.h
  32. 42
      db/plain_table_db_test.cc
  33. 15
      db/repair.cc
  34. 17
      db/table_cache.cc
  35. 6
      db/table_cache.h
  36. 26
      db/tailing_iter.cc
  37. 5
      db/tailing_iter.h
  38. 2
      db/transaction_log_impl.cc
  39. 4
      db/transaction_log_impl.h
  40. 75
      db/version_edit.cc
  41. 57
      db/version_edit.h
  42. 13
      db/version_edit_test.cc
  43. 1027
      db/version_set.cc
  44. 131
      db/version_set.h
  45. 283
      db/write_batch.cc
  46. 63
      db/write_batch_internal.h
  47. 86
      db/write_batch_test.cc
  48. 18
      include/rocksdb/c.h
  49. 176
      include/rocksdb/db.h
  50. 4
      include/rocksdb/env.h
  51. 16
      include/rocksdb/memtablerep.h
  52. 501
      include/rocksdb/options.h
  53. 4
      include/rocksdb/perf_context.h
  54. 60
      include/rocksdb/write_batch.h
  55. 114
      include/utilities/stackable_db.h
  56. 4
      java/rocksjni/write_batch.cc
  57. 7
      port/port_example.h
  58. 4
      port/port_posix.h
  59. 6
      table/block_based_table_reader.cc
  60. 3
      table/filter_block.h
  61. 13
      table/format.cc
  62. 36
      table/merger.cc
  63. 3
      table/merger.h
  64. 9
      table/plain_table_reader.cc
  65. 3
      table/table_test.cc
  66. 71
      tools/auto_sanity_test.sh
  67. 1
      tools/db_crashtest.py
  68. 1
      tools/db_crashtest2.py
  69. 594
      tools/db_stress.cc
  70. 2
      util/auto_roll_logger.cc
  71. 2
      util/auto_roll_logger.h
  72. 2
      util/auto_roll_logger_test.cc
  73. 30
      util/crc32c.cc
  74. 26
      util/dynamic_bloom_test.cc
  75. 6
      util/env.cc
  76. 48
      util/hash_linklist_rep.cc
  77. 8
      util/hash_skiplist_rep.cc
  78. 72
      util/ldb_cmd.cc
  79. 17
      util/ldb_cmd.h
  80. 1
      util/ldb_tool.cc
  81. 323
      util/options.cc
  82. 20
      util/perf_context.cc
  83. 84
      util/perf_context_imp.h
  84. 6
      util/skiplistrep.cc
  85. 62
      util/sync_point.cc
  86. 79
      util/sync_point.h
  87. 1
      util/thread_local.h
  88. 8
      util/vectorrep.cc
  89. 32
      utilities/backupable/backupable_db_test.cc
  90. 6
      utilities/geodb/geodb_test.cc
  91. 34
      utilities/ttl/db_ttl.cc
  92. 27
      utilities/ttl/db_ttl.h

6
.gitignore vendored

@ -13,6 +13,10 @@ build_config.mk
*_bench *_bench
*_stress *_stress
*.out *.out
*.class
*.jar
*.*jnilib*
*.d-e
ldb ldb
manifest_dump manifest_dump
@ -23,3 +27,5 @@ coverage/COVERAGE_REPORT
.gdbhistory .gdbhistory
.phutil_module_cache .phutil_module_cache
tags tags
java/*.log
java/include/org_rocksdb_*.h

@ -1,11 +1,15 @@
# Rocksdb Change Log # Rocksdb Change Log
## Unreleased ## Unreleased (will be released in 3.0)
* Column family support
### Public API changes ### Public API changes
## 2.8.0 (04/04/2014)
* Removed arena.h from public header files. * Removed arena.h from public header files.
* By default, checksums are verified on every read from database * By default, checksums are verified on every read from database
* Change default value of several options, including: paranoid_checks=true, max_open_files=5000, level0_slowdown_writes_trigger=20, level0_stop_writes_trigger=24, disable_seek_compaction=true, max_background_flushes=1 and allow_mmap_writes=false
* Added is_manual_compaction to CompactionFilter::Context * Added is_manual_compaction to CompactionFilter::Context
* Added "virtual void WaitForJoin()" in class Env. Default operation is no-op. * Added "virtual void WaitForJoin()" in class Env. Default operation is no-op.
* Removed BackupEngine::DeleteBackupsNewerThan() function * Removed BackupEngine::DeleteBackupsNewerThan() function
@ -15,11 +19,18 @@
* Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools * Added Env::GetThreadPoolQueueLen(), which returns the waiting queue length of thread pools
* Added a command "checkconsistency" in ldb tool, which checks * Added a command "checkconsistency" in ldb tool, which checks
if file system state matches DB state (file existence and file sizes) if file system state matches DB state (file existence and file sizes)
* Separate options related to block based table to a new struct BlockBasedTableOptions
* WriteBatch has a new function Count() to return total size in the batch, and Data() now returns a reference instead of a copy
* Add more counters to perf context.
* Supports several more DB properties: compaction-pending, background-errors and cur-size-active-mem-table.
### New Features ### New Features
* If we find one truncated record at the end of the MANIFEST or WAL files, * If we find one truncated record at the end of the MANIFEST or WAL files,
we will ignore it. We assume that writers of these records were interrupted we will ignore it. We assume that writers of these records were interrupted
and that we can safely ignore it. and that we can safely ignore it.
* A new SST format "PlainTable" is added, which is optimized for memory-only workloads. It can be created through NewPlainTableFactory() or NewTotalOrderPlainTableFactory().
* A new mem table implementation hash linked list optimizing for the case that there are only few keys for each prefix, which can be created through NewHashLinkListRepFactory().
* Merge operator supports a new function PartialMergeMulti() to allow users to do partial merges against multiple operands.
* Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1.
* Geo-spatial support for locations and radial-search. * Geo-spatial support for locations and radial-search.

@ -67,6 +67,9 @@ libraries. You are on your own.
* Please note that some of the optimizations/features are disabled in OSX. * Please note that some of the optimizations/features are disabled in OSX.
We did not run any production workloads on it. We did not run any production workloads on it.
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`
## Compilation ## Compilation
`make clean; make` will compile librocksdb.a (RocksDB static library) and all `make clean; make` will compile librocksdb.a (RocksDB static library) and all
the unit tests. You can run all unit tests with `make check`. the unit tests. You can run all unit tests with `make check`.

@ -23,6 +23,14 @@ $(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platf
# this file is generated by the previous line to set build flags and sources # this file is generated by the previous line to set build flags and sources
include build_config.mk include build_config.mk
ifneq ($(PLATFORM), IOS)
CFLAGS += -g
CXXFLAGS += -g
else
# no debug info for IOS, that will make our library big
OPT += -DNDEBUG
endif
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
ifdef COMPILE_WITH_ASAN ifdef COMPILE_WITH_ASAN
# ASAN compile flags # ASAN compile flags
@ -37,8 +45,8 @@ else
endif endif
WARNING_FLAGS = -Wall -Werror -Wno-sign-compare WARNING_FLAGS = -Wall -Werror -Wno-sign-compare
CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
LDFLAGS += $(PLATFORM_LDFLAGS) LDFLAGS += $(PLATFORM_LDFLAGS)
@ -57,6 +65,7 @@ TESTS = \
db_test \ db_test \
block_hash_index_test \ block_hash_index_test \
autovector_test \ autovector_test \
column_family_test \
table_properties_collector_test \ table_properties_collector_test \
arena_test \ arena_test \
auto_roll_logger_test \ auto_roll_logger_test \
@ -148,11 +157,15 @@ $(SHARED3):
endif # PLATFORM_SHARED_EXT endif # PLATFORM_SHARED_EXT
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test format shared_lib all \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
dbg dbg
all: $(LIBRARY) $(PROGRAMS) all: $(LIBRARY) $(PROGRAMS)
static_lib: $(LIBRARY)
shared_lib: $(SHARED)
dbg: $(LIBRARY) $(PROGRAMS) dbg: $(LIBRARY) $(PROGRAMS)
# Will also generate shared libraries. # Will also generate shared libraries.
@ -218,8 +231,6 @@ tags:
format: format:
build_tools/format-diff.sh build_tools/format-diff.sh
shared_lib: $(SHARED)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Unit tests and tools # Unit tests and tools
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -260,6 +271,9 @@ arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -435,20 +449,20 @@ ifeq ($(PLATFORM), IOS)
PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString) IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
.cc.o: .cc.o:
mkdir -p ios-x86/$(dir $@) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS) $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@) mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS) xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@ lipo ios-x86/$@ ios-arm/$@ -create -output $@
.c.o: .c.o:
mkdir -p ios-x86/$(dir $@) mkdir -p ios-x86/$(dir $@)
$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
mkdir -p ios-arm/$(dir $@) mkdir -p ios-arm/$(dir $@)
$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
lipo ios-x86/$@ ios-arm/$@ -create -output $@ lipo ios-x86/$@ ios-arm/$@ -create -output $@
else else

@ -87,7 +87,7 @@ PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=false PLATFORM_SHARED_VERSIONED=false
# generic port files (working on all platform by #ifdef) go directly in /port # generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` GENERIC_PORT_FILES=`cd $ROCKSDB_ROOT; find port -name '*.cc' | tr "\n" " "`
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp # On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in case "$TARGET_OS" in
@ -98,6 +98,13 @@ case "$TARGET_OS" in
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
# PORT_FILES=port/darwin/darwin_specific.cc # PORT_FILES=port/darwin/darwin_specific.cc
;; ;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
;;
Linux) Linux)
PLATFORM=OS_LINUX PLATFORM=OS_LINUX
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"

@ -25,12 +25,14 @@
#include "rocksdb/universal_compaction.h" #include "rocksdb/universal_compaction.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
using rocksdb::Cache; using rocksdb::Cache;
using rocksdb::Comparator; using rocksdb::Comparator;
using rocksdb::CompressionType; using rocksdb::CompressionType;
using rocksdb::DB; using rocksdb::DB;
using rocksdb::Env; using rocksdb::Env;
using rocksdb::InfoLogLevel;
using rocksdb::FileLock; using rocksdb::FileLock;
using rocksdb::FilterPolicy; using rocksdb::FilterPolicy;
using rocksdb::FlushOptions; using rocksdb::FlushOptions;
@ -656,6 +658,11 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
} }
} }
void rocksdb_options_set_info_log_level(
rocksdb_options_t* opt, int v) {
opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
}
void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s; opt->rep.write_buffer_size = s;
} }
@ -714,6 +721,14 @@ void rocksdb_options_set_max_grandparent_overlap_factor(
opt->rep.max_grandparent_overlap_factor = n; opt->rep.max_grandparent_overlap_factor = n;
} }
void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t* opt, int* level_values, size_t num_levels) {
opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
for (size_t i = 0; i < num_levels; ++i) {
opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
}
}
void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
opt->rep.statistics = rocksdb::CreateDBStatistics(); opt->rep.statistics = rocksdb::CreateDBStatistics();
} }
@ -857,6 +872,24 @@ void rocksdb_options_set_advise_random_on_open(
opt->rep.advise_random_on_open = v; opt->rep.advise_random_on_open = v;
} }
void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t* opt, int v) {
switch(v) {
case 0:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::NONE;
break;
case 1:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::NORMAL;
break;
case 2:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::SEQUENTIAL;
break;
case 3:
opt->rep.access_hint_on_compaction_start = rocksdb::Options::WILLNEED;
break;
}
}
void rocksdb_options_set_use_adaptive_mutex( void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.use_adaptive_mutex = v; opt->rep.use_adaptive_mutex = v;
@ -867,6 +900,11 @@ void rocksdb_options_set_bytes_per_sync(
opt->rep.bytes_per_sync = v; opt->rep.bytes_per_sync = v;
} }
void rocksdb_options_set_verify_checksums_in_compaction(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.verify_checksums_in_compaction = v;
}
void rocksdb_options_set_filter_deletes( void rocksdb_options_set_filter_deletes(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.filter_deletes = v; opt->rep.filter_deletes = v;
@ -1003,11 +1041,48 @@ void rocksdb_options_set_hash_link_list_rep(
opt->rep.memtable_factory.reset(factory); opt->rep.memtable_factory.reset(factory);
} }
void rocksdb_options_set_plain_table_factory(
rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness) {
static rocksdb::TableFactory* factory = 0;
if (!factory) {
factory = rocksdb::NewPlainTableFactory(
user_key_len, bloom_bits_per_key,
hash_table_ratio, index_sparseness);
}
opt->rep.table_factory.reset(factory);
}
void rocksdb_options_set_max_successive_merges( void rocksdb_options_set_max_successive_merges(
rocksdb_options_t* opt, size_t v) { rocksdb_options_t* opt, size_t v) {
opt->rep.max_successive_merges = v; opt->rep.max_successive_merges = v;
} }
void rocksdb_options_set_min_partial_merge_operands(
rocksdb_options_t* opt, uint32_t v) {
opt->rep.min_partial_merge_operands = v;
}
void rocksdb_options_set_bloom_locality(
rocksdb_options_t* opt, uint32_t v) {
opt->rep.bloom_locality = v;
}
void rocksdb_options_set_allow_thread_local(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.allow_thread_local = v;
}
void rocksdb_options_set_inplace_update_support(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.inplace_update_support = v;
}
void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t* opt, size_t v) {
opt->rep.inplace_update_num_locks = v;
}
void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) {
opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style); opt->rep.compaction_style = static_cast<rocksdb::CompactionStyle>(style);
} }
@ -1022,21 +1097,14 @@ DB::OpenForReadOnly
DB::MultiGet DB::MultiGet
DB::KeyMayExist DB::KeyMayExist
DB::GetOptions DB::GetOptions
DB::GetLiveFiles
DB::GetSortedWalFiles DB::GetSortedWalFiles
DB::GetLatestSequenceNumber DB::GetLatestSequenceNumber
DB::GetUpdatesSince DB::GetUpdatesSince
DB::DeleteFile
DB::GetDbIdentity DB::GetDbIdentity
DB::RunManualCompaction DB::RunManualCompaction
custom cache custom cache
compaction_filter compaction_filter
max_bytes_for_level_multiplier_additional
access_hint_on_compaction_start
table_factory
table_properties_collectors table_properties_collectors
inplace_update_support
inplace_update_num_locks
*/ */
rocksdb_comparator_t* rocksdb_comparator_create( rocksdb_comparator_t* rocksdb_comparator_create(

@ -443,6 +443,7 @@ int main(int argc, char** argv) {
rocksdb_options_set_filter_policy(options, policy); rocksdb_options_set_filter_policy(options, policy);
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4); rocksdb_options_set_hash_skip_list_rep(options, 50000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
db = rocksdb_open(options, dbname, &err); db = rocksdb_open(options, dbname, &err);
CheckNoError(err); CheckNoError(err);

@ -0,0 +1,489 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/column_family.h"
#include <vector>
#include <string>
#include <algorithm>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "db/internal_stats.h"
#include "db/compaction_picker.h"
#include "db/table_properties_collector.h"
#include "util/autovector.h"
#include "util/hash_skiplist_rep.h"
namespace rocksdb {
// Wraps a ColumnFamilyData for external callers; the handle holds one
// reference on the column family for as long as it is alive.
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
                                               DBImpl* db, port::Mutex* mutex)
    : cfd_(cfd), db_(db), mutex_(mutex) {
  // cfd may be nullptr (e.g. a handle that was never bound).
  if (cfd_) {
    cfd_->Ref();
  }
}
// Drops the handle's reference. If that was the last reference, the
// ColumnFamilyData is deleted and any files that became obsolete as a
// result are garbage-collected.
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
    DBImpl::DeletionState deletion_state;
    // Unref/delete and the obsolete-file scan require the DB mutex.
    mutex_->Lock();
    if (cfd_->Unref()) {
      delete cfd_;
    }
    db_->FindObsoleteFiles(deletion_state, false, true);
    mutex_->Unlock();
    // Actual file deletion does I/O, so it happens outside the mutex.
    if (deletion_state.HaveSomethingToDelete()) {
      db_->PurgeObsoleteFiles(deletion_state);
    }
  }
}
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
namespace {
// Fix user-supplied options to be reasonable.
// Clamps *ptr into the inclusive range [minvalue, maxvalue], comparing in
// the domain of V to avoid surprises when T and V differ in width/signedness.
// (`static` removed: functions in an anonymous namespace already have
// internal linkage.)
template <class T, class V>
void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
}
}  // anonymous namespace
// Returns a copy of `src` with user-supplied values adjusted to safe,
// mutually consistent settings, and with the comparator/filter policy
// replaced by their internal-key wrappers.
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                    const InternalFilterPolicy* ipolicy,
                                    const ColumnFamilyOptions& src) {
  ColumnFamilyOptions result = src;
  result.comparator = icmp;
  // Only wrap the filter policy if the user actually supplied one.
  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
  ClipToRange(&result.write_buffer_size,
              ((size_t)64) << 10, ((size_t)64) << 30);
  // if user sets arena_block_size, we trust user to use this value. Otherwise,
  // calculate a proper value from writer_buffer_size;
  if (result.arena_block_size <= 0) {
    result.arena_block_size = result.write_buffer_size / 10;
  }
  // Can never merge more memtables than exist before the flush limit.
  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);
  if (result.block_cache == nullptr && !result.no_block_cache) {
    // Default block cache: 8 MB LRU.
    result.block_cache = NewLRUCache(8 << 20);
  }
  result.compression_per_level = src.compression_per_level;
  // block_size_deviation is a percentage; out-of-range values disable it.
  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
    result.block_size_deviation = 0;
  }
  if (result.max_mem_compaction_level >= result.num_levels) {
    result.max_mem_compaction_level = result.num_levels - 1;
  }
  if (result.soft_rate_limit > result.hard_rate_limit) {
    result.soft_rate_limit = result.hard_rate_limit;
  }
  // Hash-based memtables require a prefix extractor; without one, fall back
  // to the skip-list memtable.
  if (!result.prefix_extractor) {
    assert(result.memtable_factory);
    Slice name = result.memtable_factory->Name();
    if (name.compare("HashSkipListRepFactory") == 0 ||
        name.compare("HashLinkListRepFactory") == 0) {
      result.memtable_factory = std::make_shared<SkipListFactory>();
    }
  }

  // -- Sanitize the table properties collector
  // All user defined properties collectors will be wrapped by
  // UserKeyTablePropertiesCollector since for them they only have the
  // knowledge of the user keys; internal keys are invisible to them.
  auto& collectors = result.table_properties_collectors;
  for (size_t i = 0; i < result.table_properties_collectors.size(); ++i) {
    assert(collectors[i]);
    collectors[i] =
        std::make_shared<UserKeyTablePropertiesCollector>(collectors[i]);
  }
  // Add collector to collect internal key statistics
  collectors.push_back(std::make_shared<InternalKeyPropertiesCollector>());

  return result;
}
// Sentinel values stored in the per-thread SuperVersion cache slots:
// kSVInUse marks a slot whose cached SuperVersion a thread is currently
// using (see ResetThreadLocalSuperVersions, which skips such slots);
// kSVObsolete (nullptr) marks an empty/invalidated slot. `dummy` exists
// only to give kSVInUse a unique, never-dereferenced address.
int SuperVersion::dummy = 0;
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
void* const SuperVersion::kSVObsolete = nullptr;
// Frees any memtables whose final reference was dropped while this
// SuperVersion was alive (queued by Cleanup()).
SuperVersion::~SuperVersion() {
  for (auto* dead_memtable : to_delete) {
    delete dead_memtable;
  }
}
// Bumps the reference count; returns `this` so callers can chain.
SuperVersion* SuperVersion::Ref() {
  refs.fetch_add(1, std::memory_order_relaxed);
  return this;
}
// Drops one reference. Returns true iff this call removed the last
// reference, i.e. the caller is now responsible for Cleanup()/delete.
bool SuperVersion::Unref() {
  // fetch_sub returns the count *before* the decrement.
  const uint32_t before = refs.fetch_sub(1, std::memory_order_relaxed);
  assert(before > 0);
  return before == 1;
}
// Releases the references this SuperVersion holds on its memtable,
// immutable memtable list, and current Version. Must only be called once
// the refcount has reached zero. Memtables whose last reference was
// dropped here are queued in to_delete and freed by ~SuperVersion, so the
// actual deletion can happen outside the DB mutex.
void SuperVersion::Cleanup() {
  assert(refs.load(std::memory_order_relaxed) == 0);
  imm->Unref(&to_delete);
  MemTable* m = mem->Unref();
  if (m != nullptr) {
    // We held the last reference on the active memtable.
    to_delete.push_back(m);
  }
  current->Unref();
}
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current) {
mem = new_mem;
imm = new_imm;
current = new_current;
mem->Ref();
imm->Ref();
current->Ref();
refs.store(1, std::memory_order_relaxed);
}
namespace {
// Destructor hook for the per-thread cached SuperVersion pointer.
// UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
// destroyed. When the former happens, the thread shouldn't see kSVInUse.
// When the latter happens, we are in ~ColumnFamilyData(), no get should
// happen as well.
void SuperVersionUnrefHandle(void* ptr) {
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  if (sv->Unref()) {
    // Last reference dropped here: Cleanup() must run under the DB mutex.
    sv->db_mutex->Lock();
    sv->Cleanup();
    sv->db_mutex->Unlock();
    delete sv;
  }
}
}  // anonymous namespace
// Constructs the in-memory state for one column family. A nullptr
// dummy_versions marks the special dummy CFD that heads ColumnFamilySet's
// intrusive linked list; for that one, no stats/table cache/compaction
// picker are created.
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
                                   const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
                                   const ColumnFamilyOptions& options,
                                   const DBOptions* db_options,
                                   const EnvOptions& storage_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
      dummy_versions_(dummy_versions),
      current_(nullptr),
      refs_(0),
      dropped_(false),
      internal_comparator_(options.comparator),
      internal_filter_policy_(options.filter_policy),
      // NOTE: options_ is built from the internal wrappers initialized just
      // above, so the member order here matters.
      options_(*db_options, SanitizeOptions(&internal_comparator_,
                                            &internal_filter_policy_,
                                            options)),
      mem_(nullptr),
      imm_(options.min_write_buffer_number_to_merge),
      super_version_(nullptr),
      super_version_number_(0),
      local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  // The creator holds the first reference.
  Ref();

  // if dummy_versions is nullptr, then this is a dummy column family.
  if (dummy_versions != nullptr) {
    internal_stats_.reset(new InternalStats(options.num_levels, db_options->env,
                                            db_options->statistics.get()));
    table_cache_.reset(
        new TableCache(dbname, &options_, storage_options, table_cache));
    // Pick the compaction strategy from the sanitized options.
    if (options_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(&options_, &internal_comparator_));
    } else {
      compaction_picker_.reset(
          new LevelCompactionPicker(&options_, &internal_comparator_));
    }

    Log(options_.info_log, "Options for column family \"%s\":\n",
        name.c_str());
    const ColumnFamilyOptions* cf_options = &options_;
    cf_options->Dump(options_.info_log.get());
  }
}
// DB mutex held. Unlinks this column family from all bookkeeping
// structures and releases every resource it owns. Only legal once the
// last reference has been dropped.
ColumnFamilyData::~ColumnFamilyData() {
  assert(refs_ == 0);
  // remove from linked list
  auto prev = prev_;
  auto next = next_;
  prev->next_ = next;
  next->prev_ = prev;

  // it's nullptr for dummy CFD
  if (column_family_set_ != nullptr) {
    // remove from column_family_set
    column_family_set_->RemoveColumnFamily(this);
  }

  if (current_ != nullptr) {
    current_->Unref();
  }

  if (super_version_ != nullptr) {
    // Release SuperVersion reference kept in ThreadLocalPtr.
    // This must be done outside of mutex_ since unref handler can lock
    // mutex. (SuperVersionUnrefHandle acquires db_mutex, so we temporarily
    // drop it around the ThreadLocalPtr teardown.)
    super_version_->db_mutex->Unlock();
    local_sv_.reset();
    super_version_->db_mutex->Lock();

    bool is_last_reference __attribute__((unused));
    is_last_reference = super_version_->Unref();
    // After the thread-local caches are gone, ours must be the last ref.
    assert(is_last_reference);
    super_version_->Cleanup();
    delete super_version_;
    super_version_ = nullptr;
  }

  if (dummy_versions_ != nullptr) {
    // List must be empty
    assert(dummy_versions_->next_ == dummy_versions_);
    delete dummy_versions_;
  }

  if (mem_ != nullptr) {
    // Unref() returns the memtable if we held the last reference.
    delete mem_->Unref();
  }
  autovector<MemTable*> to_delete;
  imm_.current()->Unref(&to_delete);
  for (MemTable* m : to_delete) {
    delete m;
  }
}
// Installs `current` as the live Version and recomputes whether writes
// should be slowed down because of too many level-0 files.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;
  const int slowdown_trigger = options_.level0_slowdown_writes_trigger;
  need_slowdown_for_num_level0_files_ =
      slowdown_trigger >= 0 &&
      current_->NumLevelFiles(0) >= slowdown_trigger;
}
// Replaces the active memtable with a freshly allocated one, releasing
// this CFD's reference to the previous memtable.
void ColumnFamilyData::CreateNewMemtable() {
  assert(current_ != nullptr);
  if (mem_ != nullptr) {
    // Unref() returns the memtable when the last reference is dropped;
    // deleting nullptr otherwise is a no-op.
    delete mem_->Unref();
  }
  mem_ = new MemTable(internal_comparator_, options_);
  mem_->Ref();
}
// Asks the configured compaction picker to choose a compaction against
// the current version, logging its reasoning into `log_buffer`.
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  return compaction_picker_->PickCompaction(current_, log_buffer);
}
// Forwards a manual range compaction request for [begin, end] on
// `input_level` -> `output_level` to the compaction picker; the picker may
// narrow the range and reports where it stopped via *compaction_end.
Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  return compaction_picker_->CompactRange(current_, input_level, output_level,
                                          begin, end, compaction_end);
}
// Publishes `new_superversion` (initialized from the current memtable,
// immutable list, and version) as this CFD's super version, bumping the
// version number. Returns the old super version if this call dropped its
// last reference — the caller must delete it outside the DB mutex —
// otherwise returns nullptr.
SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
  new_superversion->db_mutex = db_mutex;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;
  if (old_superversion != nullptr && old_superversion->Unref()) {
    old_superversion->Cleanup();
    return old_superversion;  // will let caller delete outside of mutex
  }
  return nullptr;
}
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
autovector<void*> sv_ptrs;
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
for (auto ptr : sv_ptrs) {
assert(ptr);
if (ptr == SuperVersion::kSVInUse) {
continue;
}
auto sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
}
// Builds the container that tracks all column families of a DB. The dummy
// CFD heads the circular intrusive linked list of column families.
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
                                 const EnvOptions& storage_options,
                                 Cache* table_cache)
    : max_column_family_(0),
      // FIX: pass the constructor *parameter* `storage_options` here, not
      // the member `storage_options_`. Members initialize in declaration
      // order, so the member may not be initialized yet when the dummy
      // ColumnFamilyData is constructed.
      dummy_cfd_(new ColumnFamilyData(dbname, 0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
                                      storage_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
      storage_options_(storage_options),
      table_cache_(table_cache),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // initialize linked list: an empty circular list points at its head.
  dummy_cfd_->prev_ = dummy_cfd_;
  dummy_cfd_->next_ = dummy_cfd_;
}
// Tears down every remaining column family and then the dummy list head.
ColumnFamilySet::~ColumnFamilySet() {
  // Each ColumnFamilyData erases itself from column_family_data_ in its
  // destructor, so keep deleting the first entry until the map drains.
  while (!column_family_data_.empty()) {
    ColumnFamilyData* cfd = column_family_data_.begin()->second;
    cfd->Unref();
    delete cfd;
  }
  dummy_cfd_->Unref();
  delete dummy_cfd_;
}
// Returns the cached default (ID 0) column family; it must already exist.
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
  assert(default_cfd_cache_ != nullptr);
  return default_cfd_cache_;
}
// Looks up a column family by numeric ID; nullptr when unknown.
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
  const auto it = column_family_data_.find(id);
  return (it == column_family_data_.end()) ? nullptr : it->second;
}
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
    const {
  auto name_it = column_families_.find(name);
  if (name_it == column_families_.end()) {
    return nullptr;
  }
  // The name->ID and ID->data maps are mutated together under the lock, so a
  // name hit must also be an ID hit.
  ColumnFamilyData* cfd = GetColumnFamily(name_it->second);
  assert(cfd != nullptr);
  return cfd;
}
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
  // IDs are never reused: hand out one past the largest ID ever seen.
  max_column_family_ += 1;
  return max_column_family_;
}
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
  // Only ever moves forward; a smaller value is silently ignored.
  if (new_max_column_family > max_column_family_) {
    max_column_family_ = new_max_column_family;
  }
}
// under a DB mutex
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
    const std::string& name, uint32_t id, Version* dummy_versions,
    const ColumnFamilyOptions& options) {
  assert(column_families_.find(name) == column_families_.end());

  auto* new_cfd =
      new ColumnFamilyData(db_name_, id, name, dummy_versions, table_cache_,
                           options, db_options_, storage_options_, this);

  // The lookup maps may also be read outside of the DB mutex (see Seek()),
  // so guard the mutations with the spin lock.
  Lock();
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
  Unlock();

  if (id > max_column_family_) {
    max_column_family_ = id;
  }

  // Splice the new node in just before the dummy head, i.e. at the tail of
  // the circular doubly-linked list.
  auto* tail = dummy_cfd_->prev_;
  new_cfd->next_ = dummy_cfd_;
  new_cfd->prev_ = tail;
  tail->next_ = new_cfd;
  dummy_cfd_->prev_ = new_cfd;

  if (id == 0) {
    // keep the fast-path cache for the default column family up to date
    default_cfd_cache_ = new_cfd;
  }
  return new_cfd;
}
// Acquires the spin lock protecting column_families_/column_family_data_.
// Busy-waits; hold times are expected to be tiny (map insert/erase/lookup).
void ColumnFamilySet::Lock() {
  // spin lock
  while (spin_lock_.test_and_set(std::memory_order_acquire)) {
  }
}
void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); }
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_ == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}
// under a DB mutex
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
  auto data_it = column_family_data_.find(cfd->GetID());
  assert(data_it != column_family_data_.end());
  // Readers may consult the maps while holding only the spin lock, so the
  // erase must happen under it as well.
  Lock();
  column_family_data_.erase(data_it);
  column_families_.erase(cfd->GetName());
  Unlock();
}
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
  if (column_family_id != 0) {
    // maybe outside of db mutex -- take the spin lock while consulting the
    // column family map
    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
    column_family_set_->Unlock();
  } else {
    // fast path: the default column family is cached and thread-safe to read
    current_ = column_family_set_->GetDefault();
  }
  // current_ may be nullptr here; the handle still records it
  handle_.SetCFD(current_);
  return current_ != nullptr;
}
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->GetLogNumber();
}
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->mem();
}
const Options* ColumnFamilyMemTablesImpl::GetOptions() const {
  // REQUIRES: a successful Seek() call has selected a column family
  auto* cfd = current_;
  assert(cfd != nullptr);
  return cfd->options();
}
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  // REQUIRES: a successful Seek() call. Returns the internal (non-owning)
  // handle, which Seek() pointed at current_.
  assert(current_ != nullptr);
  return &handle_;
}
} // namespace rocksdb

@ -0,0 +1,408 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <unordered_map>
#include <string>
#include <vector>
#include <atomic>
#include "rocksdb/options.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/table_cache.h"
#include "util/thread_local.h"
namespace rocksdb {
class Version;
class VersionSet;
class MemTable;
class MemTableListVersion;
class CompactionPicker;
class Compaction;
class InternalKey;
class InternalStats;
class ColumnFamilyData;
class DBImpl;
class LogBuffer;
// ColumnFamilyHandleImpl is the class that clients use to access different
// column families. It has non-trivial destructor, which gets called when client
// is done using the column family
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
 public:
  // create while holding the mutex
  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex);
  // destroy without mutex
  virtual ~ColumnFamilyHandleImpl();
  // Underlying column family metadata (non-owning accessor).
  virtual ColumnFamilyData* cfd() const { return cfd_; }
  // ID of the column family this handle refers to.
  virtual uint32_t GetID() const;

 private:
  ColumnFamilyData* cfd_;  // column family this handle refers to
  DBImpl* db_;  // owning DB; NOTE(review): presumably used by the destructor
                // to release the reference -- confirm in column_family.cc
  port::Mutex* mutex_;  // DB mutex (ctor is called with it held, see above)
};
// Does not ref-count ColumnFamilyData
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
// calls DBImpl methods. When this happens, MemTableInserter need access to
// ColumnFamilyHandle (same as the client would need). In that case, we feed
// MemTableInserter dummy ColumnFamilyHandle and enable it to call DBImpl
// methods
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
 public:
  // Base is constructed with all-null members; this handle never owns
  // anything and has no destructor side effects.
  ColumnFamilyHandleInternal()
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {}

  // Repoints the handle; caller keeps ownership of cfd.
  void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; }
  virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }

 private:
  ColumnFamilyData* internal_cfd_;  // shadowed target, bypasses the base's cfd_
};
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
  MemTable* mem;             // current (mutable) memtable
  MemTableListVersion* imm;  // snapshot of the immutable memtable list
  Version* current;          // current LSM version
  std::atomic<uint32_t> refs;  // reference count; see Ref()/Unref()
  // We need to_delete because during Cleanup(), imm->Unref() returns
  // all memtables that we need to free through this vector. We then
  // delete all those memtables outside of mutex, during destruction
  autovector<MemTable*> to_delete;
  // Version number of the current SuperVersion
  uint64_t version_number;
  port::Mutex* db_mutex;  // the DB mutex guarding this SuperVersion

  // should be called outside the mutex
  SuperVersion() = default;
  ~SuperVersion();
  SuperVersion* Ref();
  // Returns true when the last reference was dropped; the caller then runs
  // Cleanup() and deletes the object (see InstallSuperVersion in the .cc).
  bool Unref();

  // call these two methods with db mutex held
  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
  // that needs to be deleted in to_delete vector. Unrefing those
  // objects needs to be done in the mutex
  void Cleanup();
  void Init(MemTable* new_mem, MemTableListVersion* new_imm,
            Version* new_current);

  // The value of dummy is not actually used. kSVInUse takes its address as a
  // mark in the thread local storage to indicate the SuperVersion is in use
  // by thread. This way, the value of kSVInUse is guaranteed to have no
  // conflict with SuperVersion object address and portable on different
  // platform.
  static int dummy;
  static void* const kSVInUse;
  static void* const kSVObsolete;
};
// Returns a copy of `src` adjusted for internal use; presumably clamps or
// corrects out-of-range option values -- see definition in column_family.cc.
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
                                           const InternalFilterPolicy* ipolicy,
                                           const ColumnFamilyOptions& src);
class ColumnFamilySet;
// This class keeps all the data that a column family needs. It's mostly dumb
// and used just to provide access to metadata.
// Most methods require DB mutex held, unless otherwise noted
class ColumnFamilyData {
 public:
  ~ColumnFamilyData();

  // thread-safe
  uint32_t GetID() const { return id_; }
  // thread-safe
  const std::string& GetName() const { return name_; }

  // Increments the reference count (DB mutex held -- refs_ is a plain int).
  void Ref() { ++refs_; }
  // will just decrease reference count to 0, but will not delete it. returns
  // true if the ref count was decreased to zero. in that case, it can be
  // deleted by the caller immediatelly, or later, by calling
  // FreeDeadColumnFamilies()
  bool Unref() {
    assert(refs_ > 0);
    return --refs_ == 0;
  }

  // This can only be called from single-threaded VersionSet::LogAndApply()
  // After dropping column family no other operation on that column family
  // will be executed. All the files and memory will be, however, kept around
  // until client drops the column family handle. That way, client can still
  // access data from dropped column family.
  // Column family can be dropped and still alive. In that state:
  // *) Column family is not included in the iteration.
  // *) Compaction and flush is not executed on the dropped column family.
  // *) Client can continue writing and reading from column family. However, all
  // writes stay in the current memtable.
  // When the dropped column family is unreferenced, then we:
  // *) delete all memory associated with that column family
  // *) delete all the files associated with that column family
  void SetDropped() {
    // can't drop default CF
    assert(id_ != 0);
    dropped_ = true;
  }
  bool IsDropped() const { return dropped_; }

  // thread-safe
  int NumberLevels() const { return options_.num_levels; }

  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
  uint64_t GetLogNumber() const { return log_number_; }

  // thread-safe
  const Options* options() const { return &options_; }

  InternalStats* internal_stats() { return internal_stats_.get(); }

  // Accessors for the column family's mutable state (DB mutex conventions of
  // the class apply unless marked thread-safe).
  MemTableList* imm() { return &imm_; }
  MemTable* mem() { return mem_; }
  Version* current() { return current_; }
  Version* dummy_versions() { return dummy_versions_; }
  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
  void SetCurrent(Version* current);
  void CreateNewMemtable();

  TableCache* table_cache() { return table_cache_.get(); }

  // See documentation in compaction_picker.h
  Compaction* PickCompaction(LogBuffer* log_buffer);
  Compaction* CompactRange(int input_level, int output_level,
                           const InternalKey* begin, const InternalKey* end,
                           InternalKey** compaction_end);

  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
  // thread-safe
  const Comparator* user_comparator() const {
    return internal_comparator_.user_comparator();
  }
  // thread-safe
  const InternalKeyComparator& internal_comparator() const {
    return internal_comparator_;
  }

  SuperVersion* GetSuperVersion() { return super_version_; }
  // thread-safe
  ThreadLocalPtr* GetThreadLocalSuperVersion() const { return local_sv_.get(); }
  // thread-safe
  uint64_t GetSuperVersionNumber() const {
    return super_version_number_.load();
  }
  // will return a pointer to SuperVersion* if previous SuperVersion
  // if its reference count is zero and needs deletion or nullptr if not
  // As argument takes a pointer to allocated SuperVersion to enable
  // the clients to allocate SuperVersion outside of mutex.
  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
                                    port::Mutex* db_mutex);

  // Marks all thread-local cached SuperVersions obsolete and unrefs them
  // (unless a thread has one marked in-use).
  void ResetThreadLocalSuperVersions();

  // A Flag indicating whether write needs to slowdown because of there are
  // too many number of level0 files.
  bool NeedSlowdownForNumLevel0Files() const {
    return need_slowdown_for_num_level0_files_;
  }

 private:
  friend class ColumnFamilySet;
  // Construction goes through ColumnFamilySet::CreateColumnFamily().
  ColumnFamilyData(const std::string& dbname, uint32_t id,
                   const std::string& name, Version* dummy_versions,
                   Cache* table_cache, const ColumnFamilyOptions& options,
                   const DBOptions* db_options,
                   const EnvOptions& storage_options,
                   ColumnFamilySet* column_family_set);

  uint32_t id_;
  const std::string name_;
  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
  Version* current_;         // == dummy_versions->prev_

  int refs_;      // outstanding references to ColumnFamilyData
  bool dropped_;  // true if client dropped it

  const InternalKeyComparator internal_comparator_;
  const InternalFilterPolicy internal_filter_policy_;

  Options const options_;

  std::unique_ptr<TableCache> table_cache_;

  std::unique_ptr<InternalStats> internal_stats_;

  MemTable* mem_;
  MemTableList imm_;
  SuperVersion* super_version_;

  // An ordinal representing the current SuperVersion. Updated by
  // InstallSuperVersion(), i.e. incremented every time super_version_
  // changes.
  std::atomic<uint64_t> super_version_number_;

  // Thread's local copy of SuperVersion pointer
  // This needs to be destructed before mutex_
  std::unique_ptr<ThreadLocalPtr> local_sv_;

  // pointers for a circular linked list. we use it to support iterations
  // that can be concurrent with writes
  ColumnFamilyData* next_;
  ColumnFamilyData* prev_;

  // This is the earliest log file number that contains data from this
  // Column Family. All earlier log files must be ignored and not
  // recovered from
  uint64_t log_number_;

  // A flag indicating whether we should delay writes because
  // we have too many level 0 files
  bool need_slowdown_for_num_level0_files_;

  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;

  ColumnFamilySet* column_family_set_;  // back-pointer; set by CreateColumnFamily
};
// ColumnFamilySet has interesting thread-safety requirements
// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB
// mutex. Inside, column_family_data_ and column_families_ will be protected
// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from
// VersionSet::LogAndApply() in the normal runtime. It is also called
// during Recovery and in DumpManifest(). RemoveColumnFamily() is called
// from ColumnFamilyData destructor
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock()
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily() --
// inside of DB mutex
class ColumnFamilySet {
 public:
  // ColumnFamilySet supports iteration
  class iterator {
   public:
    explicit iterator(ColumnFamilyData* cfd)
        : current_(cfd) {}
    iterator& operator++() {
      // dummy is never dead or dropped, so this will never be infinite
      do {
        current_ = current_->next_;
      } while (current_->refs_ == 0 || current_->IsDropped());
      return *this;
    }
    bool operator!=(const iterator& other) {
      return this->current_ != other.current_;
    }
    ColumnFamilyData* operator*() { return current_; }

   private:
    ColumnFamilyData* current_;  // node in the circular linked list
  };

  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
                  const EnvOptions& storage_options, Cache* table_cache);
  ~ColumnFamilySet();

  ColumnFamilyData* GetDefault() const;
  // GetColumnFamily() calls return nullptr if column family is not found
  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
  // this call will return the next available column family ID. it guarantees
  // that there is no column family with id greater than or equal to the
  // returned value in the current running instance or anytime in RocksDB
  // instance history.
  uint32_t GetNextColumnFamilyID();
  uint32_t GetMaxColumnFamily();
  void UpdateMaxColumnFamily(uint32_t new_max_column_family);

  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                       Version* dummy_version,
                                       const ColumnFamilyOptions& options);

  // Iteration starts past the dummy head and ends back at it.
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

  // Spin lock protecting the two lookup maps below.
  void Lock();
  void Unlock();

  // REQUIRES: DB mutex held
  // Don't call while iterating over ColumnFamilySet
  void FreeDeadColumnFamilies();

 private:
  friend class ColumnFamilyData;
  // helper function that gets called from cfd destructor
  // REQUIRES: DB mutex held
  void RemoveColumnFamily(ColumnFamilyData* cfd);

  // column_families_ and column_family_data_ need to be protected:
  // * when mutating: 1. DB mutex locked first, 2. spinlock locked second
  // * when reading, either: 1. lock DB mutex, or 2. lock spinlock
  // (if both, respect the ordering to avoid deadlock!)
  std::unordered_map<std::string, uint32_t> column_families_;
  std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_;

  uint32_t max_column_family_;  // largest ID ever allocated
  ColumnFamilyData* dummy_cfd_;  // head of the circular linked list
  // We don't hold the refcount here, since default column family always exists
  // We are also not responsible for cleaning up default_cfd_cache_. This is
  // just a cache that makes common case (accessing default column family)
  // faster
  ColumnFamilyData* default_cfd_cache_;

  const std::string db_name_;
  const DBOptions* const db_options_;
  const EnvOptions storage_options_;
  Cache* table_cache_;
  std::atomic_flag spin_lock_;  // see Lock()/Unlock()
};
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
 public:
  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
      : column_family_set_(column_family_set), current_(nullptr) {}

  // sets current_ to ColumnFamilyData with column_family_id
  // returns false if column family doesn't exist
  bool Seek(uint32_t column_family_id) override;

  // Returns log number of the selected column family
  uint64_t GetLogNumber() const override;

  // REQUIRES: Seek() called first
  virtual MemTable* GetMemTable() const override;

  // Returns options for selected column family
  // REQUIRES: Seek() called first
  virtual const Options* GetOptions() const override;

  // Returns column family handle for the selected column family
  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;

 private:
  ColumnFamilySet* column_family_set_;  // not owned
  ColumnFamilyData* current_;           // set by Seek(); may be nullptr
  ColumnFamilyHandleInternal handle_;   // non-owning handle over current_
};
} // namespace rocksdb

@ -0,0 +1,857 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <vector>
#include <string>
#include "db/db_impl.h"
#include "rocksdb/env.h"
#include "rocksdb/db.h"
#include "util/testharness.h"
#include "util/testutil.h"
#include "util/coding.h"
#include "utilities/merge_operators.h"
namespace rocksdb {
namespace {
// Produces a random string of exactly `len` bytes using the shared test
// helper.
std::string RandomString(Random* rnd, int len) {
  std::string result;
  test::RandomString(rnd, len, &result);
  return result;
}
} // anonymous namespace
// Test harness: owns a DB under a temp directory plus one handle per column
// family, and provides helpers for open/close, CRUD, flushing, compaction
// and file/log inspection. Column families are addressed by index into
// handles_/names_.
class ColumnFamilyTest {
 public:
  ColumnFamilyTest() : rnd_(139) {
    env_ = Env::Default();
    dbname_ = test::TmpDir() + "/column_family_test";
    db_options_.create_if_missing = true;
    // start every test from a clean slate
    DestroyDB(dbname_, Options(db_options_, column_family_options_));
  }

  // Deletes all handles and closes the DB; on-disk state is preserved.
  void Close() {
    for (auto h : handles_) {
      delete h;
    }
    handles_.clear();
    names_.clear();
    delete db_;
    db_ = nullptr;
  }

  // Opens the DB with the given column families. If `options` is empty,
  // column_family_options_ is used for every family; otherwise options[i]
  // pairs with cf[i].
  Status TryOpen(std::vector<std::string> cf,
                 std::vector<ColumnFamilyOptions> options = {}) {
    std::vector<ColumnFamilyDescriptor> column_families;
    names_.clear();
    for (size_t i = 0; i < cf.size(); ++i) {
      column_families.push_back(ColumnFamilyDescriptor(
          cf[i], options.size() == 0 ? column_family_options_ : options[i]));
      names_.push_back(cf[i]);
    }
    return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
  }

  // Like TryOpen, but the open must succeed.
  void Open(std::vector<std::string> cf,
            std::vector<ColumnFamilyOptions> options = {}) {
    ASSERT_OK(TryOpen(cf, options));
  }

  // Opens only the default column family.
  void Open() {
    Open({"default"});
  }

  DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }

  // Reads an integer-valued DB property for column family `cf`.
  int GetProperty(int cf, std::string property) {
    std::string value;
    ASSERT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
    return std::stoi(value);
  }

  // Closes the DB and removes all of its files from disk.
  void Destroy() {
    for (auto h : handles_) {
      delete h;
    }
    handles_.clear();
    names_.clear();
    delete db_;
    db_ = nullptr;
    ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
  }

  // Creates the given column families on the live DB, appending their
  // handles/names to handles_/names_.
  void CreateColumnFamilies(
      const std::vector<std::string>& cfs,
      const std::vector<ColumnFamilyOptions> options = {}) {
    int cfi = handles_.size();
    handles_.resize(cfi + cfs.size());
    names_.resize(cfi + cfs.size());
    for (size_t i = 0; i < cfs.size(); ++i) {
      ASSERT_OK(db_->CreateColumnFamily(
          options.size() == 0 ? column_family_options_ : options[i], cfs[i],
          &handles_[cfi]));
      names_[cfi] = cfs[i];
      cfi++;
    }
  }

  // Closes and reopens the DB with all still-existing (non-dropped) column
  // families. Dropped families have an empty name and are skipped.
  void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
    std::vector<std::string> names;
    for (auto name : names_) {
      if (name != "") {
        names.push_back(name);
      }
    }
    Close();
    assert(options.size() == 0 || names.size() == options.size());
    Open(names, options);
  }

  void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
    CreateColumnFamilies(cfs);
    Reopen();
  }

  // Drops the families at the given indices; their slots remain in
  // handles_/names_ (null/empty) so other indices stay stable.
  void DropColumnFamilies(const std::vector<int>& cfs) {
    for (auto cf : cfs) {
      ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
      delete handles_[cf];
      handles_[cf] = nullptr;
      names_[cf] = "";
    }
  }

  // Writes `num` random entries of roughly key_value_size bytes each.
  void PutRandomData(int cf, int num, int key_value_size) {
    for (int i = 0; i < num; ++i) {
      // 10 bytes for key, rest is value
      ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 10),
                    RandomString(&rnd_, key_value_size - 10)));
    }
  }

  void WaitForFlush(int cf) {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
  }

  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }

  Status Put(int cf, const std::string& key, const std::string& value) {
    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
  }
  Status Merge(int cf, const std::string& key, const std::string& value) {
    return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
  }
  Status Flush(int cf) {
    return db_->Flush(FlushOptions(), handles_[cf]);
  }

  // Reads a key; returns "NOT_FOUND" or the error string on failure.
  std::string Get(int cf, const std::string& key) {
    ReadOptions options;
    options.verify_checksums = true;
    std::string result;
    Status s = db_->Get(options, handles_[cf], Slice(key), &result);
    if (s.IsNotFound()) {
      result = "NOT_FOUND";
    } else if (!s.ok()) {
      result = s.ToString();
    }
    return result;
  }

  void CompactAll(int cf) {
    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
  }

  void Compact(int cf, const Slice& start, const Slice& limit) {
    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
  }

  int NumTableFilesAtLevel(int level, int cf) {
    return GetProperty(cf,
                       "rocksdb.num-files-at-level" + std::to_string(level));
  }

  // Return spread of files per level
  std::string FilesPerLevel(int cf) {
    std::string result;
    int last_non_zero_offset = 0;
    for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
      int f = NumTableFilesAtLevel(level, cf);
      char buf[100];
      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
      result += buf;
      if (f > 0) {
        last_non_zero_offset = result.size();
      }
    }
    // trim trailing zero-count levels
    result.resize(last_non_zero_offset);
    return result;
  }

  // NOTE(review): counts all live SST files in the DB, not only cf's --
  // the parameter is unused by GetLiveFilesMetaData.
  int CountLiveFiles(int cf) {
    std::vector<LiveFileMetaData> metadata;
    db_->GetLiveFilesMetaData(&metadata);
    return static_cast<int>(metadata.size());
  }

  // Do n memtable flushes, each of which produces an sstable
  // covering the range [small,large].
  void MakeTables(int cf, int n, const std::string& small,
                  const std::string& large) {
    for (int i = 0; i < n; i++) {
      ASSERT_OK(Put(cf, small, "begin"));
      ASSERT_OK(Put(cf, large, "end"));
      ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
    }
  }

  int CountLiveLogFiles() {
    // give the background log-deletion a chance to run first
    int micros_wait_for_log_deletion = 20000;
    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
    int ret = 0;
    VectorLogPtr wal_files;
    Status s;
    // GetSortedWalFiles is a flakey function -- it gets all the wal_dir
    // children files and then later checks for their existance. if some of the
    // log files doesn't exist anymore, it reports an error. it does all of this
    // without DB mutex held, so if a background process deletes the log file
    // while the function is being executed, it returns an error. We retry the
    // function 10 times to avoid the error failing the test
    for (int retries = 0; retries < 10; ++retries) {
      wal_files.clear();
      s = db_->GetSortedWalFiles(wal_files);
      if (s.ok()) {
        break;
      }
    }
    ASSERT_OK(s);
    for (const auto& wal : wal_files) {
      if (wal->Type() == kAliveLogFile) {
        ++ret;
      }
    }
    return ret;
  }

  // Asserts each column family's immutable-memtable count via DB properties.
  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
    assert(num_per_cf.size() == handles_.size());
    for (size_t i = 0; i < num_per_cf.size(); ++i) {
      ASSERT_EQ(num_per_cf[i],
                GetProperty(i, "rocksdb.num-immutable-mem-table"));
    }
  }

  // Copies `size` bytes (default: whole file) from source to destination.
  void CopyFile(const std::string& source, const std::string& destination,
                uint64_t size = 0) {
    const EnvOptions soptions;
    unique_ptr<SequentialFile> srcfile;
    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
    unique_ptr<WritableFile> destfile;
    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));

    if (size == 0) {
      // default argument means copy everything
      ASSERT_OK(env_->GetFileSize(source, &size));
    }

    char buffer[4096];
    Slice slice;
    while (size > 0) {
      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
      ASSERT_OK(srcfile->Read(one, &slice, buffer));
      ASSERT_OK(destfile->Append(slice));
      size -= slice.size();
    }
    ASSERT_OK(destfile->Close());
  }

  std::vector<ColumnFamilyHandle*> handles_;  // one per open column family
  std::vector<std::string> names_;            // parallel to handles_
  ColumnFamilyOptions column_family_options_;
  DBOptions db_options_;
  std::string dbname_;
  DB* db_ = nullptr;
  Env* env_;
  Random rnd_;
};
// Verifies that column family IDs are never recycled: after dropping the
// family with ID 3, a newly created family gets ID 4, across reopens and
// (iter == 2) across a manifest WriteSnapshot.
TEST(ColumnFamilyTest, DontReuseColumnFamilyID) {
  for (int iter = 0; iter < 3; ++iter) {
    Open();
    CreateColumnFamilies({"one", "two", "three"});
    // families were created in order, so index == ID here
    for (size_t i = 0; i < handles_.size(); ++i) {
      auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[i]);
      ASSERT_EQ(i, cfh->GetID());
    }
    if (iter == 1) {
      Reopen();
    }
    DropColumnFamilies({3});
    Reopen();
    if (iter == 2) {
      // this tests if max_column_family is correctly persisted with
      // WriteSnapshot()
      Reopen();
    }
    CreateColumnFamilies({"three2"});
    // ID 3 that was used for dropped column family "three" should not be reused
    auto cfh3 = reinterpret_cast<ColumnFamilyHandleImpl*>(handles_[3]);
    ASSERT_EQ(4, cfh3->GetID());
    Close();
    Destroy();
  }
}
// Basic create/drop lifecycle: reads against dropped families' data fail with
// NOT_FOUND elsewhere, reopening without listing all families is rejected,
// and ListColumnFamilies reflects only the survivors.
TEST(ColumnFamilyTest, AddDrop) {
  Open();
  CreateColumnFamilies({"one", "two", "three"});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
  DropColumnFamilies({2});
  ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
  // "four" takes slot 3 (slot 2 stays empty after the drop)
  CreateColumnFamilies({"four"});
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  ASSERT_OK(Put(1, "fodor", "mirko"));
  ASSERT_EQ("mirko", Get(1, "fodor"));
  ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
  Close();
  // opening with only "default" must fail -- all existing families have to
  // be listed
  ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
  Open({"default", "one", "three", "four"});
  DropColumnFamilies({1});
  Reopen();
  Close();

  std::vector<std::string> families;
  ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
  sort(families.begin(), families.end());
  ASSERT_TRUE(families ==
              std::vector<std::string>({"default", "four", "three"}));
}
// Verifies that dropping a column family deletes its SST files, both when
// dropping on a freshly written DB and after a reopen.
TEST(ColumnFamilyTest, DropTest) {
  // first iteration - dont reopen DB before dropping
  // second iteration - reopen DB before dropping
  for (int iter = 0; iter < 2; ++iter) {
    Open({"default"});
    CreateColumnFamiliesAndReopen({"pikachu"});
    for (int i = 0; i < 100; ++i) {
      ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
    }
    ASSERT_OK(Flush(1));

    if (iter == 1) {
      Reopen();
    }
    ASSERT_EQ("bar1", Get(1, "1"));

    ASSERT_EQ(CountLiveFiles(1), 1);
    DropColumnFamilies({1});
    // make sure that all files are deleted when we drop the column family
    ASSERT_EQ(CountLiveFiles(1), 0);
    Destroy();
  }
}
// A write batch targeting a since-dropped column family must fail with
// InvalidArgument, while the same batch succeeds before the drop.
TEST(ColumnFamilyTest, WriteBatchFailure) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  WriteBatch batch;
  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
  DropColumnFamilies({1});
  // replaying the same batch after the drop must be rejected
  Status s = db_->Write(WriteOptions(), &batch);
  ASSERT_TRUE(s.IsInvalidArgument());
  Close();
}
// Writes to three column families are isolated from each other and survive
// reopens (data still only in memtables/WAL -- no explicit flush).
TEST(ColumnFamilyTest, ReadWrite) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));  // overwrite in default CF
  ASSERT_OK(Put(2, "fodor", "v5"));

  for (int iter = 0; iter <= 3; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    // keys must not leak across column families
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (iter <= 1) {
      Reopen();
    }
  }
  Close();
}
// With a merge operator (uint64 add), replaying an already-recovered WAL
// would double-apply merges. This test backs up the WAL files, recovers,
// copies them back, recovers again, and checks that values are unchanged --
// i.e. already-recovered log data is ignored on the second recovery.
TEST(ColumnFamilyTest, IgnoreRecoveredLog) {
  std::string backup_logs = dbname_ + "/backup_logs";

  // delete old files in backup_logs directory
  ASSERT_OK(env_->CreateDirIfMissing(dbname_));
  ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
  std::vector<std::string> old_files;
  env_->GetChildren(backup_logs, &old_files);  // best-effort cleanup
  for (auto& file : old_files) {
    if (file != "." && file != "..") {
      env_->DeleteFile(backup_logs + "/" + file);
    }
  }

  column_family_options_.merge_operator =
      MergeOperators::CreateUInt64AddOperator();
  db_options_.wal_dir = dbname_ + "/logs";

  Destroy();
  Open();
  CreateColumnFamilies({"cf1", "cf2"});

  // fill up the DB
  std::string one, two, three;
  PutFixed64(&one, 1);
  PutFixed64(&two, 2);
  PutFixed64(&three, 3);
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(1, "mirko", one));
  ASSERT_OK(Merge(0, "foo", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(2, "fodor", one));
  ASSERT_OK(Merge(0, "bar", one));
  ASSERT_OK(Merge(2, "bla", one));
  ASSERT_OK(Merge(1, "mirko", two));
  ASSERT_OK(Merge(1, "franjo", one));

  // copy the logs to backup
  std::vector<std::string> logs;
  env_->GetChildren(db_options_.wal_dir, &logs);
  for (auto& log : logs) {
    if (log != ".." && log != ".") {
      CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
    }
  }

  // recover the DB
  Close();

  // 1. check consistency
  // 2. copy the logs from backup back to WAL dir. if the recovery happens
  // again on the same log files, this should lead to incorrect results
  // due to applying merge operator twice
  // 3. check consistency
  for (int iter = 0; iter < 2; ++iter) {
    // assert consistency
    Open({"default", "cf1", "cf2"});
    ASSERT_EQ(two, Get(0, "foo"));    // 1 + 1
    ASSERT_EQ(one, Get(0, "bar"));
    ASSERT_EQ(three, Get(1, "mirko"));  // 1 + 2
    ASSERT_EQ(one, Get(1, "franjo"));
    ASSERT_EQ(one, Get(2, "fodor"));
    ASSERT_EQ(two, Get(2, "bla"));    // 1 + 1
    Close();

    if (iter == 0) {
      // copy the logs from backup back to wal dir
      for (auto& log : logs) {
        if (log != ".." && log != ".") {
          CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
        }
      }
    }
  }
}
// Like ReadWrite, but explicitly flushes every column family before the
// first reopen, so the reads after recovery are served from SST files
// rather than from the replayed WAL.
TEST(ColumnFamilyTest, FlushTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  ASSERT_OK(Put(0, "foo", "v1"));
  ASSERT_OK(Put(0, "bar", "v2"));
  ASSERT_OK(Put(1, "mirko", "v3"));
  ASSERT_OK(Put(0, "foo", "v2"));  // overwrite in default CF
  ASSERT_OK(Put(2, "fodor", "v5"));
  for (int i = 0; i < 3; ++i) {
    // Fix: the Flush() status was previously dropped on the floor; a failed
    // flush would let the test silently continue against the wrong state.
    ASSERT_OK(Flush(i));
  }
  Reopen();

  for (int iter = 0; iter <= 2; ++iter) {
    ASSERT_EQ("v2", Get(0, "foo"));
    ASSERT_EQ("v2", Get(0, "bar"));
    ASSERT_EQ("v3", Get(1, "mirko"));
    ASSERT_EQ("v5", Get(2, "fodor"));
    // keys must not leak across column families
    ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
    ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
    if (iter <= 1) {
      Reopen();
    }
  }
  Close();
}
// Makes sure that obsolete log files get deleted
// A WAL file stays alive while ANY column family with unflushed data in it
// exists; it becomes deletable only once every family's data in it has been
// flushed. The bracket comments below track which families have data in each
// log: "[0, (1)]" means the log holds data for CF 0 (unflushed) and CF 1
// (already flushed).
TEST(ColumnFamilyTest, LogDeletionTest) {
  column_family_options_.write_buffer_size = 100000;  // 100KB
  Open();
  CreateColumnFamilies({"one", "two", "three", "four"});
  // Each bracket is one log file. if number is in (), it means
  // we don't need it anymore (it's been flushed)
  // []
  ASSERT_EQ(CountLiveLogFiles(), 0);
  PutRandomData(0, 1, 100);
  // [0]
  PutRandomData(1, 1, 100);
  // [0, 1]
  PutRandomData(1, 1000, 100);  // exceeds CF 1's write buffer -> flush
  WaitForFlush(1);
  // [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  PutRandomData(0, 1, 100);
  // [0, (1)] [0, 1]
  ASSERT_EQ(CountLiveLogFiles(), 2);
  PutRandomData(2, 1, 100);
  // [0, (1)] [0, 1, 2]
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [0, (1)] [0, 1, (2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [0, (1)] [0, 1, (2)] [(2)] [2]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(3, 1, 100);
  // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
  PutRandomData(1, 1, 100);
  // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(1, 1000, 100);
  WaitForFlush(1);
  // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(0, 1000, 100);
  WaitForFlush(0);
  // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
  // delete obsolete logs -->
  // [(1), 2, 3] [1, (0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(0, 1000, 100);
  WaitForFlush(0);
  // [(1), 2, 3] [1, (0)], [(0)] [0]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  PutRandomData(1, 1000, 100);
  WaitForFlush(1);
  // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(2, 1000, 100);
  WaitForFlush(2);
  // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
  ASSERT_EQ(CountLiveLogFiles(), 6);
  PutRandomData(3, 1000, 100);
  WaitForFlush(3);
  // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
  // delete obsolete logs -->
  // [0, (1)] [1, (2)], [2, (3)] [3]
  ASSERT_EQ(CountLiveLogFiles(), 4);
  Close();
}
// Makes sure that each column family flushes independently according to its
// own write buffer settings, and that logs become obsolete accordingly
TEST(ColumnFamilyTest, DifferentWriteBufferSizes) {
  Open();
  CreateColumnFamilies({"one", "two", "three"});
  ColumnFamilyOptions default_cf, one, two, three;
  // Every column family may hold up to 10 write buffers.
  // "default" -> 100KB memtable, starts flushing immediately
  // "one"     -> 200KB memtable, starts flushing with two immutable memtables
  // "two"     -> 1MB memtable, starts flushing with three immutable memtables
  // "three"   -> 90KB memtable, starts flushing with four immutable memtables
  default_cf.write_buffer_size = 100000;
  default_cf.max_write_buffer_number = 10;
  default_cf.min_write_buffer_number_to_merge = 1;
  one.write_buffer_size = 200000;
  one.max_write_buffer_number = 10;
  one.min_write_buffer_number_to_merge = 2;
  two.write_buffer_size = 1000000;
  two.max_write_buffer_number = 10;
  two.min_write_buffer_number_to_merge = 3;
  three.write_buffer_size = 90000;
  three.max_write_buffer_number = 10;
  three.min_write_buffer_number_to_merge = 4;
  Reopen({default_cf, one, two, three});

  // Give any background flush a moment to complete before we inspect the
  // immutable-memtable counts and the live log count.
  const int kFlushWaitMicros = 10000;
  auto let_flush_settle = [&] { env_->SleepForMicroseconds(kFlushWaitMicros); };

  // Fill "default" once; it flushes right away (merge threshold is 1).
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 1);
  // "one" accumulates an immutable memtable but does not flush yet.
  PutRandomData(1, 200, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 2);
  // "two" piles up immutable memtables (flush threshold is 3).
  PutRandomData(2, 1000, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 1, 0});
  ASSERT_EQ(CountLiveLogFiles(), 3);
  PutRandomData(2, 1000, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 0});
  ASSERT_EQ(CountLiveLogFiles(), 4);
  // "three" piles up immutable memtables (flush threshold is 4).
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 1});
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 2});
  ASSERT_EQ(CountLiveLogFiles(), 6);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
  ASSERT_EQ(CountLiveLogFiles(), 7);
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 1, 2, 3});
  ASSERT_EQ(CountLiveLogFiles(), 8);
  // Push "two" over its threshold: its immutable memtables merge and flush.
  PutRandomData(2, 100, 10000);
  WaitForFlush(2);
  AssertNumberOfImmutableMemtables({0, 1, 0, 3});
  ASSERT_EQ(CountLiveLogFiles(), 9);
  // Push "three" over its threshold.
  PutRandomData(3, 90, 1000);
  WaitForFlush(3);
  AssertNumberOfImmutableMemtables({0, 1, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 10);
  PutRandomData(3, 90, 1000);
  let_flush_settle();
  AssertNumberOfImmutableMemtables({0, 1, 0, 1});
  ASSERT_EQ(CountLiveLogFiles(), 11);
  // Flushing "one" makes several old logs obsolete; they get deleted.
  PutRandomData(1, 200, 1000);
  WaitForFlush(1);
  AssertNumberOfImmutableMemtables({0, 0, 0, 1});
  ASSERT_EQ(CountLiveLogFiles(), 5);
  PutRandomData(3, 90 * 6, 1000);
  WaitForFlush(3);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  PutRandomData(0, 100, 1000);
  WaitForFlush(0);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  PutRandomData(2, 3 * 100, 10000);
  WaitForFlush(2);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 12);
  // Final flush of "one" lets the remaining stale logs be collected.
  PutRandomData(1, 2 * 200, 1000);
  WaitForFlush(1);
  AssertNumberOfImmutableMemtables({0, 0, 0, 0});
  ASSERT_EQ(CountLiveLogFiles(), 7);
  Close();
}
// Each column family may carry its own merge operator; Merge() on a family
// without one must fail with NotSupported.
TEST(ColumnFamilyTest, DifferentMergeOperators) {
  Open();
  CreateColumnFamilies({"first", "second"});
  ColumnFamilyOptions default_cf, first, second;
  first.merge_operator = MergeOperators::CreateUInt64AddOperator();
  second.merge_operator = MergeOperators::CreateStringAppendOperator();
  Reopen({default_cf, first, second});

  // Fixed-width 64-bit encodings of 1, 2 and 3.
  std::string val_one, val_two, val_three;
  PutFixed64(&val_one, 1);
  PutFixed64(&val_two, 2);
  PutFixed64(&val_three, 3);

  // "default" has no merge operator: Merge is rejected and the last Put wins.
  ASSERT_OK(Put(0, "foo", val_two));
  ASSERT_OK(Put(0, "foo", val_one));
  ASSERT_TRUE(Merge(0, "foo", val_two).IsNotSupported());
  ASSERT_EQ(Get(0, "foo"), val_one);

  // "first" adds uint64 operands: 1 + 2 == 3.
  ASSERT_OK(Put(1, "foo", val_two));
  ASSERT_OK(Put(1, "foo", val_one));
  ASSERT_OK(Merge(1, "foo", val_two));
  ASSERT_EQ(Get(1, "foo"), val_three);

  // "second" appends operands as strings separated by ','.
  ASSERT_OK(Put(2, "foo", val_two));
  ASSERT_OK(Put(2, "foo", val_one));
  ASSERT_OK(Merge(2, "foo", val_two));
  ASSERT_EQ(Get(2, "foo"), val_one + "," + val_two);
  Close();
}
TEST(ColumnFamilyTest, DifferentCompactionStyles) {
// Runs three families side by side: "default" uses level style (and is
// driven into a read/seek-triggered compaction), "one" uses universal
// style, "two" uses level style with 4 levels.
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.disableDataSync = true;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.filter_policy = nullptr;
default_cf.no_block_cache = true;
default_cf.source_compaction_factor = 100;
// seek compactions stay enabled so the Get() loop below can trigger one
default_cf.disable_seek_compaction = false;
one.compaction_style = kCompactionStyleUniversal;
// trigger compaction if there are >= 4 files
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 100000;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.max_mem_compaction_level = 0;
two.level0_file_num_compaction_trigger = 3;
two.write_buffer_size = 100000;
Reopen({default_cf, one, two});
// SETUP column family "default" - test read compaction
ASSERT_EQ("", FilesPerLevel(0));
PutRandomData(0, 1, 4096);
ASSERT_OK(Flush(0));
ASSERT_EQ("0,0,1", FilesPerLevel(0));
// write 8MB
PutRandomData(0, 2000, 4096);
ASSERT_OK(Flush(0));
// clear levels 0 and 1
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[0]);
dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[0]);
ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);
// write some new keys into level 0 and 1
PutRandomData(0, 1024, 512);
ASSERT_OK(Flush(0));
WaitForCompaction();
PutRandomData(0, 10, 512);
ASSERT_OK(Flush(0));
// remember number of files in each level
// NOTE: despite the names, l1/l2/l3 hold the file counts of
// levels 0, 1 and 2 respectively.
int l1 = NumTableFilesAtLevel(0, 0);
int l2 = NumTableFilesAtLevel(1, 0);
int l3 = NumTableFilesAtLevel(2, 0);
ASSERT_NE(l1, 0);
ASSERT_NE(l2, 0);
ASSERT_NE(l3, 0);
// SETUP column family "one" -- universal style
// stop one file short of the compaction trigger (4)
for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(1, 11, 10000);
WaitForFlush(1);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1));
}
// SETUP column family "two" -- level style with 4 levels
// stop one file short of the compaction trigger (3)
for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
PutRandomData(2, 15, 10000);
WaitForFlush(2);
ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2));
}
// TRIGGER compaction "default"
// read a bunch of times, trigger read compaction
for (int i = 0; i < 200000; ++i) {
Get(0, std::to_string(i));
}
// TRIGGER compaction "one"
PutRandomData(1, 12, 10000);
// TRIGGER compaction "two"
PutRandomData(2, 10, 10000);
// WAIT for compactions
WaitForCompaction();
// VERIFY compaction "default"
// verify that the number of files have decreased
// in some level, indicating that there was a compaction
ASSERT_TRUE(NumTableFilesAtLevel(0, 0) < l1 ||
NumTableFilesAtLevel(1, 0) < l2 ||
NumTableFilesAtLevel(2, 0) < l3);
// VERIFY compaction "one"
// universal compaction merged everything back into a single file
ASSERT_EQ("1", FilesPerLevel(1));
// VERIFY compaction "two"
ASSERT_EQ("0,1", FilesPerLevel(2));
// a manual full compaction must leave the shape unchanged
CompactAll(2);
ASSERT_EQ("0,1", FilesPerLevel(2));
Close();
}
namespace {
// Renders an iterator's current position as "key->value", or "(invalid)"
// when the iterator is not positioned on an entry.
std::string IterStatus(Iterator* iter) {
  if (!iter->Valid()) {
    return "(invalid)";
  }
  return iter->key().ToString() + "->" + iter->value().ToString();
}
}  // anonymous namespace
TEST(ColumnFamilyTest, NewIteratorsTest) {
// iter == 0 -- regular (non-tailing) iterators
// iter == 1 -- tailing iterators (options.tailing is set below)
for (int iter = 0; iter < 2; ++iter) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
ASSERT_OK(Put(0, "a", "b"));
ASSERT_OK(Put(1, "b", "a"));
ASSERT_OK(Put(2, "c", "m"));
ASSERT_OK(Put(2, "v", "t"));
std::vector<Iterator*> iterators;
ReadOptions options;
options.tailing = (iter == 1);
// NewIterators yields one iterator per column family handle.
ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
for (auto it : iterators) {
it->SeekToFirst();
}
ASSERT_EQ(IterStatus(iterators[0]), "a->b");
ASSERT_EQ(IterStatus(iterators[1]), "b->a");
ASSERT_EQ(IterStatus(iterators[2]), "c->m");
// write into "one" *after* the iterators were created
ASSERT_OK(Put(1, "x", "x"));
for (auto it : iterators) {
it->Next();
}
// "default" holds a single key, so its iterator is exhausted
ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
if (iter == 0) {
// no tailing: the iterator does not observe the later Put(1, "x", "x")
ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
} else {
// tailing: the iterator picks up writes made after its creation
ASSERT_EQ(IterStatus(iterators[1]), "x->x");
}
ASSERT_EQ(IterStatus(iterators[2]), "v->t");
for (auto it : iterators) {
delete it;
}
Destroy();
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
// Runs every TEST(...) case registered in this file.
// argc/argv are part of the standard entry-point signature but unused here.
return rocksdb::test::RunAllTests();
}

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction.h" #include "db/compaction.h"
#include "db/column_family.h"
namespace rocksdb { namespace rocksdb {
@ -29,6 +30,7 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes),
input_version_(input_version), input_version_(input_version),
number_levels_(input_version_->NumberLevels()), number_levels_(input_version_->NumberLevels()),
cfd_(input_version_->cfd_),
seek_compaction_(seek_compaction), seek_compaction_(seek_compaction),
enable_compression_(enable_compression), enable_compression_(enable_compression),
grandparent_index_(0), grandparent_index_(0),
@ -42,8 +44,10 @@ Compaction::Compaction(Version* input_version, int level, int out_level,
is_manual_compaction_(false), is_manual_compaction_(false),
level_ptrs_(std::vector<size_t>(number_levels_)) { level_ptrs_(std::vector<size_t>(number_levels_)) {
cfd_->Ref();
input_version_->Ref(); input_version_->Ref();
edit_ = new VersionEdit(); edit_ = new VersionEdit();
edit_->SetColumnFamily(cfd_->GetID());
for (int i = 0; i < number_levels_; i++) { for (int i = 0; i < number_levels_; i++) {
level_ptrs_[i] = 0; level_ptrs_[i] = 0;
} }
@ -54,6 +58,11 @@ Compaction::~Compaction() {
if (input_version_ != nullptr) { if (input_version_ != nullptr) {
input_version_->Unref(); input_version_->Unref();
} }
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
}
} }
bool Compaction::IsTrivialMove() const { bool Compaction::IsTrivialMove() const {
@ -77,12 +86,11 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
} }
bool Compaction::IsBaseLevelForKey(const Slice& user_key) { bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
if (input_version_->vset_->options_->compaction_style == if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
kCompactionStyleUniversal) {
return bottommost_level_; return bottommost_level_;
} }
// Maybe use binary search to find right entry instead of linear search? // Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl]; const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
for (; level_ptrs_[lvl] < files.size(); ) { for (; level_ptrs_[lvl] < files.size(); ) {
@ -103,7 +111,7 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
bool Compaction::ShouldStopBefore(const Slice& internal_key) { bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key. // Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; const InternalKeyComparator* icmp = &cfd_->internal_comparator();
while (grandparent_index_ < grandparents_.size() && while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(internal_key, icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) { grandparents_[grandparent_index_]->largest.Encode()) > 0) {
@ -141,8 +149,7 @@ void Compaction::MarkFilesBeingCompacted(bool value) {
// Is this compaction producing files at the bottommost level? // Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool isManual) { void Compaction::SetupBottomMostLevel(bool isManual) {
if (input_version_->vset_->options_->compaction_style == if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
kCompactionStyleUniversal) {
// If universal compaction style is used and manual // If universal compaction style is used and manual
// compaction is occuring, then we are guaranteed that // compaction is occuring, then we are guaranteed that
// all files will be picked in a single compaction // all files will be picked in a single compaction
@ -155,8 +162,7 @@ void Compaction::SetupBottomMostLevel(bool isManual) {
return; return;
} }
bottommost_level_ = true; bottommost_level_ = true;
int num_levels = input_version_->vset_->NumberLevels(); for (int i = output_level() + 1; i < number_levels_; i++) {
for (int i = output_level() + 1; i < num_levels; i++) {
if (input_version_->NumLevelFiles(i) > 0) { if (input_version_->NumLevelFiles(i) > 0) {
bottommost_level_ = false; bottommost_level_ = false;
break; break;
@ -169,6 +175,16 @@ void Compaction::ReleaseInputs() {
input_version_->Unref(); input_version_->Unref();
input_version_ = nullptr; input_version_ = nullptr;
} }
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
cfd_ = nullptr;
}
}
void Compaction::ReleaseCompactionFiles(Status status) {
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
} }
void Compaction::ResetNextCompactionIndex() { void Compaction::ResetNextCompactionIndex() {

@ -13,6 +13,7 @@
namespace rocksdb { namespace rocksdb {
class Version; class Version;
class ColumnFamilyData;
// A Compaction encapsulates information about a compaction. // A Compaction encapsulates information about a compaction.
class Compaction { class Compaction {
@ -36,6 +37,8 @@ class Compaction {
// Returns input version of the compaction // Returns input version of the compaction
Version* input_version() const { return input_version_; } Version* input_version() const { return input_version_; }
ColumnFamilyData* column_family_data() const { return cfd_; }
// Return the ith input file at "level()+which" ("which" must be 0 or 1). // Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; } FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
@ -67,6 +70,10 @@ class Compaction {
// is successful. // is successful.
void ReleaseInputs(); void ReleaseInputs();
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
void ReleaseCompactionFiles(Status status);
void Summary(char* output, int len); void Summary(char* output, int len);
// Return the score that was used to pick this compaction run. // Return the score that was used to pick this compaction run.
@ -97,6 +104,7 @@ class Compaction {
Version* input_version_; Version* input_version_;
VersionEdit* edit_; VersionEdit* edit_;
int number_levels_; int number_levels_;
ColumnFamilyData* cfd_;
bool seek_compaction_; bool seek_compaction_;
bool enable_compression_; bool enable_compression_;

@ -277,14 +277,10 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) {
Log(options_->info_log, Log(options_->info_log,
"Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
"\n", "\n",
(unsigned long)level, (unsigned long)level, (unsigned long)(c->inputs_[0].size()),
(unsigned long)(c->inputs_[0].size()), (unsigned long)(c->inputs_[1].size()), (unsigned long)inputs0_size,
(unsigned long)(c->inputs_[1].size()), (unsigned long)inputs1_size, (unsigned long)(expanded0.size()),
(unsigned long)inputs0_size, (unsigned long)(expanded1.size()), (unsigned long)expanded0_size,
(unsigned long)inputs1_size,
(unsigned long)(expanded0.size()),
(unsigned long)(expanded1.size()),
(unsigned long)expanded0_size,
(unsigned long)inputs1_size); (unsigned long)inputs1_size);
smallest = new_start; smallest = new_start;
largest = new_limit; largest = new_limit;
@ -587,7 +583,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version,
options_->level0_file_num_compaction_trigger; options_->level0_file_num_compaction_trigger;
if ((c = PickCompactionUniversalReadAmp( if ((c = PickCompactionUniversalReadAmp(
version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { version, score, UINT_MAX, num_files, log_buffer)) != nullptr) {
Log(options_->info_log, "Universal: compacting for file num\n"); LogToBuffer(log_buffer, "Universal: compacting for file num\n");
} }
} }
} }
@ -653,7 +649,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
FileMetaData* f = nullptr; FileMetaData* f = nullptr;
bool done = false; bool done = false;
int start_index = 0; int start_index = 0;
unsigned int candidate_count; unsigned int candidate_count = 0;
assert(file_by_time.size() == version->files_[level].size()); assert(file_by_time.size() == version->files_[level].size());
unsigned int max_files_to_compact = std::min(max_merge_width, unsigned int max_files_to_compact = std::min(max_merge_width,

@ -12,6 +12,7 @@
#include "db/compaction.h" #include "db/compaction.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/env.h"
#include <vector> #include <vector>
#include <memory> #include <memory>
@ -118,6 +119,7 @@ class CompactionPicker {
std::unique_ptr<uint64_t[]> level_max_bytes_; std::unique_ptr<uint64_t[]> level_max_bytes_;
const Options* const options_; const Options* const options_;
private: private:
int num_levels_; int num_levels_;

@ -42,7 +42,6 @@
DEFINE_string(benchmarks, DEFINE_string(benchmarks,
"fillseq," "fillseq,"
"fillsync," "fillsync,"
"fillrandom," "fillrandom,"
@ -53,6 +52,7 @@ DEFINE_string(benchmarks,
"readreverse," "readreverse,"
"compact," "compact,"
"readrandom," "readrandom,"
"multireadrandom,"
"readseq," "readseq,"
"readtocache," "readtocache,"
"readreverse," "readreverse,"
@ -64,8 +64,7 @@ DEFINE_string(benchmarks,
"crc32c," "crc32c,"
"compress," "compress,"
"uncompress," "uncompress,"
"acquireload," "acquireload,",
"fillfromstdin,",
"Comma-separated list of operations to run in the specified order" "Comma-separated list of operations to run in the specified order"
"Actual benchmarks:\n" "Actual benchmarks:\n"
@ -129,16 +128,8 @@ DEFINE_int64(merge_keys, -1,
DEFINE_int64(reads, -1, "Number of read operations to do. " DEFINE_int64(reads, -1, "Number of read operations to do. "
"If negative, do FLAGS_num reads."); "If negative, do FLAGS_num reads.");
DEFINE_int64(read_range, 1, "When ==1 reads use ::Get, when >1 reads use"
" an iterator");
DEFINE_bool(use_prefix_blooms, false, "Whether to place prefixes in blooms");
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
DEFINE_bool(use_prefix_api, false, "Whether to set ReadOptions.prefix for"
" prefixscanrandom. If true, use_prefix_blooms must also be true.");
DEFINE_int64(seed, 0, "Seed base for random number generators. " DEFINE_int64(seed, 0, "Seed base for random number generators. "
"When 0 it is deterministic."); "When 0 it is deterministic.");
@ -278,12 +269,6 @@ DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
DEFINE_bool(use_snapshot, false, "If true, create a snapshot per query when"
" randomread benchmark is used");
DEFINE_bool(get_approx, false, "If true, call GetApproximateSizes per query"
" when read_range is > 1 and randomread benchmark is used");
DEFINE_int32(num_levels, 7, "The total number of levels"); DEFINE_int32(num_levels, 7, "The total number of levels");
DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
@ -461,20 +446,9 @@ DEFINE_string(compaction_fadvice, "NORMAL",
static auto FLAGS_compaction_fadvice_e = static auto FLAGS_compaction_fadvice_e =
rocksdb::Options().access_hint_on_compaction_start; rocksdb::Options().access_hint_on_compaction_start;
DEFINE_bool(use_multiget, false,
"Use multiget to access a series of keys instead of get");
DEFINE_bool(use_tailing_iterator, false, DEFINE_bool(use_tailing_iterator, false,
"Use tailing iterator to access a series of keys instead of get"); "Use tailing iterator to access a series of keys instead of get");
DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number"
" of keys to group per call Arbitrary default is good because it"
" agrees with readwritepercent");
// TODO: Apply this flag to generic Get calls too. Currently only with Multiget
DEFINE_bool(warn_missing_keys, true, "Print a message to user when a key is"
" missing in a Get/MultiGet call");
DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex, DEFINE_bool(use_adaptive_mutex, rocksdb::Options().use_adaptive_mutex,
"Use adaptive mutex"); "Use adaptive mutex");
@ -798,7 +772,7 @@ class Duration {
start_at_ = FLAGS_env->NowMicros(); start_at_ = FLAGS_env->NowMicros();
} }
bool Done(int increment) { bool Done(int64_t increment) {
if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
ops_ += increment; ops_ += increment;
@ -834,13 +808,12 @@ class Benchmark {
int key_size_; int key_size_;
int prefix_size_; int prefix_size_;
int64_t keys_per_prefix_; int64_t keys_per_prefix_;
int entries_per_batch_; int64_t entries_per_batch_;
WriteOptions write_options_; WriteOptions write_options_;
int64_t reads_; int64_t reads_;
int64_t writes_; int64_t writes_;
int64_t readwrites_; int64_t readwrites_;
int64_t merge_keys_; int64_t merge_keys_;
int heap_counter_;
void PrintHeader() { void PrintHeader() {
PrintEnvironment(); PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size); fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
@ -1037,8 +1010,7 @@ class Benchmark {
readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num :
((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)
), ),
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys), merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) {
heap_counter_(0) {
if (FLAGS_prefix_size > FLAGS_key_size) { if (FLAGS_prefix_size > FLAGS_key_size) {
fprintf(stderr, "prefix size is larger than key size"); fprintf(stderr, "prefix size is larger than key size");
exit(1); exit(1);
@ -1062,6 +1034,10 @@ class Benchmark {
delete prefix_extractor_; delete prefix_extractor_;
} }
Slice AllocateKey() {
return Slice(new char[key_size_], key_size_);
}
// Generate key according to the given specification and random number. // Generate key according to the given specification and random number.
// The resulting key will have the following format (if keys_per_prefix_ // The resulting key will have the following format (if keys_per_prefix_
// is positive), extra trailing bytes are either cut off or paddd with '0'. // is positive), extra trailing bytes are either cut off or paddd with '0'.
@ -1074,10 +1050,8 @@ class Benchmark {
// ---------------------------- // ----------------------------
// | key 00000 | // | key 00000 |
// ---------------------------- // ----------------------------
std::string GenerateKeyFromInt(uint64_t v, int64_t num_keys) { void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
std::string key; char* start = const_cast<char*>(key->data());
key.resize(key_size_);
char* start = &(key[0]);
char* pos = start; char* pos = start;
if (keys_per_prefix_ > 0) { if (keys_per_prefix_ > 0) {
int64_t num_prefix = num_keys / keys_per_prefix_; int64_t num_prefix = num_keys / keys_per_prefix_;
@ -1109,8 +1083,6 @@ class Benchmark {
if (key_size_ > pos - start) { if (key_size_ > pos - start) {
memset(pos, '0', key_size_ - (pos - start)); memset(pos, '0', key_size_ - (pos - start));
} }
return key;
} }
void Run() { void Run() {
@ -1155,15 +1127,12 @@ class Benchmark {
} else if (name == Slice("fillrandom")) { } else if (name == Slice("fillrandom")) {
fresh_db = true; fresh_db = true;
method = &Benchmark::WriteRandom; method = &Benchmark::WriteRandom;
} else if (name == Slice("fillfromstdin")) {
fresh_db = true;
method = &Benchmark::WriteFromStdin;
} else if (name == Slice("filluniquerandom")) { } else if (name == Slice("filluniquerandom")) {
fresh_db = true; fresh_db = true;
if (num_threads > 1) { if (num_threads > 1) {
fprintf(stderr, "filluniquerandom multithreaded not supported" fprintf(stderr, "filluniquerandom multithreaded not supported"
" set --threads=1"); ", use 1 thread");
exit(1); num_threads = 1;
} }
method = &Benchmark::WriteUniqueRandom; method = &Benchmark::WriteUniqueRandom;
} else if (name == Slice("overwrite")) { } else if (name == Slice("overwrite")) {
@ -1189,19 +1158,18 @@ class Benchmark {
method = &Benchmark::ReadReverse; method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) { } else if (name == Slice("readrandom")) {
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("multireadrandom")) {
method = &Benchmark::MultiReadRandom;
} else if (name == Slice("readmissing")) { } else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing; ++key_size_;
method = &Benchmark::ReadRandom;
} else if (name == Slice("newiterator")) { } else if (name == Slice("newiterator")) {
method = &Benchmark::IteratorCreation; method = &Benchmark::IteratorCreation;
} else if (name == Slice("seekrandom")) { } else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom; method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) {
method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) { } else if (name == Slice("readrandomsmall")) {
reads_ /= 1000; reads_ /= 1000;
method = &Benchmark::ReadRandom; method = &Benchmark::ReadRandom;
} else if (name == Slice("prefixscanrandom")) {
method = &Benchmark::PrefixScanRandom;
} else if (name == Slice("deleteseq")) { } else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq; method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) { } else if (name == Slice("deleterandom")) {
@ -1215,10 +1183,9 @@ class Benchmark {
if (FLAGS_merge_operator.empty()) { if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.ToString().c_str()); name.ToString().c_str());
method = nullptr; exit(1);
} else {
method = &Benchmark::ReadRandomMergeRandom;
} }
method = &Benchmark::ReadRandomMergeRandom;
} else if (name == Slice("updaterandom")) { } else if (name == Slice("updaterandom")) {
method = &Benchmark::UpdateRandom; method = &Benchmark::UpdateRandom;
} else if (name == Slice("appendrandom")) { } else if (name == Slice("appendrandom")) {
@ -1227,10 +1194,9 @@ class Benchmark {
if (FLAGS_merge_operator.empty()) { if (FLAGS_merge_operator.empty()) {
fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n", fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
name.ToString().c_str()); name.ToString().c_str());
method = nullptr; exit(1);
} else {
method = &Benchmark::MergeRandom;
} }
method = &Benchmark::MergeRandom;
} else if (name == Slice("randomwithverify")) { } else if (name == Slice("randomwithverify")) {
method = &Benchmark::RandomWithVerify; method = &Benchmark::RandomWithVerify;
} else if (name == Slice("compact")) { } else if (name == Slice("compact")) {
@ -1243,8 +1209,6 @@ class Benchmark {
method = &Benchmark::Compress; method = &Benchmark::Compress;
} else if (name == Slice("uncompress")) { } else if (name == Slice("uncompress")) {
method = &Benchmark::Uncompress; method = &Benchmark::Uncompress;
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("stats")) { } else if (name == Slice("stats")) {
PrintStats("rocksdb.stats"); PrintStats("rocksdb.stats");
} else if (name == Slice("levelstats")) { } else if (name == Slice("levelstats")) {
@ -1254,6 +1218,7 @@ class Benchmark {
} else { } else {
if (name != Slice()) { // No error message for empty name if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
exit(1);
} }
} }
@ -1540,7 +1505,7 @@ class Benchmark {
options.compaction_style = FLAGS_compaction_style_e; options.compaction_style = FLAGS_compaction_style_e;
options.block_size = FLAGS_block_size; options.block_size = FLAGS_block_size;
options.filter_policy = filter_policy_; options.filter_policy = filter_policy_;
if (FLAGS_use_plain_table || FLAGS_use_prefix_blooms) { if (FLAGS_use_plain_table) {
options.prefix_extractor.reset( options.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size)); NewFixedPrefixTransform(FLAGS_prefix_size));
} }
@ -1715,54 +1680,6 @@ class Benchmark {
DoWrite(thread, UNIQUE_RANDOM); DoWrite(thread, UNIQUE_RANDOM);
} }
void writeOrFail(WriteBatch& batch) {
Status s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
void WriteFromStdin(ThreadState* thread) {
size_t count = 0;
WriteBatch batch;
const size_t bufferLen = 32 << 20;
unique_ptr<char[]> line = unique_ptr<char[]>(new char[bufferLen]);
char* linep = line.get();
const int batchSize = 100 << 10;
const char columnSeparator = '\t';
const char lineSeparator = '\n';
while (fgets(linep, bufferLen, stdin) != nullptr) {
++count;
char* tab = std::find(linep, linep + bufferLen, columnSeparator);
if (tab == linep + bufferLen) {
fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
continue;
}
Slice key(linep, tab - linep);
tab++;
char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
if (endLine == linep + bufferLen) {
fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
continue;
}
Slice value(tab, endLine - tab);
thread->stats.FinishedSingleOp(db_);
thread->stats.AddBytes(endLine - linep - 1);
if (batch.Count() < batchSize) {
batch.Put(key, value);
continue;
}
writeOrFail(batch);
batch.Clear();
}
if (batch.Count() > 0) {
writeOrFail(batch);
}
}
void DoWrite(ThreadState* thread, WriteMode write_mode) { void DoWrite(ThreadState* thread, WriteMode write_mode) {
const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0; const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
const int64_t num_ops = writes_ == 0 ? num_ : writes_; const int64_t num_ops = writes_ == 0 ? num_ : writes_;
@ -1783,10 +1700,13 @@ class Benchmark {
WriteBatch batch; WriteBatch batch;
Status s; Status s;
int64_t bytes = 0; int64_t bytes = 0;
int i = 0; int64_t i = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (!duration.Done(entries_per_batch_)) { while (!duration.Done(entries_per_batch_)) {
batch.Clear(); batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) { for (int64_t j = 0; j < entries_per_batch_; j++) {
int64_t k = 0; int64_t k = 0;
switch(write_mode) { switch(write_mode) {
case SEQUENTIAL: case SEQUENTIAL:
@ -1825,9 +1745,9 @@ class Benchmark {
break; break;
} }
}; };
std::string key = GenerateKeyFromInt(k, FLAGS_num); GenerateKeyFromInt(k, FLAGS_num, &key);
batch.Put(key, gen.Generate(value_size_)); batch.Put(key, gen.Generate(value_size_));
bytes += value_size_ + key.size(); bytes += value_size_ + key_size_;
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
s = db_->Write(write_options_, &batch); s = db_->Write(write_options_, &batch);
@ -1866,135 +1786,22 @@ class Benchmark {
thread->stats.AddBytes(bytes); thread->stats.AddBytes(bytes);
} }
// Calls MultiGet over a list of keys from a random distribution.
// Returns the total number of keys found.
long MultiGetRandom(ReadOptions& options, int num_keys,
Random64* rand, int64_t range, const char* suffix) {
assert(num_keys > 0);
std::vector<Slice> keys(num_keys);
std::vector<std::string> values(num_keys);
std::vector<std::string> gen_keys(num_keys);
int i;
int64_t k;
// Fill the keys vector
for(i=0; i<num_keys; ++i) {
k = rand->Next() % range;
gen_keys[i] = GenerateKeyFromInt(k, range) + suffix;
keys[i] = gen_keys[i];
}
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
// Apply the operation
std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
assert((long)statuses.size() == num_keys);
assert((long)keys.size() == num_keys); // Should always be the case.
assert((long)values.size() == num_keys);
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
options.snapshot = nullptr;
}
// Count number found
long found = 0;
for(i=0; i<num_keys; ++i) {
if (statuses[i].ok()){
++found;
} else if (FLAGS_warn_missing_keys == true) {
// Key not found, or error.
fprintf(stderr, "get error: %s\n", statuses[i].ToString().c_str());
}
}
return found;
}
void ReadRandom(ThreadState* thread) { void ReadRandom(ThreadState* thread) {
ReadOptions options(FLAGS_verify_checksum, true);
Duration duration(FLAGS_duration, reads_);
int64_t found = 0;
int64_t read = 0; int64_t read = 0;
if (FLAGS_use_multiget) { // MultiGet int64_t found = 0;
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group ReadOptions options(FLAGS_verify_checksum, true);
long keys_left = reads_; Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// Recalculate number of keys per group, and call MultiGet until done std::string value;
long num_keys;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
read += num_keys;
found +=
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
}
} else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets
options.tailing = true;
Iterator* iter = db_->NewIterator(options);
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
iter->Seek(key);
read++;
if (iter->Valid() && iter->key().compare(Slice(key)) == 0) {
found++;
}
thread->stats.FinishedSingleOp(db_);
}
delete iter;
} else { // Regular case. Do one "get" at a time Get
options.tailing = true;
options.prefix_seek = (FLAGS_prefix_size == 0);
Iterator* iter = db_->NewIterator(options);
std::string value;
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_read_range < 2) {
read++;
if (db_->Get(options, key, &value).ok()) {
found++;
}
} else {
int count = 1;
if (FLAGS_get_approx) {
std::string key2 =
GenerateKeyFromInt(k + static_cast<int>(FLAGS_read_range),
FLAGS_num + FLAGS_read_range);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
read += FLAGS_read_range;
for (iter->Seek(key);
iter->Valid() && count <= FLAGS_read_range;
++count, iter->Next()) {
found++;
}
}
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
options.snapshot = nullptr;
}
thread->stats.FinishedSingleOp(db_); Duration duration(FLAGS_duration, reads_);
while (!duration.Done(1)) {
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
read++;
if (db_->Get(options, key, &value).ok()) {
found++;
} }
thread->stats.FinishedSingleOp(db_);
delete iter;
} }
char msg[100]; char msg[100];
@ -2008,113 +1815,41 @@ class Benchmark {
} }
} }
void PrefixScanRandom(ThreadState* thread) { // Calls MultiGet over a list of keys from a random distribution.
if (FLAGS_use_prefix_api) { // Returns the total number of keys found.
assert(FLAGS_use_prefix_blooms); void MultiReadRandom(ThreadState* thread) {
assert(FLAGS_bloom_bits >= 1); int64_t read = 0;
}
ReadOptions options(FLAGS_verify_checksum, true);
Duration duration(FLAGS_duration, reads_);
int64_t found = 0; int64_t found = 0;
while (!duration.Done(1)) {
std::string value;
const int k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Slice skey(key);
Slice prefix = prefix_extractor_->Transform(skey);
options.prefix = FLAGS_use_prefix_api ? &prefix : nullptr;
Iterator* iter = db_->NewIterator(options);
for (iter->Seek(skey);
iter->Valid() && iter->key().starts_with(prefix);
iter->Next()) {
found++;
}
delete iter;
thread->stats.FinishedSingleOp(db_);
}
char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, reads_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) {
FLAGS_warn_missing_keys = false; // Never warn about missing keys
Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true); ReadOptions options(FLAGS_verify_checksum, true);
std::vector<Slice> keys(entries_per_batch_);
if (FLAGS_use_multiget) { std::vector<std::string> values(entries_per_batch_);
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group while (keys.size() < entries_per_batch_) {
long keys_left = reads_; keys.push_back(AllocateKey());
// Recalculate number of keys per group, and call MultiGet until done
long num_keys;
long found;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
found =
MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, ".");
// We should not find any key since the key we try to get has a
// different suffix
if (found) {
assert(false);
}
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
}
} else { // Regular case (not MultiGet)
std::string value;
Status s;
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num) + ".";
s = db_->Get(options, key, &value);
assert(!s.ok() && s.IsNotFound());
thread->stats.FinishedSingleOp(db_);
}
} }
}
void ReadHot(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_); Duration duration(FLAGS_duration, reads_);
ReadOptions options(FLAGS_verify_checksum, true); while (!duration.Done(1)) {
const int64_t range = (FLAGS_num + 99) / 100; for (int64_t i = 0; i < entries_per_batch_; ++i) {
int64_t found = 0; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num,
FLAGS_num, &keys[i]);
if (FLAGS_use_multiget) {
const int64_t kpg = FLAGS_keys_per_multiget; // keys per multiget group
int64_t keys_left = reads_;
// Recalculate number of keys per group, and call MultiGet until done
long num_keys;
while(num_keys = std::min(keys_left, kpg), !duration.Done(num_keys)) {
found += MultiGetRandom(options, num_keys, &thread->rand, range, "");
thread->stats.FinishedSingleOp(db_);
keys_left -= num_keys;
} }
} else { std::vector<Status> statuses = db_->MultiGet(options, keys, &values);
std::string value; assert(statuses.size() == entries_per_batch_);
while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % range; read += entries_per_batch_;
std::string key = GenerateKeyFromInt(k, range); for (int64_t i = 0; i < entries_per_batch_; ++i) {
if (db_->Get(options, key, &value).ok()) { if (statuses[i].ok()) {
++found; ++found;
} }
thread->stats.FinishedSingleOp(db_);
} }
} }
for (auto& k : keys) {
delete k.data();
}
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, reads_); found, read);
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
@ -2129,44 +1864,53 @@ class Benchmark {
} }
void SeekRandom(ThreadState* thread) { void SeekRandom(ThreadState* thread) {
Duration duration(FLAGS_duration, reads_); int64_t read = 0;
ReadOptions options(FLAGS_verify_checksum, true);
std::string value;
int64_t found = 0; int64_t found = 0;
ReadOptions options(FLAGS_verify_checksum, true);
options.tailing = FLAGS_use_tailing_iterator;
auto* iter = db_->NewIterator(options);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
Duration duration(FLAGS_duration, reads_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
Iterator* iter = db_->NewIterator(options); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
const int64_t k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
iter->Seek(key); iter->Seek(key);
if (iter->Valid() && iter->key() == Slice(key)) found++; read++;
delete iter; if (iter->Valid() && iter->key().compare(key) == 0) {
found++;
}
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
delete iter;
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
found, num_); found, read);
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
void DoDelete(ThreadState* thread, bool seq) { void DoDelete(ThreadState* thread, bool seq) {
WriteBatch batch; WriteBatch batch;
Status s;
Duration duration(seq ? 0 : FLAGS_duration, num_); Duration duration(seq ? 0 : FLAGS_duration, num_);
long i = 0; int64_t i = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (!duration.Done(entries_per_batch_)) { while (!duration.Done(entries_per_batch_)) {
batch.Clear(); batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) { for (int64_t j = 0; j < entries_per_batch_; ++j) {
const int64_t k = seq ? i+j : (thread->rand.Next() % FLAGS_num); const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
std::string key = GenerateKeyFromInt(k, FLAGS_num); GenerateKeyFromInt(k, FLAGS_num, &key);
batch.Delete(key); batch.Delete(key);
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
s = db_->Write(write_options_, &batch); auto s = db_->Write(write_options_, &batch);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "del error: %s\n", s.ToString().c_str()); fprintf(stderr, "del error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
++i; i += entries_per_batch_;
} }
} }
@ -2197,6 +1941,9 @@ class Benchmark {
// Don't merge stats from this thread with the readers. // Don't merge stats from this thread with the readers.
thread->stats.SetExcludeFromMerge(); thread->stats.SetExcludeFromMerge();
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
while (true) { while (true) {
{ {
MutexLock l(&thread->shared->mu); MutexLock l(&thread->shared->mu);
@ -2206,8 +1953,7 @@ class Benchmark {
} }
} }
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
@ -2235,7 +1981,7 @@ class Benchmark {
// Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V) // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
// in DB atomically i.e in a single batch. Also refer GetMany. // in DB atomically i.e in a single batch. Also refer GetMany.
Status PutMany(const WriteOptions& writeoptions, Status PutMany(const WriteOptions& writeoptions,
const Slice& key, const Slice& value) { const Slice& key, const Slice& value) {
std::string suffixes[3] = {"2", "1", "0"}; std::string suffixes[3] = {"2", "1", "0"};
std::string keys[3]; std::string keys[3];
@ -2273,7 +2019,7 @@ class Benchmark {
// in the same snapshot, and verifies that all the values are identical. // in the same snapshot, and verifies that all the values are identical.
// ASSUMES that PutMany was used to put (K, V) into the DB. // ASSUMES that PutMany was used to put (K, V) into the DB.
Status GetMany(const ReadOptions& readoptions, Status GetMany(const ReadOptions& readoptions,
const Slice& key, std::string* value) { const Slice& key, std::string* value) {
std::string suffixes[3] = {"0", "1", "2"}; std::string suffixes[3] = {"0", "1", "2"};
std::string keys[3]; std::string keys[3];
Slice key_slices[3]; Slice key_slices[3];
@ -2328,16 +2074,19 @@ class Benchmark {
int64_t puts_done = 0; int64_t puts_done = 0;
int64_t deletes_done = 0; int64_t deletes_done = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
for (int64_t i = 0; i < readwrites_; i++) { for (int64_t i = 0; i < readwrites_; i++) {
const int64_t k = thread->rand.Next() % (FLAGS_numdistinct);
std::string key = GenerateKeyFromInt(k, FLAGS_numdistinct);
if (get_weight == 0 && put_weight == 0 && delete_weight == 0) { if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
// one batch completed, reinitialize for next batch // one batch completed, reinitialize for next batch
get_weight = FLAGS_readwritepercent; get_weight = FLAGS_readwritepercent;
delete_weight = FLAGS_deletepercent; delete_weight = FLAGS_deletepercent;
put_weight = 100 - get_weight - delete_weight; put_weight = 100 - get_weight - delete_weight;
} }
GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
FLAGS_numdistinct, &key);
if (get_weight > 0) { if (get_weight > 0) {
// do all the gets first // do all the gets first
Status s = GetMany(options, key, &value); Status s = GetMany(options, key, &value);
@ -2383,12 +2132,6 @@ class Benchmark {
// This is different from ReadWhileWriting because it does not use // This is different from ReadWhileWriting because it does not use
// an extra thread. // an extra thread.
void ReadRandomWriteRandom(ThreadState* thread) { void ReadRandomWriteRandom(ThreadState* thread) {
if (FLAGS_use_multiget){
// Separate function for multiget (for ease of reading)
ReadRandomWriteRandomMultiGet(thread);
return;
}
ReadOptions options(FLAGS_verify_checksum, true); ReadOptions options(FLAGS_verify_checksum, true);
RandomGenerator gen; RandomGenerator gen;
std::string value; std::string value;
@ -2399,28 +2142,18 @@ class Benchmark {
int64_t writes_done = 0; int64_t writes_done = 0;
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (get_weight == 0 && put_weight == 0) { if (get_weight == 0 && put_weight == 0) {
// one batch completed, reinitialize for next batch // one batch completed, reinitialize for next batch
get_weight = FLAGS_readwritepercent; get_weight = FLAGS_readwritepercent;
put_weight = 100 - get_weight; put_weight = 100 - get_weight;
} }
if (get_weight > 0) { if (get_weight > 0) {
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
// do all the gets first // do all the gets first
Status s = db_->Get(options, key, &value); Status s = db_->Get(options, key, &value);
if (!s.ok() && !s.IsNotFound()) { if (!s.ok() && !s.IsNotFound()) {
@ -2430,14 +2163,8 @@ class Benchmark {
} else if (!s.IsNotFound()) { } else if (!s.IsNotFound()) {
found++; found++;
} }
get_weight--; get_weight--;
reads_done++; reads_done++;
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
} else if (put_weight > 0) { } else if (put_weight > 0) {
// then do all the corresponding number of puts // then do all the corresponding number of puts
// for all the gets we have done earlier // for all the gets we have done earlier
@ -2458,82 +2185,6 @@ class Benchmark {
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
// ReadRandomWriteRandom (with multiget)
// Does FLAGS_keys_per_multiget reads (per multiget), followed by some puts.
// FLAGS_readwritepercent will specify the ratio of gets to puts.
// e.g.: If FLAGS_keys_per_multiget == 100 and FLAGS_readwritepercent == 75
// Then each block will do 100 multigets and 33 puts
// So there are 133 operations in-total: 100 of them (75%) are gets, and 33
// of them (25%) are puts.
void ReadRandomWriteRandomMultiGet(ThreadState* thread) {
ReadOptions options(FLAGS_verify_checksum, true);
RandomGenerator gen;
// For multiget
const long& kpg = FLAGS_keys_per_multiget; // keys per multiget group
long keys_left = readwrites_; // number of keys still left to read
long num_keys; // number of keys to read in current group
long num_put_keys; // number of keys to put in current group
int64_t found = 0;
int64_t reads_done = 0;
int64_t writes_done = 0;
int64_t multigets_done = 0;
// the number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_);
while(true) {
// Read num_keys keys, then write num_put_keys keys.
// The ratio of num_keys to num_put_keys is always FLAGS_readwritepercent
// And num_keys is set to be FLAGS_keys_per_multiget (kpg)
// num_put_keys is calculated accordingly (to maintain the ratio)
// Note: On the final iteration, num_keys and num_put_keys will be smaller
num_keys = std::min(keys_left*(FLAGS_readwritepercent + 99)/100, kpg);
num_put_keys = num_keys * (100-FLAGS_readwritepercent)
/ FLAGS_readwritepercent;
// This will break the loop when duration is complete
if (duration.Done(num_keys + num_put_keys)) {
break;
}
// A quick check to make sure our formula doesn't break on edge cases
assert(num_keys >= 1);
assert(num_keys + num_put_keys <= keys_left);
// Apply the MultiGet operations
found += MultiGetRandom(options, num_keys, &thread->rand, FLAGS_num, "");
++multigets_done;
reads_done+=num_keys;
thread->stats.FinishedSingleOp(db_);
// Now do the puts
int i;
int64_t k;
for(i=0; i<num_put_keys; ++i) {
k = thread->rand.Next() % FLAGS_num;
std::string key = GenerateKeyFromInt(k, FLAGS_num);
Status s = db_->Put(write_options_, key,
gen.Generate(value_size_));
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
writes_done++;
thread->stats.FinishedSingleOp(db_);
}
keys_left -= (num_keys + num_put_keys);
}
char msg[100];
snprintf(msg, sizeof(msg),
"( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 \
" multiget_ops:%" PRIu64 " found:%" PRIu64 ")",
reads_done, writes_done, readwrites_, multigets_done, found);
thread->stats.AddMessage(msg);
}
// //
// Read-modify-write for random keys // Read-modify-write for random keys
void UpdateRandom(ThreadState* thread) { void UpdateRandom(ThreadState* thread) {
@ -2543,30 +2194,16 @@ class Benchmark {
int64_t found = 0; int64_t found = 0;
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
if (db_->Get(options, key, &value).ok()) { if (db_->Get(options, key, &value).ok()) {
found++; found++;
} }
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
@ -2589,22 +2226,12 @@ class Benchmark {
std::string value; std::string value;
int64_t found = 0; int64_t found = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// The number of iterations is the larger of read_ or write_ // The number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % FLAGS_num; GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
std::string key = GenerateKeyFromInt(k, FLAGS_num);
if (FLAGS_use_snapshot) {
options.snapshot = db_->GetSnapshot();
}
if (FLAGS_get_approx) {
std::string key2 = GenerateKeyFromInt(k + 1, FLAGS_num + 1);
Range range(key, key2);
uint64_t sizes;
db_->GetApproximateSizes(&range, 1, &sizes);
}
// Get the existing value // Get the existing value
if (db_->Get(options, key, &value).ok()) { if (db_->Get(options, key, &value).ok()) {
@ -2614,10 +2241,6 @@ class Benchmark {
value.clear(); value.clear();
} }
if (FLAGS_use_snapshot) {
db_->ReleaseSnapshot(options.snapshot);
}
// Update the value (by appending data) // Update the value (by appending data)
Slice operand = gen.Generate(value_size_); Slice operand = gen.Generate(value_size_);
if (value.size() > 0) { if (value.size() > 0) {
@ -2634,6 +2257,7 @@ class Benchmark {
} }
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
readwrites_, found); readwrites_, found);
@ -2653,11 +2277,12 @@ class Benchmark {
void MergeRandom(ThreadState* thread) { void MergeRandom(ThreadState* thread) {
RandomGenerator gen; RandomGenerator gen;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// The number of iterations is the larger of read_ or write_ // The number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % merge_keys_; GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
std::string key = GenerateKeyFromInt(k, merge_keys_);
Status s = db_->Merge(write_options_, key, gen.Generate(value_size_)); Status s = db_->Merge(write_options_, key, gen.Generate(value_size_));
@ -2690,12 +2315,12 @@ class Benchmark {
int64_t num_merges = 0; int64_t num_merges = 0;
size_t max_length = 0; size_t max_length = 0;
Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data());
// the number of iterations is the larger of read_ or write_ // the number of iterations is the larger of read_ or write_
Duration duration(FLAGS_duration, readwrites_); Duration duration(FLAGS_duration, readwrites_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
const int64_t k = thread->rand.Next() % merge_keys_; GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
std::string key = GenerateKeyFromInt(k, merge_keys_);
bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent; bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
@ -2727,6 +2352,7 @@ class Benchmark {
thread->stats.FinishedSingleOp(db_); thread->stats.FinishedSingleOp(db_);
} }
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), snprintf(msg, sizeof(msg),
"(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \ "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \
@ -2735,7 +2361,6 @@ class Benchmark {
thread->stats.AddMessage(msg); thread->stats.AddMessage(msg);
} }
void Compact(ThreadState* thread) { void Compact(ThreadState* thread) {
db_->CompactRange(nullptr, nullptr); db_->CompactRange(nullptr, nullptr);
} }
@ -2747,28 +2372,6 @@ class Benchmark {
} }
fprintf(stdout, "\n%s\n", stats.c_str()); fprintf(stdout, "\n%s\n", stats.c_str());
} }
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
EnvOptions soptions;
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db.c_str(),
++heap_counter_);
unique_ptr<WritableFile> file;
Status s = FLAGS_env->NewWritableFile(fname, &file, soptions);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file.get());
if (!ok) {
fprintf(stderr, "heap profiling not supported\n");
FLAGS_env->DeleteFile(fname);
}
}
}; };
} // namespace rocksdb } // namespace rocksdb

@ -7,6 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <stdint.h> #include <stdint.h>
@ -17,6 +19,7 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "port/port.h" #include "port/port.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/sync_point.h"
namespace rocksdb { namespace rocksdb {
@ -60,21 +63,36 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
*manifest_file_size = 0; *manifest_file_size = 0;
mutex_.Lock();
if (flush_memtable) { if (flush_memtable) {
// flush all dirty data to disk. // flush all dirty data to disk.
Status status = Flush(FlushOptions()); Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->Ref();
mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions());
mutex_.Lock();
cfd->Unref();
if (!status.ok()) {
break;
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
if (!status.ok()) { if (!status.ok()) {
mutex_.Unlock();
Log(options_.info_log, "Cannot Flush data %s\n", Log(options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str()); status.ToString().c_str());
return status; return status;
} }
} }
MutexLock l(&mutex_);
// Make a set of all of the live *.sst files // Make a set of all of the live *.sst files
std::set<uint64_t> live; std::set<uint64_t> live;
versions_->current()->AddLiveFiles(&live); for (auto cfd : *versions_->GetColumnFamilySet()) {
cfd->current()->AddLiveFiles(&live);
}
ret.clear(); ret.clear();
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
@ -91,24 +109,60 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
// find length of manifest file while holding the mutex lock // find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->ManifestFileSize(); *manifest_file_size = versions_->ManifestFileSize();
mutex_.Unlock();
return Status::OK(); return Status::OK();
} }
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// First get sorted files in archive dir, then append sorted files from main // First get sorted files in db dir, then get sorted files from archived
// dir to maintain sorted order // dir, to avoid a race condition where a log file is moved to archived
// dir in between.
Status s;
// list wal files in main db dir.
VectorLogPtr logs;
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
if (!s.ok()) {
return s;
}
// Reproduce the race condition where a log file is moved
// to archived dir, between these two sync points, used in
// (DBTest,TransactionLogIteratorRace)
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1");
TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2");
files.clear();
// list wal files in archive dir. // list wal files in archive dir.
Status s;
std::string archivedir = ArchivalDirectory(options_.wal_dir); std::string archivedir = ArchivalDirectory(options_.wal_dir);
if (env_->FileExists(archivedir)) { if (env_->FileExists(archivedir)) {
s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile); s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
} }
// list wal files in main db dir.
return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile); uint64_t latest_archived_log_number = 0;
if (!files.empty()) {
latest_archived_log_number = files.back()->LogNumber();
Log(options_.info_log, "Latest Archived log: %" PRIu64,
latest_archived_log_number);
}
files.reserve(files.size() + logs.size());
for (auto& log : logs) {
if (log->LogNumber() > latest_archived_log_number) {
files.push_back(std::move(log));
} else {
// When the race condition happens, we could see the
// same log in both db dir and archived dir. Simply
// ignore the one in db dir. Note that, if we read
// archived dir first, we would have missed the log file.
Log(options_.info_log, "%s already moved to archive",
log->PathName().c_str());
}
}
return s;
} }
} }

File diff suppressed because it is too large Load Diff

@ -13,10 +13,12 @@
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/log_writer.h" #include "db/log_writer.h"
#include "db/snapshot.h" #include "db/snapshot.h"
#include "db/column_family.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "memtable_list.h" #include "memtable_list.h"
#include "port/port.h" #include "port/port.h"
@ -40,44 +42,79 @@ class CompactionFilterV2;
class DBImpl : public DB { class DBImpl : public DB {
public: public:
DBImpl(const Options& options, const std::string& dbname); DBImpl(const DBOptions& options, const std::string& dbname);
virtual ~DBImpl(); virtual ~DBImpl();
// Implementations of the DB interface // Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); using DB::Put;
virtual Status Merge(const WriteOptions&, const Slice& key, virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
using DB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value); const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key); using DB::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key);
using DB::Write;
virtual Status Write(const WriteOptions& options, WriteBatch* updates); virtual Status Write(const WriteOptions& options, WriteBatch* updates);
using DB::Get;
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value); std::string* value);
virtual std::vector<Status> MultiGet(const ReadOptions& options, using DB::MultiGet;
const std::vector<Slice>& keys, virtual std::vector<Status> MultiGet(
std::vector<std::string>* values); const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values);
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family,
ColumnFamilyHandle** handle);
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Returns false if key doesn't exist in the database and true if it may. // Returns false if key doesn't exist in the database and true if it may.
// If value_found is not passed in as null, then return the value if found in // If value_found is not passed in as null, then return the value if found in
// memory. On return, if value was found, then value_found will be set to true // memory. On return, if value was found, then value_found will be set to true
// , otherwise false. // , otherwise false.
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found = nullptr);
bool* value_found = nullptr); using DB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions&); virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family);
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators);
virtual const Snapshot* GetSnapshot(); virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot); virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value); using DB::GetProperty;
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); virtual bool GetProperty(ColumnFamilyHandle* column_family,
virtual Status CompactRange(const Slice* begin, const Slice* end, const Slice& property, std::string* value);
using DB::GetApproximateSizes;
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n, uint64_t* sizes);
using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1); bool reduce_level = false, int target_level = -1);
virtual int NumberLevels();
virtual int MaxMemCompactionLevel(); using DB::NumberLevels;
virtual int Level0StopWriteTrigger(); virtual int NumberLevels(ColumnFamilyHandle* column_family);
using DB::MaxMemCompactionLevel;
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
using DB::Level0StopWriteTrigger;
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
virtual const std::string& GetName() const; virtual const std::string& GetName() const;
virtual Env* GetEnv() const; virtual Env* GetEnv() const;
virtual const Options& GetOptions() const; using DB::GetOptions;
virtual Status Flush(const FlushOptions& options); virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
using DB::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family);
virtual Status DisableFileDeletions(); virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions(bool force); virtual Status EnableFileDeletions(bool force);
// All the returned filenames start with "/" // All the returned filenames start with "/"
@ -92,8 +129,7 @@ class DBImpl : public DB {
read_options = TransactionLogIterator::ReadOptions()); read_options = TransactionLogIterator::ReadOptions());
virtual Status DeleteFile(std::string name); virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData( virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
std::vector<LiveFileMetaData> *metadata);
// checks if all live files exist on file system and that their file sizes // checks if all live files exist on file system and that their file sizes
// match to our in-memory records // match to our in-memory records
@ -101,23 +137,21 @@ class DBImpl : public DB {
virtual Status GetDbIdentity(std::string& identity); virtual Status GetDbIdentity(std::string& identity);
Status RunManualCompaction(int input_level, Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
int output_level, int output_level, const Slice* begin,
const Slice* begin,
const Slice* end); const Slice* end);
// Extra methods (for testing) that are not in the public DB interface // Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin, *end] // Compact any files in the named level that overlap [*begin, *end]
Status TEST_CompactRange(int level, Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
const Slice* begin, ColumnFamilyHandle* column_family = nullptr);
const Slice* end);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status TEST_FlushMemTable(bool wait = true); Status TEST_FlushMemTable(bool wait = true);
// Wait for memtable compaction // Wait for memtable compaction
Status TEST_WaitForFlushMemTable(); Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
// Wait for any compaction // Wait for any compaction
Status TEST_WaitForCompact(); Status TEST_WaitForCompact();
@ -125,14 +159,13 @@ class DBImpl : public DB {
// Return an internal iterator over the current state of the database. // Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h). // The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed. // The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator(); Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
nullptr);
// Return the maximum overlapping data (in bytes) at next level for any // Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1. // file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes(); int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
nullptr);
// Simulate a db crash, no elegant closing of database.
void TEST_Destroy_DBImpl();
// Return the current manifest file no. // Return the current manifest file no.
uint64_t TEST_Current_Manifest_FileNo(); uint64_t TEST_Current_Manifest_FileNo();
@ -148,61 +181,8 @@ class DBImpl : public DB {
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
} }
void TEST_GetFilesMetaData(std::vector<std::vector<FileMetaData>>* metadata); void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
std::vector<std::vector<FileMetaData>>* metadata);
// holds references to memtable, all immutable memtables and version
struct SuperVersion {
MemTable* mem;
MemTableListVersion* imm;
Version* current;
std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
DBImpl* db;
// should be called outside the mutex
SuperVersion() = default;
~SuperVersion();
SuperVersion* Ref();
// Returns true if this was the last reference and caller should
// call Clenaup() and delete the object
bool Unref();
// call these two methods with db mutex held
// Cleanup unrefs mem, imm and current. Also, it stores all memtables
// that needs to be deleted in to_delete vector. Unrefing those
// objects needs to be done in the mutex
void Cleanup();
void Init(MemTable* new_mem, MemTableListVersion* new_imm,
Version* new_current);
// The value of dummy is not actually used. kSVInUse takes its address as a
// mark in the thread local storage to indicate the SuperVersion is in use
// by thread. This way, the value of kSVInUse is guaranteed to have no
// conflict with SuperVersion object address and portable on different
// platform.
static int dummy;
static void* const kSVInUse;
static void* const kSVObsolete;
};
static void SuperVersionUnrefHandle(void* ptr) {
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
// destroyed. When former happens, the thread shouldn't see kSVInUse.
// When latter happens, we are in ~DBImpl(), no get should happen as well.
assert(ptr != SuperVersion::kSVInUse);
DBImpl::SuperVersion* sv = static_cast<DBImpl::SuperVersion*>(ptr);
if (sv->Unref()) {
sv->db->mutex_.Lock();
sv->Cleanup();
sv->db->mutex_.Unlock();
delete sv;
}
}
// needed for CleanupIteratorState // needed for CleanupIteratorState
struct DeletionState { struct DeletionState {
@ -231,7 +211,7 @@ class DBImpl : public DB {
autovector<SuperVersion*> superversions_to_free; autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number // the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'. // that corresponds to the set of files in 'live'.
@ -243,8 +223,7 @@ class DBImpl : public DB {
pending_manifest_file_number = 0; pending_manifest_file_number = 0;
log_number = 0; log_number = 0;
prev_log_number = 0; prev_log_number = 0;
new_superversion = new_superversion = create_superversion ? new SuperVersion() : nullptr;
create_superversion ? new SuperVersion() : nullptr;
} }
~DeletionState() { ~DeletionState() {
@ -277,23 +256,16 @@ class DBImpl : public DB {
// It is not necessary to hold the mutex when invoking this method. // It is not necessary to hold the mutex when invoking this method.
void PurgeObsoleteFiles(DeletionState& deletion_state); void PurgeObsoleteFiles(DeletionState& deletion_state);
ColumnFamilyHandle* DefaultColumnFamily() const;
protected: protected:
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
unique_ptr<VersionSet> versions_; unique_ptr<VersionSet> versions_;
const InternalKeyComparator internal_comparator_; const DBOptions options_;
const Options options_; // options_.comparator == &internal_comparator_
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
SuperVersion* GetSuperVersion() {
return super_version_;
}
Iterator* NewInternalIterator(const ReadOptions&, Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SequenceNumber* latest_snapshot); SuperVersion* super_version);
private: private:
friend class DB; friend class DB;
@ -306,8 +278,10 @@ class DBImpl : public DB {
Status NewDB(); Status NewDB();
// Recover the descriptor from persistent storage. May do a significant // Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. // amount of work to recover recently logged updates. Any changes to
Status Recover(bool read_only = false, bool error_if_log_file_exist = false); // be made to the descriptor are added to *edit.
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only = false, bool error_if_log_file_exist = false);
void MaybeIgnoreError(Status* s) const; void MaybeIgnoreError(Status* s) const;
@ -318,7 +292,7 @@ class DBImpl : public DB {
// Flush the in-memory write buffer to storage. Switches to a new // Flush the in-memory write buffer to storage. Switches to a new
// log-file/memtable and writes a new descriptor iff successful. // log-file/memtable and writes a new descriptor iff successful.
Status FlushMemTableToOutputFile(bool* madeProgress, Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
@ -330,25 +304,26 @@ class DBImpl : public DB {
// database is opened) and is heavyweight because it holds the mutex // database is opened) and is heavyweight because it holds the mutex
// for the entire period. The second method WriteLevel0Table supports // for the entire period. The second method WriteLevel0Table supports
// concurrent flush memtables to storage. // concurrent flush memtables to storage.
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit); Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
Status WriteLevel0Table(autovector<MemTable*>& mems, VersionEdit* edit, VersionEdit* edit);
uint64_t* filenumber, Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
VersionEdit* edit, uint64_t* filenumber,
LogBuffer* log_buffer); LogBuffer* log_buffer);
uint64_t SlowdownAmount(int n, double bottom, double top); uint64_t SlowdownAmount(int n, double bottom, double top);
// MakeRoomForWrite will return superversion_to_free through an arugment,
// which the caller needs to delete. We do it because caller can delete // TODO(icanadi) free superversion_to_free and old_log outside of mutex
// the superversion outside of mutex Status MakeRoomForWrite(ColumnFamilyData* cfd,
Status MakeRoomForWrite(bool force /* compact even if there is room? */, bool force /* flush even if there is room? */);
SuperVersion** superversion_to_free);
void BuildBatchGroup(Writer** last_writer, void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group); autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status FlushMemTable(const FlushOptions& options); Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
// Wait for memtable flushed // Wait for memtable flushed
Status WaitForFlushMemTable(); Status WaitForFlushMemTable(ColumnFamilyData* cfd);
void MaybeScheduleLogDBDeployStats(); void MaybeScheduleLogDBDeployStats();
static void BGLogDBDeployStats(void* db); static void BGLogDBDeployStats(void* db);
@ -368,6 +343,13 @@ class DBImpl : public DB {
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
// This function is called as part of compaction. It enables Flush process to
// preempt compaction, since it's higher prioirty
// Returns: micros spent executing
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
DeletionState& deletion_state,
LogBuffer* log_buffer);
// Call compaction filter if is_compaction_v2 is not true. Then iterate // Call compaction filter if is_compaction_v2 is not true. Then iterate
// through input and compact the kv-pairs // through input and compact the kv-pairs
Status ProcessKeyValueCompaction( Status ProcessKeyValueCompaction(
@ -388,15 +370,16 @@ class DBImpl : public DB {
Status OpenCompactionOutputFile(CompactionState* compact); Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact); Status InstallCompactionResults(CompactionState* compact,
LogBuffer* log_buffer);
void AllocateCompactionOutputFileNumbers(CompactionState* compact); void AllocateCompactionOutputFileNumbers(CompactionState* compact);
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
void PurgeObsoleteWALFiles(); void PurgeObsoleteWALFiles();
Status AppendSortedWalsOfType(const std::string& path, Status GetSortedWalsOfType(const std::string& path,
VectorLogPtr& log_files, VectorLogPtr& log_files,
WalFileType type); WalFileType type);
// Requires: all_logs should be sorted with earliest log file first // Requires: all_logs should be sorted with earliest log file first
// Retains all log files in all_logs which contain updates with seq no. // Retains all log files in all_logs which contain updates with seq no.
@ -419,30 +402,23 @@ class DBImpl : public DB {
// Return the minimum empty level that could hold the total data in the // Return the minimum empty level that could hold the total data in the
// input level. Return the input level, if such level could not be found. // input level. Return the input level, if such level could not be found.
int FindMinimumEmptyLevelFitting(int level); int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
// Move the files in the input level to the target level. // Move the files in the input level to the target level.
// If target_level < 0, automatically calculate the minimum level that could // If target_level < 0, automatically calculate the minimum level that could
// hold the data set. // hold the data set.
Status ReFitLevel(int level, int target_level = -1); Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
// Returns the current SuperVersion number.
uint64_t CurrentVersionNumber() const;
// Returns a pair of iterators (mutable-only and immutable-only) used // Returns a pair of iterators (mutable-only and immutable-only) used
// internally by TailingIterator and stores CurrentVersionNumber() in // internally by TailingIterator and stores cfd->GetSuperVersionNumber() in
// *superversion_number. These iterators are always up-to-date, i.e. can // *superversion_number. These iterators are always up-to-date, i.e. can
// be used to read new data. // be used to read new data.
std::pair<Iterator*, Iterator*> GetTailingIteratorPair( std::pair<Iterator*, Iterator*> GetTailingIteratorPair(
const ReadOptions& options, const ReadOptions& options, ColumnFamilyData* cfd,
uint64_t* superversion_number); uint64_t* superversion_number);
// Constant after construction
const InternalFilterPolicy internal_filter_policy_;
bool owns_info_log_;
// table_cache_ provides its own synchronization // table_cache_ provides its own synchronization
unique_ptr<TableCache> table_cache_; std::shared_ptr<Cache> table_cache_;
// Lock over the persistent DB state. Non-nullptr iff successfully acquired. // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
FileLock* db_lock_; FileLock* db_lock_;
@ -451,20 +427,11 @@ class DBImpl : public DB {
port::Mutex mutex_; port::Mutex mutex_;
port::AtomicPointer shutting_down_; port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes port::CondVar bg_cv_; // Signalled when background work finishes
MemTable* mem_;
MemTableList imm_; // Memtable that are not changing
uint64_t logfile_number_; uint64_t logfile_number_;
unique_ptr<log::Writer> log_; unique_ptr<log::Writer> log_;
ColumnFamilyHandleImpl* default_cf_handle_;
SuperVersion* super_version_; unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
std::deque<uint64_t> alive_log_files_;
// An ordinal representing the current SuperVersion. Updated by
// InstallSuperVersion(), i.e. incremented every time super_version_
// changes.
std::atomic<uint64_t> super_version_number_;
// Thread's local copy of SuperVersion pointer
// This needs to be destructed after mutex_
ThreadLocalPtr* local_sv_;
std::string host_name_; std::string host_name_;
@ -500,6 +467,7 @@ class DBImpl : public DB {
// Information for a manual compaction // Information for a manual compaction
struct ManualCompaction { struct ManualCompaction {
ColumnFamilyData* cfd;
int input_level; int input_level;
int output_level; int output_level;
bool done; bool done;
@ -541,8 +509,6 @@ class DBImpl : public DB {
bool flush_on_destroy_; // Used when disableWAL is true. bool flush_on_destroy_; // Used when disableWAL is true.
InternalStats internal_stats_;
static const int KEEP_LOG_FILE_NUM = 1000; static const int KEEP_LOG_FILE_NUM = 1000;
std::string db_absolute_path_; std::string db_absolute_path_;
@ -575,28 +541,21 @@ class DBImpl : public DB {
std::vector<SequenceNumber>& snapshots, std::vector<SequenceNumber>& snapshots,
SequenceNumber* prev_snapshot); SequenceNumber* prev_snapshot);
// will return a pointer to SuperVersion* if previous SuperVersion
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion
// Foreground threads call this function directly (they don't carry
// deletion state and have to handle their own creation and deletion
// of SuperVersion)
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
// Background threads call this function, which is just a wrapper around // Background threads call this function, which is just a wrapper around
// the InstallSuperVersion() function above. Background threads carry // the cfd->InstallSuperVersion() function. Background threads carry
// deletion_state which can have new_superversion already allocated. // deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(DeletionState& deletion_state); void InstallSuperVersion(ColumnFamilyData* cfd,
DeletionState& deletion_state);
void ResetThreadLocalSuperVersions(DeletionState* deletion_state); using DB::GetPropertiesOfAllTables;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) TablePropertiesCollection* props)
override; override;
// Function that Get and KeyMayExist call with no_io true or false // Function that Get and KeyMayExist call with no_io true or false
// Note: 'value_found' from KeyMayExist propagates here // Note: 'value_found' from KeyMayExist propagates here
Status GetImpl(const ReadOptions& options, Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, const Slice& key, std::string* value,
std::string* value,
bool* value_found = nullptr); bool* value_found = nullptr);
}; };
@ -606,7 +565,7 @@ extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy, const InternalFilterPolicy* ipolicy,
const Options& src); const Options& src);
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
// Determine compression type, based on user options, level of the output // Determine compression type, based on user options, level of the output
// file and whether compression is disabled. // file and whether compression is disabled.

@ -42,8 +42,8 @@
namespace rocksdb { namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const Options& options, DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
const std::string& dbname) const std::string& dbname)
: DBImpl(options, dbname) { : DBImpl(options, dbname) {
Log(options_.info_log, "Opening the db in read only mode"); Log(options_.info_log, "Opening the db in read only mode");
} }
@ -53,42 +53,57 @@ DBImplReadOnly::~DBImplReadOnly() {
// Implementations of the DB interface // Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options, Status DBImplReadOnly::Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) { std::string* value) {
Status s; Status s;
SequenceNumber snapshot = versions_->LastSequence(); SequenceNumber snapshot = versions_->LastSequence();
SuperVersion* super_version = GetSuperVersion(); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context; MergeContext merge_context;
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) { if (super_version->mem->Get(lkey, value, &s, merge_context,
*cfd->options())) {
} else { } else {
Version::GetStats stats; Version::GetStats stats;
super_version->current->Get(options, lkey, value, &s, &merge_context, super_version->current->Get(options, lkey, value, &s, &merge_context,
&stats, options_); &stats, *cfd->options());
} }
return s; return s;
} }
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) { Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
SequenceNumber latest_snapshot; ColumnFamilyHandle* column_family) {
Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
SequenceNumber latest_snapshot = versions_->LastSequence();
Iterator* internal_iter = NewInternalIterator(options, cfd, super_version);
return NewDBIterator( return NewDBIterator(
&dbname_, env_, options_, user_comparator(),internal_iter, &dbname_, env_, *cfd->options(), cfd->user_comparator(), internal_iter,
(options.snapshot != nullptr (options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
: latest_snapshot)); : latest_snapshot));
} }
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DB** dbptr, bool error_if_log_file_exist) { DB** dbptr, bool error_if_log_file_exist) {
*dbptr = nullptr; *dbptr = nullptr;
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(
ColumnFamilyDescriptor(default_column_family_name, cf_options));
DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
impl->mutex_.Lock(); impl->mutex_.Lock();
Status s = impl->Recover(true /* read only */, error_if_log_file_exist); Status s = impl->Recover(column_families, true /* read only */,
error_if_log_file_exist);
if (s.ok()) { if (s.ok()) {
delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
}
} }
impl->mutex_.Unlock(); impl->mutex_.Unlock();
if (s.ok()) { if (s.ok()) {

@ -12,6 +12,8 @@
#include <deque> #include <deque>
#include <set> #include <set>
#include <vector>
#include <string>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/log_writer.h" #include "db/log_writer.h"
#include "db/snapshot.h" #include "db/snapshot.h"
@ -23,57 +25,79 @@
namespace rocksdb { namespace rocksdb {
class DBImplReadOnly : public DBImpl { class DBImplReadOnly : public DBImpl {
public: public:
DBImplReadOnly(const Options& options, const std::string& dbname); DBImplReadOnly(const DBOptions& options, const std::string& dbname);
virtual ~DBImplReadOnly(); virtual ~DBImplReadOnly();
// Implementations of the DB interface // Implementations of the DB interface
virtual Status Get(const ReadOptions& options, using DB::Get;
const Slice& key, virtual Status Get(const ReadOptions& options,
std::string* value); ColumnFamilyHandle* column_family, const Slice& key,
std::string* value);
// TODO: Implement ReadOnly MultiGet? // TODO: Implement ReadOnly MultiGet?
virtual Iterator* NewIterator(const ReadOptions&); using DBImpl::NewIterator;
virtual Iterator* NewIterator(const ReadOptions&,
ColumnFamilyHandle* column_family);
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) { virtual Status NewIterators(
return Status::NotSupported("Not supported operation in read only mode."); const ReadOptions& options,
} const std::vector<ColumnFamilyHandle*>& column_family,
virtual Status Merge(const WriteOptions&, const Slice& key, std::vector<Iterator*>* iterators) {
const Slice& value) { // TODO
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported yet.");
} }
virtual Status Delete(const WriteOptions&, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Flush(const FlushOptions& options) {
return Status::NotSupported("Not supported operation in read only mode.");
}
private: using DBImpl::Put;
friend class DB; virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false,
int target_level = -1) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status DisableFileDeletions() {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status EnableFileDeletions(bool force) {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size,
bool flush_memtable = true) {
return Status::NotSupported("Not supported operation in read only mode.");
}
using DBImpl::Flush;
virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) {
return Status::NotSupported("Not supported operation in read only mode.");
}
// No copying allowed private:
DBImplReadOnly(const DBImplReadOnly&); friend class DB;
void operator=(const DBImplReadOnly&);
};
// No copying allowed
DBImplReadOnly(const DBImplReadOnly&);
void operator=(const DBImplReadOnly&);
};
} }

@ -39,71 +39,6 @@ static void DumpInternalIter(Iterator* iter) {
namespace { namespace {
// Key buffer used by DBIter. Short keys live in a small inline array;
// keys longer than the inline capacity move to a heap allocation that is
// grown on demand and reclaimed by Clear()/the destructor.
class IterLookupKey {
 public:
  IterLookupKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}

  ~IterLookupKey() { Clear(); }

  // Returns the stored key, or an empty slice when no buffer is present.
  Slice GetKey() const {
    if (key_ != nullptr) {
      return Slice(key_, key_size_);
    } else {
      return Slice();
    }
  }

  bool Valid() const { return key_ != nullptr; }

  // Frees any heap-allocated buffer and reverts to the inline buffer.
  void Clear() {
    if (key_ != nullptr && key_ != space_) {
      delete[] key_;
    }
    key_ = space_;
    // BUGFIX: record the inline buffer's full capacity. The original code
    // assigned sizeof(buf_size_) (i.e. sizeof(size_t)), which understated
    // the capacity of space_ and caused needless heap allocations for any
    // key longer than 8 bytes after a Clear().
    buf_size_ = sizeof(space_);
  }

  // Enlarge the buffer size if needed based on key_size.
  // By default, static allocated buffer is used. Once there is a key
  // larger than the static allocated buffer, another buffer is dynamically
  // allocated, until a larger key buffer is requested. In that case, we
  // reallocate buffer and delete the old one.
  void EnlargeBufferIfNeeded(size_t key_size) {
    // If size is smaller than buffer size, continue using current buffer,
    // or the static allocated one, as default
    if (key_size > buf_size_) {
      // Need to enlarge the buffer.
      Clear();
      key_ = new char[key_size];
      buf_size_ = key_size;
    }
    key_size_ = key_size;
  }

  // Copies a user key into the buffer.
  void SetUserKey(const Slice& user_key) {
    size_t size = user_key.size();
    EnlargeBufferIfNeeded(size);
    memcpy(key_, user_key.data(), size);
  }

  // Builds an internal key: user key followed by the packed
  // (sequence, kValueTypeForSeek) trailer.
  void SetInternalKey(const Slice& user_key, SequenceNumber s) {
    size_t usize = user_key.size();
    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
    memcpy(key_, user_key.data(), usize);
    EncodeFixed64(key_ + usize, PackSequenceAndType(s, kValueTypeForSeek));
  }

 private:
  char* key_;        // Points at space_ or at a heap allocation.
  size_t buf_size_;  // Capacity of the buffer key_ points at.
  size_t key_size_;  // Length of the key currently stored.
  char space_[32];   // Avoid allocation for short keys

  // No copying allowed
  IterLookupKey(const IterLookupKey&) = delete;
  // BUGFIX: the deleted assignment previously took `const LookupKey&` (a
  // different class), so IterLookupKey's own copy assignment was still
  // implicitly defaulted and the class remained assignable despite the
  // "no copying" intent.
  void operator=(const IterLookupKey&) = delete;
};
// Memtables and sstables that make the DB representation contain // Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter // (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB // combines multiple entries for the same userkey found in the DB
@ -191,7 +126,7 @@ class DBIter: public Iterator {
SequenceNumber const sequence_; SequenceNumber const sequence_;
Status status_; Status status_;
IterLookupKey saved_key_; // == current key when direction_==kReverse IterKey saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse std::string saved_value_; // == current raw value when direction_==kReverse
std::string skip_key_; std::string skip_key_;
Direction direction_; Direction direction_;
@ -254,10 +189,9 @@ void DBIter::Next() {
// NOTE: In between, saved_key_ can point to a user key that has // NOTE: In between, saved_key_ can point to a user key that has
// a delete marker // a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) { inline void DBIter::FindNextUserEntry(bool skipping) {
StopWatchNano timer(env_, false); PERF_TIMER_AUTO(find_next_user_entry_time);
StartPerfTimer(&timer);
FindNextUserEntryInternal(skipping); FindNextUserEntryInternal(skipping);
BumpPerfTime(&perf_context.find_next_user_entry_time, &timer); PERF_TIMER_STOP(find_next_user_entry_time);
} }
// Actual implementation of DBIter::FindNextUserEntry() // Actual implementation of DBIter::FindNextUserEntry()
@ -273,7 +207,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry num_skipped++; // skip this entry
BumpPerfCount(&perf_context.internal_key_skipped_count); PERF_COUNTER_ADD(internal_key_skipped_count, 1);
} else { } else {
skipping = false; skipping = false;
switch (ikey.type) { switch (ikey.type) {
@ -283,7 +217,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
saved_key_.SetUserKey(ikey.user_key); saved_key_.SetUserKey(ikey.user_key);
skipping = true; skipping = true;
num_skipped = 0; num_skipped = 0;
BumpPerfCount(&perf_context.internal_delete_skipped_count); PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break; break;
case kTypeValue: case kTypeValue:
valid_ = true; valid_ = true;
@ -488,10 +422,9 @@ void DBIter::Seek(const Slice& target) {
saved_key_.Clear(); saved_key_.Clear();
// now savved_key is used to store internal key. // now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_); saved_key_.SetInternalKey(target, sequence_);
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->Seek(saved_key_.GetKey()); iter_->Seek(saved_key_.GetKey());
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) { if (iter_->Valid()) {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
@ -504,10 +437,9 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToFirst(); iter_->SeekToFirst();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */); FindNextUserEntry(false /* not skipping */);
} else { } else {
@ -526,10 +458,9 @@ void DBIter::SeekToLast() {
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
StopWatchNano internal_seek_timer(env_, false); PERF_TIMER_AUTO(seek_internal_seek_time);
StartPerfTimer(&internal_seek_timer);
iter_->SeekToLast(); iter_->SeekToLast();
BumpPerfTime(&perf_context.seek_internal_seek_time, &internal_seek_timer); PERF_TIMER_STOP(seek_internal_seek_time);
FindPrevUserEntry(); FindPrevUserEntry();
} }

@ -65,7 +65,7 @@ void DBImpl::LogDBDeployStats() {
uint64_t file_total_size = 0; uint64_t file_total_size = 0;
uint32_t file_total_num = 0; uint32_t file_total_num = 0;
Version* current = versions_->current(); Version* current = default_cf_handle_->cfd()->current();
for (int i = 0; i < current->NumberLevels(); i++) { for (int i = 0; i < current->NumberLevels(); i++) {
file_total_num += current->NumLevelFiles(i); file_total_num += current->NumLevelFiles(i);
file_total_size += current->NumLevelBytes(i); file_total_size += current->NumLevelBytes(i);

File diff suppressed because it is too large Load Diff

@ -59,7 +59,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// decreasing sequence number // decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate) // decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
BumpPerfCount(&perf_context.user_key_comparison_count); PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) { if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
@ -79,7 +79,7 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
// decreasing sequence number // decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate) // decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(a.user_key, b.user_key); int r = user_comparator_->Compare(a.user_key, b.user_key);
BumpPerfCount(&perf_context.user_key_comparison_count); PERF_COUNTER_ADD(user_key_comparison_count, 1);
if (r == 0) { if (r == 0) {
if (a.sequence > b.sequence) { if (a.sequence > b.sequence) {
r = -1; r = -1;

@ -32,6 +32,9 @@ enum ValueType : unsigned char {
kTypeValue = 0x1, kTypeValue = 0x1,
kTypeMerge = 0x2, kTypeMerge = 0x2,
kTypeLogData = 0x3, kTypeLogData = 0x3,
kTypeColumnFamilyDeletion = 0x4,
kTypeColumnFamilyValue = 0x5,
kTypeColumnFamilyMerge = 0x6,
kMaxValue = 0x7F kMaxValue = 0x7F
}; };
@ -235,4 +238,74 @@ inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_; if (start_ != space_) delete[] start_;
} }
// Reusable key buffer. Short keys live in a small inline array; keys longer
// than the inline capacity move to a heap allocation that is grown on demand
// and reclaimed by Clear()/the destructor.
class IterKey {
 public:
  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}

  ~IterKey() { Clear(); }

  // Returns the stored key, or an empty slice when no buffer is present.
  Slice GetKey() const {
    if (key_ != nullptr) {
      return Slice(key_, key_size_);
    } else {
      return Slice();
    }
  }

  bool Valid() const { return key_ != nullptr; }

  // Frees any heap-allocated buffer and reverts to the inline buffer.
  void Clear() {
    if (key_ != nullptr && key_ != space_) {
      delete[] key_;
    }
    key_ = space_;
    // BUGFIX: record the inline buffer's full capacity. The original code
    // assigned sizeof(buf_size_) (i.e. sizeof(size_t)), which understated
    // the capacity of space_ and caused needless heap allocations for any
    // key longer than 8 bytes after a Clear().
    buf_size_ = sizeof(space_);
  }

  // Enlarge the buffer size if needed based on key_size.
  // By default, static allocated buffer is used. Once there is a key
  // larger than the static allocated buffer, another buffer is dynamically
  // allocated, until a larger key buffer is requested. In that case, we
  // reallocate buffer and delete the old one.
  void EnlargeBufferIfNeeded(size_t key_size) {
    // If size is smaller than buffer size, continue using current buffer,
    // or the static allocated one, as default
    if (key_size > buf_size_) {
      // Need to enlarge the buffer.
      Clear();
      key_ = new char[key_size];
      buf_size_ = key_size;
    }
    key_size_ = key_size;
  }

  // Copies a user key into the buffer.
  void SetUserKey(const Slice& user_key) {
    size_t size = user_key.size();
    EnlargeBufferIfNeeded(size);
    memcpy(key_, user_key.data(), size);
  }

  // Builds an internal key: user key followed by the packed
  // (sequence, value_type) trailer.
  void SetInternalKey(const Slice& user_key, SequenceNumber s,
                      ValueType value_type = kValueTypeForSeek) {
    size_t usize = user_key.size();
    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
    memcpy(key_, user_key.data(), usize);
    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
  }

  // Convenience overload: rebuilds the internal key from its parsed form.
  void SetInternalKey(const ParsedInternalKey& parsed_key) {
    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
  }

 private:
  char* key_;        // Points at space_ or at a heap allocation.
  size_t buf_size_;  // Capacity of the buffer key_ points at.
  size_t key_size_;  // Length of the key currently stored.
  char space_[32];   // Avoid allocation for short keys

  // No copying allowed
  IterKey(const IterKey&) = delete;
  void operator=(const IterKey&) = delete;
};
} // namespace rocksdb } // namespace rocksdb

@ -7,8 +7,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/internal_stats.h" #include "db/internal_stats.h"
#include "db/db_impl.h" #include "db/column_family.h"
#include "db/memtable_list.h"
#include <vector> #include <vector>
@ -44,10 +43,8 @@ DBPropertyType GetPropertyType(const Slice& property) {
bool InternalStats::GetProperty(DBPropertyType property_type, bool InternalStats::GetProperty(DBPropertyType property_type,
const Slice& property, std::string* value, const Slice& property, std::string* value,
DBImpl* db) { ColumnFamilyData* cfd) {
VersionSet* version_set = db->versions_.get(); Version* current = cfd->current();
Version* current = version_set->current();
const MemTableList& imm = db->imm_;
Slice in = property; Slice in = property;
switch (property_type) { switch (property_type) {
@ -110,7 +107,6 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL);
} }
// Pardon the long line but I think it is easier to read this way.
snprintf( snprintf(
buf, sizeof(buf), buf, sizeof(buf),
" Compactions\n" " Compactions\n"
@ -159,7 +155,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
"%9lu\n", "%9lu\n",
level, files, current->NumLevelBytes(level) / 1048576.0, level, files, current->NumLevelBytes(level) / 1048576.0,
current->NumLevelBytes(level) / current->NumLevelBytes(level) /
version_set->MaxBytesForLevel(level), cfd->compaction_picker()->MaxBytesForLevel(level),
compaction_stats_[level].micros / 1e6, compaction_stats_[level].micros / 1e6,
bytes_read / 1048576.0, bytes_read / 1048576.0,
compaction_stats_[level].bytes_written / 1048576.0, compaction_stats_[level].bytes_written / 1048576.0,
@ -334,11 +330,11 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
*value = current->DebugString(); *value = current->DebugString();
return true; return true;
case kNumImmutableMemTable: case kNumImmutableMemTable:
*value = std::to_string(imm.size()); *value = std::to_string(cfd->imm()->size());
return true; return true;
case kMemtableFlushPending: case kMemtableFlushPending:
// Return number of mem tables that are ready to flush (made immutable) // Return number of mem tables that are ready to flush (made immutable)
*value = std::to_string(imm.IsFlushPending() ? 1 : 0); *value = std::to_string(cfd->imm()->IsFlushPending() ? 1 : 0);
return true; return true;
case kCompactionPending: case kCompactionPending:
// 1 if the system already determines at least one compacdtion is needed. // 1 if the system already determines at least one compacdtion is needed.
@ -351,7 +347,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
return true; return true;
case kCurSizeActiveMemTable: case kCurSizeActiveMemTable:
// Current size of the active memtable // Current size of the active memtable
*value = std::to_string(db->mem_->ApproximateMemoryUsage()); *value = std::to_string(cfd->mem()->ApproximateMemoryUsage());
return true; return true;
default: default:
return false; return false;

@ -16,6 +16,8 @@
#include <vector> #include <vector>
#include <string> #include <string>
class ColumnFamilyData;
namespace rocksdb { namespace rocksdb {
class MemTableList; class MemTableList;
@ -126,7 +128,7 @@ class InternalStats {
uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
bool GetProperty(DBPropertyType property_type, const Slice& property, bool GetProperty(DBPropertyType property_type, const Slice& property,
std::string* value, DBImpl* db); std::string* value, ColumnFamilyData* cfd);
private: private:
std::vector<CompactionStats> compaction_stats_; std::vector<CompactionStats> compaction_stats_;

@ -29,7 +29,8 @@
namespace rocksdb { namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) MemTable::MemTable(const InternalKeyComparator& cmp,
const Options& options)
: comparator_(cmp), : comparator_(cmp),
refs_(0), refs_(0),
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
@ -42,7 +43,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
file_number_(0), file_number_(0),
first_seqno_(0), first_seqno_(0),
mem_next_logfile_number_(0), mem_next_logfile_number_(0),
mem_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks locks_(options.inplace_update_support ? options.inplace_update_num_locks
: 0), : 0),
prefix_extractor_(options.prefix_extractor.get()), prefix_extractor_(options.prefix_extractor.get()),
@ -142,6 +142,11 @@ Slice MemTableRep::UserKey(const char* key) const {
return Slice(slice.data(), slice.size() - 8); return Slice(slice.data(), slice.size() - 8);
} }
KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
*buf = arena_->Allocate(len);
return static_cast<KeyHandle>(*buf);
}
// Encode a suitable internal key target for "target" and return it. // Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point // Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space. // into this scratch space.
@ -243,7 +248,9 @@ void MemTable::Add(SequenceNumber s, ValueType type,
const size_t encoded_len = const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size + VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size; VarintLength(val_size) + val_size;
char* buf = arena_.Allocate(encoded_len); char* buf = nullptr;
KeyHandle handle = table_->Allocate(encoded_len, &buf);
assert(buf != nullptr);
char* p = EncodeVarint32(buf, internal_key_size); char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size); memcpy(p, key.data(), key_size);
p += key_size; p += key_size;
@ -252,7 +259,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
p = EncodeVarint32(p, val_size); p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size); memcpy(p, value.data(), val_size);
assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
table_->Insert(buf); table_->Insert(handle);
if (prefix_bloom_) { if (prefix_bloom_) {
assert(prefix_extractor_); assert(prefix_extractor_);
@ -370,8 +377,7 @@ static bool SaveValue(void* arg, const char* entry) {
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) { MergeContext& merge_context, const Options& options) {
StopWatchNano memtable_get_timer(options.env, false); PERF_TIMER_AUTO(get_from_memtable_time);
StartPerfTimer(&memtable_get_timer);
Slice user_key = key.user_key(); Slice user_key = key.user_key();
bool found_final_value = false; bool found_final_value = false;
@ -401,8 +407,8 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (!found_final_value && merge_in_progress) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(""); *s = Status::MergeInProgress("");
} }
BumpPerfTime(&perf_context.get_from_memtable_time, &memtable_get_timer); PERF_TIMER_STOP(get_from_memtable_time);
BumpPerfCount(&perf_context.get_from_memtable_count); PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value; return found_final_value;
} }

@ -13,7 +13,7 @@
#include <deque> #include <deque>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/skiplist.h" #include "db/skiplist.h"
#include "db/version_set.h" #include "db/version_edit.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "util/arena.h" #include "util/arena.h"
@ -39,7 +39,7 @@ class MemTable {
// MemTables are reference counted. The initial reference count // MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once. // is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator, explicit MemTable(const InternalKeyComparator& comparator,
const Options& options = Options()); const Options& options);
~MemTable(); ~MemTable();
@ -147,14 +147,6 @@ class MemTable {
// be flushed to storage // be flushed to storage
void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
// Returns the logfile number that can be safely deleted when this
// memstore is flushed to storage
uint64_t GetLogNumber() { return mem_logfile_number_; }
// Sets the logfile number that can be safely deleted when this
// memstore is flushed to storage
void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
// Notify the underlying storage that no more items will be added // Notify the underlying storage that no more items will be added
void MarkImmutable() { table_->MarkReadOnly(); } void MarkImmutable() { table_->MarkReadOnly(); }
@ -197,10 +189,6 @@ class MemTable {
// The log files earlier than this number can be deleted. // The log files earlier than this number can be deleted.
uint64_t mem_next_logfile_number_; uint64_t mem_next_logfile_number_;
// The log file that backs this memtable (to be deleted when
// memtable flush is done)
uint64_t mem_logfile_number_;
// rw locks for inplace updates // rw locks for inplace updates
std::vector<port::RWMutex> locks_; std::vector<port::RWMutex> locks_;

@ -8,9 +8,11 @@
#include <string> #include <string>
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/version_set.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/log_buffer.h"
namespace rocksdb { namespace rocksdb {
@ -120,7 +122,8 @@ void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
} }
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems, void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number, std::set<uint64_t>* pending_outputs) { uint64_t file_number,
std::set<uint64_t>* pending_outputs) {
assert(!mems.empty()); assert(!mems.empty());
// If the flush was not successful, then just reset state. // If the flush was not successful, then just reset state.
@ -140,10 +143,10 @@ void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
// Record a successful flush in the manifest file // Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults( Status MemTableList::InstallMemtableFlushResults(
const autovector<MemTable*>& mems, VersionSet* vset, ColumnFamilyData* cfd, const autovector<MemTable*>& mems, VersionSet* vset,
port::Mutex* mu, Logger* info_log, uint64_t file_number, port::Mutex* mu, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete, std::set<uint64_t>& pending_outputs, autovector<MemTable*>* to_delete,
Directory* db_directory) { Directory* db_directory, LogBuffer* log_buffer) {
mu->AssertHeld(); mu->AssertHeld();
// flush was sucessful // flush was sucessful
@ -173,12 +176,11 @@ Status MemTableList::InstallMemtableFlushResults(
break; break;
} }
Log(info_log, LogToBuffer(log_buffer, "Level-0 commit table #%lu started",
"Level-0 commit table #%lu started", (unsigned long)m->file_number_);
(unsigned long)m->file_number_);
// this can release and reacquire the mutex. // this can release and reacquire the mutex.
s = vset->LogAndApply(&m->edit_, mu, db_directory); s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory);
// we will be changing the version in the next code path, // we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable // so we better create a new one, since versions are immutable
@ -189,10 +191,8 @@ Status MemTableList::InstallMemtableFlushResults(
uint64_t mem_id = 1; // how many memtables has been flushed. uint64_t mem_id = 1; // how many memtables has been flushed.
do { do {
if (s.ok()) { // commit new state if (s.ok()) { // commit new state
Log(info_log, LogToBuffer(log_buffer, "Level-0 commit table #%lu: memtable #%lu done",
"Level-0 commit table #%lu: memtable #%lu done", (unsigned long)m->file_number_, (unsigned long)mem_id);
(unsigned long)m->file_number_,
(unsigned long)mem_id);
current_->Remove(m); current_->Remove(m);
assert(m->file_number_ > 0); assert(m->file_number_ > 0);

@ -7,19 +7,25 @@
#include <string> #include <string>
#include <list> #include <list>
#include <vector>
#include <set>
#include <deque> #include <deque>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/iterator.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/memtable.h"
#include "db/skiplist.h" #include "db/skiplist.h"
#include "rocksdb/db.h" #include "db/memtable.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/log_buffer.h"
namespace rocksdb { namespace rocksdb {
class ColumnFamilyData;
class InternalKeyComparator; class InternalKeyComparator;
class Mutex; class Mutex;
@ -99,12 +105,14 @@ class MemTableList {
std::set<uint64_t>* pending_outputs); std::set<uint64_t>* pending_outputs);
// Commit a successful flush in the manifest file // Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(const autovector<MemTable*>& m, Status InstallMemtableFlushResults(ColumnFamilyData* cfd,
const autovector<MemTable*>& m,
VersionSet* vset, port::Mutex* mu, VersionSet* vset, port::Mutex* mu,
Logger* info_log, uint64_t file_number, Logger* info_log, uint64_t file_number,
std::set<uint64_t>& pending_outputs, std::set<uint64_t>& pending_outputs,
autovector<MemTable*>* to_delete, autovector<MemTable*>* to_delete,
Directory* db_directory); Directory* db_directory,
LogBuffer* log_buffer);
// New memtables are inserted at the front of the list. // New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add(). // Takes ownership of the referenced held on *m by the caller of Add().

@ -429,6 +429,48 @@ TEST(PlainTableDBTest, Iterator) {
} }
} }
// Builds a key consisting of `length` repetitions of character `c`.
std::string MakeLongKey(size_t length, char c) {
  std::string key;
  key.assign(length, c);
  return key;
}
// Verifies that iteration returns keys larger than the 16-byte huge-key
// threshold configured on the total-order plain table, in order and with
// their matching values.
TEST(PlainTableDBTest, IteratorLargeKeys) {
  Options opts = CurrentOptions();
  opts.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
  opts.create_if_missing = true;
  opts.prefix_extractor.reset();
  DestroyAndReopen(&opts);

  // Seven keys of assorted lengths, several beyond the 16-byte threshold.
  std::string keys[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'),
                        MakeLongKey(32, '2'), MakeLongKey(60, '3'),
                        MakeLongKey(90, '4'), MakeLongKey(50, '5'),
                        MakeLongKey(26, '6')};

  for (size_t idx = 0; idx < 7; idx++) {
    ASSERT_OK(Put(keys[idx], std::to_string(idx)));
  }

  dbfull()->TEST_FlushMemTable();

  Iterator* it = dbfull()->NewIterator(ro_);
  it->Seek(keys[0]);
  for (size_t idx = 0; idx < 7; idx++) {
    ASSERT_TRUE(it->Valid());
    ASSERT_EQ(keys[idx], it->key().ToString());
    ASSERT_EQ(std::to_string(idx), it->value().ToString());
    it->Next();
  }
  ASSERT_TRUE(!it->Valid());
  delete it;
}
// A test comparator which compare two strings in this way: // A test comparator which compare two strings in this way:
// (1) first compare prefix of 8 bytes in alphabet order, // (1) first compare prefix of 8 bytes in alphabet order,
// (2) if two strings share the same prefix, sort the other part of the string // (2) if two strings share the same prefix, sort the other part of the string

@ -55,14 +55,20 @@ class Repairer {
icmp_(options.comparator), icmp_(options.comparator),
ipolicy_(options.filter_policy), ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
raw_table_cache_(
// TableCache can be small since we expect each table to be opened
// once.
NewLRUCache(10, options_.table_cache_numshardbits,
options_.table_cache_remove_scan_count_limit)),
next_file_number_(1) { next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once. table_cache_ = new TableCache(dbname_, &options_, storage_options_,
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); raw_table_cache_.get());
edit_ = new VersionEdit(); edit_ = new VersionEdit();
} }
~Repairer() { ~Repairer() {
delete table_cache_; delete table_cache_;
raw_table_cache_.reset();
delete edit_; delete edit_;
} }
@ -102,6 +108,7 @@ class Repairer {
InternalKeyComparator const icmp_; InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_; InternalFilterPolicy const ipolicy_;
Options const options_; Options const options_;
std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_; TableCache* table_cache_;
VersionEdit* edit_; VersionEdit* edit_;
@ -197,6 +204,7 @@ class Repairer {
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_); MemTable* mem = new MemTable(icmp_, options_);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
mem->Ref(); mem->Ref();
int counter = 0; int counter = 0;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {
@ -206,7 +214,7 @@ class Repairer {
continue; continue;
} }
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem, &options_); status = WriteBatchInternal::InsertInto(&batch, cf_mems_default);
if (status.ok()) { if (status.ok()) {
counter += WriteBatchInternal::Count(&batch); counter += WriteBatchInternal::Count(&batch);
} else { } else {
@ -226,6 +234,7 @@ class Repairer {
iter, &meta, icmp_, 0, 0, kNoCompression); iter, &meta, icmp_, 0, 0, kNoCompression);
delete iter; delete iter;
delete mem->Unref(); delete mem->Unref();
delete cf_mems_default;
mem = nullptr; mem = nullptr;
if (status.ok()) { if (status.ok()) {
if (meta.file_size > 0) { if (meta.file_size > 0) {

@ -35,18 +35,13 @@ static Slice GetSliceForFileNumber(uint64_t* file_number) {
sizeof(*file_number)); sizeof(*file_number));
} }
TableCache::TableCache(const std::string& dbname, TableCache::TableCache(const std::string& dbname, const Options* options,
const Options* options, const EnvOptions& storage_options, Cache* const cache)
const EnvOptions& storage_options,
int entries)
: env_(options->env), : env_(options->env),
dbname_(dbname), dbname_(dbname),
options_(options), options_(options),
storage_options_(storage_options), storage_options_(storage_options),
cache_( cache_(cache) {}
NewLRUCache(entries, options->table_cache_numshardbits,
options->table_cache_remove_scan_count_limit)) {
}
TableCache::~TableCache() { TableCache::~TableCache() {
} }
@ -124,7 +119,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
TableReader* table_reader = GetTableReaderFromHandle(handle); TableReader* table_reader = GetTableReaderFromHandle(handle);
Iterator* result = table_reader->NewIterator(options); Iterator* result = table_reader->NewIterator(options);
if (!file_meta.table_reader_handle) { if (!file_meta.table_reader_handle) {
result->RegisterCleanup(&UnrefEntry, cache_.get(), handle); result->RegisterCleanup(&UnrefEntry, cache_, handle);
} }
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
*table_reader_ptr = table_reader; *table_reader_ptr = table_reader;
@ -216,8 +211,8 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
return may_match; return may_match;
} }
void TableCache::Evict(uint64_t file_number) { void TableCache::Evict(Cache* cache, uint64_t file_number) {
cache_->Erase(GetSliceForFileNumber(&file_number)); cache->Erase(GetSliceForFileNumber(&file_number));
} }
} // namespace rocksdb } // namespace rocksdb

@ -30,7 +30,7 @@ struct FileMetaData;
class TableCache { class TableCache {
public: public:
TableCache(const std::string& dbname, const Options* options, TableCache(const std::string& dbname, const Options* options,
const EnvOptions& storage_options, int entries); const EnvOptions& storage_options, Cache* cache);
~TableCache(); ~TableCache();
// Return an iterator for the specified file number (the corresponding // Return an iterator for the specified file number (the corresponding
@ -64,7 +64,7 @@ class TableCache {
const Slice& internal_prefix, bool* table_io); const Slice& internal_prefix, bool* table_io);
// Evict any entry for the specified file number // Evict any entry for the specified file number
void Evict(uint64_t file_number); static void Evict(Cache* cache, uint64_t file_number);
// Find table reader // Find table reader
Status FindTable(const EnvOptions& toptions, Status FindTable(const EnvOptions& toptions,
@ -95,7 +95,7 @@ class TableCache {
const std::string dbname_; const std::string dbname_;
const Options* options_; const Options* options_;
const EnvOptions& storage_options_; const EnvOptions& storage_options_;
std::shared_ptr<Cache> cache_; Cache* const cache_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -8,15 +8,19 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/column_family.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
namespace rocksdb { namespace rocksdb {
TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options, TailingIterator::TailingIterator(DBImpl* db, const ReadOptions& options,
const Comparator* comparator) ColumnFamilyData* cfd)
: db_(db), options_(options), comparator_(comparator), : db_(db),
version_number_(0), current_(nullptr), options_(options),
cfd_(cfd),
version_number_(0),
current_(nullptr),
status_(Status::InvalidArgument("Seek() not called on this iterator")) {} status_(Status::InvalidArgument("Seek() not called on this iterator")) {}
bool TailingIterator::Valid() const { bool TailingIterator::Valid() const {
@ -53,10 +57,9 @@ void TailingIterator::Seek(const Slice& target) {
// 'target' -- in this case, prev_key_ is included in the interval, so // 'target' -- in this case, prev_key_ is included in the interval, so
// prev_inclusive_ has to be set. // prev_inclusive_ has to be set.
if (!is_prev_set_ || const Comparator* cmp = cfd_->user_comparator();
comparator_->Compare(prev_key_, target) >= !is_prev_inclusive_ || if (!is_prev_set_ || cmp->Compare(prev_key_, target) >= !is_prev_inclusive_ ||
(immutable_->Valid() && (immutable_->Valid() && cmp->Compare(target, immutable_->key()) > 0) ||
comparator_->Compare(target, immutable_->key()) > 0) ||
(options_.prefix_seek && !IsSamePrefix(target))) { (options_.prefix_seek && !IsSamePrefix(target))) {
SeekImmutable(target); SeekImmutable(target);
} }
@ -121,7 +124,7 @@ void TailingIterator::SeekToLast() {
void TailingIterator::CreateIterators() { void TailingIterator::CreateIterators() {
std::pair<Iterator*, Iterator*> iters = std::pair<Iterator*, Iterator*> iters =
db_->GetTailingIteratorPair(options_, &version_number_); db_->GetTailingIteratorPair(options_, cfd_, &version_number_);
assert(iters.first && iters.second); assert(iters.first && iters.second);
@ -137,9 +140,10 @@ void TailingIterator::UpdateCurrent() {
if (mutable_->Valid()) { if (mutable_->Valid()) {
current_ = mutable_.get(); current_ = mutable_.get();
} }
const Comparator* cmp = cfd_->user_comparator();
if (immutable_->Valid() && if (immutable_->Valid() &&
(current_ == nullptr || (current_ == nullptr ||
comparator_->Compare(immutable_->key(), current_->key()) < 0)) { cmp->Compare(immutable_->key(), current_->key()) < 0)) {
current_ = immutable_.get(); current_ = immutable_.get();
} }
@ -151,11 +155,11 @@ void TailingIterator::UpdateCurrent() {
bool TailingIterator::IsCurrentVersion() const { bool TailingIterator::IsCurrentVersion() const {
return mutable_ != nullptr && immutable_ != nullptr && return mutable_ != nullptr && immutable_ != nullptr &&
version_number_ == db_->CurrentVersionNumber(); version_number_ == cfd_->GetSuperVersionNumber();
} }
bool TailingIterator::IsSamePrefix(const Slice& target) const { bool TailingIterator::IsSamePrefix(const Slice& target) const {
const SliceTransform* extractor = db_->options_.prefix_extractor.get(); const SliceTransform* extractor = cfd_->options()->prefix_extractor.get();
assert(extractor); assert(extractor);
assert(is_prev_set_); assert(is_prev_set_);

@ -13,6 +13,7 @@
namespace rocksdb { namespace rocksdb {
class DBImpl; class DBImpl;
class ColumnFamilyData;
/** /**
* TailingIterator is a special type of iterator that doesn't use an (implicit) * TailingIterator is a special type of iterator that doesn't use an (implicit)
@ -25,7 +26,7 @@ class DBImpl;
class TailingIterator : public Iterator { class TailingIterator : public Iterator {
public: public:
TailingIterator(DBImpl* db, const ReadOptions& options, TailingIterator(DBImpl* db, const ReadOptions& options,
const Comparator* comparator); ColumnFamilyData* cfd);
virtual ~TailingIterator() {} virtual ~TailingIterator() {}
virtual bool Valid() const override; virtual bool Valid() const override;
@ -41,7 +42,7 @@ class TailingIterator : public Iterator {
private: private:
DBImpl* const db_; DBImpl* const db_;
const ReadOptions options_; const ReadOptions options_;
const Comparator* const comparator_; ColumnFamilyData* const cfd_;
uint64_t version_number_; uint64_t version_number_;
// TailingIterator merges the contents of the two iterators below (one using // TailingIterator merges the contents of the two iterators below (one using

@ -9,7 +9,7 @@
namespace rocksdb { namespace rocksdb {
TransactionLogIteratorImpl::TransactionLogIteratorImpl( TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const Options* options, const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options, const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq, const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl) std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)

@ -67,7 +67,7 @@ class LogFileImpl : public LogFile {
class TransactionLogIteratorImpl : public TransactionLogIterator { class TransactionLogIteratorImpl : public TransactionLogIterator {
public: public:
TransactionLogIteratorImpl( TransactionLogIteratorImpl(
const std::string& dir, const Options* options, const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options, const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seqNum, const EnvOptions& soptions, const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl); std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
@ -82,7 +82,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
private: private:
const std::string& dir_; const std::string& dir_;
const Options* options_; const DBOptions* options_;
const TransactionLogIterator::ReadOptions read_options_; const TransactionLogIterator::ReadOptions read_options_;
const EnvOptions& soptions_; const EnvOptions& soptions_;
SequenceNumber startingSequenceNumber_; SequenceNumber startingSequenceNumber_;

@ -11,6 +11,7 @@
#include "db/version_set.h" #include "db/version_set.h"
#include "util/coding.h" #include "util/coding.h"
#include "rocksdb/slice.h"
namespace rocksdb { namespace rocksdb {
@ -29,6 +30,11 @@ enum Tag {
// these are new formats divergent from open source leveldb // these are new formats divergent from open source leveldb
kNewFile2 = 100, // store smallest & largest seqno kNewFile2 = 100, // store smallest & largest seqno
kColumnFamily = 200, // specify column family for version edit
kColumnFamilyAdd = 201,
kColumnFamilyDrop = 202,
kMaxColumnFamily = 203,
}; };
void VersionEdit::Clear() { void VersionEdit::Clear() {
@ -38,13 +44,19 @@ void VersionEdit::Clear() {
prev_log_number_ = 0; prev_log_number_ = 0;
last_sequence_ = 0; last_sequence_ = 0;
next_file_number_ = 0; next_file_number_ = 0;
max_column_family_ = 0;
has_comparator_ = false; has_comparator_ = false;
has_log_number_ = false; has_log_number_ = false;
has_prev_log_number_ = false; has_prev_log_number_ = false;
has_next_file_number_ = false; has_next_file_number_ = false;
has_last_sequence_ = false; has_last_sequence_ = false;
has_max_column_family_ = false;
deleted_files_.clear(); deleted_files_.clear();
new_files_.clear(); new_files_.clear();
column_family_ = 0;
is_column_family_add_ = 0;
is_column_family_drop_ = 0;
column_family_name_.clear();
} }
void VersionEdit::EncodeTo(std::string* dst) const { void VersionEdit::EncodeTo(std::string* dst) const {
@ -68,6 +80,10 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint32(dst, kLastSequence); PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_); PutVarint64(dst, last_sequence_);
} }
if (has_max_column_family_) {
PutVarint32(dst, kMaxColumnFamily);
PutVarint32(dst, max_column_family_);
}
for (const auto& deleted : deleted_files_) { for (const auto& deleted : deleted_files_) {
PutVarint32(dst, kDeletedFile); PutVarint32(dst, kDeletedFile);
@ -86,6 +102,21 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, f.smallest_seqno); PutVarint64(dst, f.smallest_seqno);
PutVarint64(dst, f.largest_seqno); PutVarint64(dst, f.largest_seqno);
} }
// 0 is default and does not need to be explicitly written
if (column_family_ != 0) {
PutVarint32(dst, kColumnFamily);
PutVarint32(dst, column_family_);
}
if (is_column_family_add_) {
PutVarint32(dst, kColumnFamilyAdd);
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
}
if (is_column_family_drop_) {
PutVarint32(dst, kColumnFamilyDrop);
}
} }
static bool GetInternalKey(Slice* input, InternalKey* dst) { static bool GetInternalKey(Slice* input, InternalKey* dst) {
@ -167,6 +198,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
} }
break; break;
case kMaxColumnFamily:
if (GetVarint32(&input, &max_column_family_)) {
has_max_column_family_ = true;
} else {
msg = "max column family";
}
break;
case kCompactPointer: case kCompactPointer:
if (GetLevel(&input, &level, &msg) && if (GetLevel(&input, &level, &msg) &&
GetInternalKey(&input, &key)) { GetInternalKey(&input, &key)) {
@ -221,6 +260,29 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
} }
break; break;
case kColumnFamily:
if (!GetVarint32(&input, &column_family_)) {
if (!msg) {
msg = "set column family id";
}
}
break;
case kColumnFamilyAdd:
if (GetLengthPrefixedSlice(&input, &str)) {
is_column_family_add_ = true;
column_family_name_ = str.ToString();
} else {
if (!msg) {
msg = "column family add";
}
}
break;
case kColumnFamilyDrop:
is_column_family_drop_ = true;
break;
default: default:
msg = "unknown tag"; msg = "unknown tag";
break; break;
@ -282,6 +344,19 @@ std::string VersionEdit::DebugString(bool hex_key) const {
r.append(" .. "); r.append(" .. ");
r.append(f.largest.DebugString(hex_key)); r.append(f.largest.DebugString(hex_key));
} }
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);
if (is_column_family_add_) {
r.append("\n ColumnFamilyAdd: ");
r.append(column_family_name_);
}
if (is_column_family_drop_) {
r.append("\n ColumnFamilyDrop");
}
if (has_max_column_family_) {
r.append("\n MaxColumnFamily: ");
AppendNumberTo(&r, max_column_family_);
}
r.append("\n}\n"); r.append("\n}\n");
return r; return r;
} }

@ -11,6 +11,7 @@
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <string>
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "db/dbformat.h" #include "db/dbformat.h"
@ -32,11 +33,14 @@ struct FileMetaData {
// Needs to be disposed when refs becomes 0. // Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle; Cache::Handle* table_reader_handle;
FileMetaData(uint64_t number, uint64_t file_size) : FileMetaData(uint64_t number, uint64_t file_size)
refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size), : refs(0),
being_compacted(false), table_reader_handle(nullptr) { allowed_seeks(1 << 30),
} number(number),
FileMetaData() : FileMetaData(0, 0) { } file_size(file_size),
being_compacted(false),
table_reader_handle(nullptr) {}
FileMetaData() : FileMetaData(0, 0) {}
}; };
class VersionEdit { class VersionEdit {
@ -66,6 +70,10 @@ class VersionEdit {
has_last_sequence_ = true; has_last_sequence_ = true;
last_sequence_ = seq; last_sequence_ = seq;
} }
void SetMaxColumnFamily(uint32_t max_column_family) {
has_max_column_family_ = true;
max_column_family_ = max_column_family;
}
// Add the specified file at the specified number. // Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
@ -97,6 +105,31 @@ class VersionEdit {
return new_files_.size() + deleted_files_.size(); return new_files_.size() + deleted_files_.size();
} }
bool IsColumnFamilyManipulation() {
return is_column_family_add_ || is_column_family_drop_;
}
void SetColumnFamily(uint32_t column_family_id) {
column_family_ = column_family_id;
}
// set column family ID by calling SetColumnFamily()
void AddColumnFamily(const std::string& name) {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_add_ = true;
column_family_name_ = name;
}
// set column family ID by calling SetColumnFamily()
void DropColumnFamily() {
assert(!is_column_family_drop_);
assert(!is_column_family_add_);
assert(NumEntries() == 0);
is_column_family_drop_ = true;
}
void EncodeTo(std::string* dst) const; void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src); Status DecodeFrom(const Slice& src);
@ -114,15 +147,27 @@ class VersionEdit {
uint64_t log_number_; uint64_t log_number_;
uint64_t prev_log_number_; uint64_t prev_log_number_;
uint64_t next_file_number_; uint64_t next_file_number_;
uint32_t max_column_family_;
SequenceNumber last_sequence_; SequenceNumber last_sequence_;
bool has_comparator_; bool has_comparator_;
bool has_log_number_; bool has_log_number_;
bool has_prev_log_number_; bool has_prev_log_number_;
bool has_next_file_number_; bool has_next_file_number_;
bool has_last_sequence_; bool has_last_sequence_;
bool has_max_column_family_;
DeletedFileSet deleted_files_; DeletedFileSet deleted_files_;
std::vector<std::pair<int, FileMetaData> > new_files_; std::vector<std::pair<int, FileMetaData>> new_files_;
// Each version edit record should have column_family_id set
// If it's not set, it is default (0)
uint32_t column_family_;
// a version edit can be either column_family add or
// column_family drop. If it's column family add,
// it also includes column family name.
bool is_column_family_drop_;
bool is_column_family_add_;
std::string column_family_name_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -45,6 +45,19 @@ TEST(VersionEditTest, EncodeDecode) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
} }
TEST(VersionEditTest, ColumnFamilyTest) {
VersionEdit edit;
edit.SetColumnFamily(2);
edit.AddColumnFamily("column_family");
edit.SetMaxColumnFamily(5);
TestEncodeDecode(edit);
edit.Clear();
edit.SetColumnFamily(3);
edit.DropColumnFamily();
TestEncodeDecode(edit);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

File diff suppressed because it is too large Load Diff

@ -24,12 +24,15 @@
#include <vector> #include <vector>
#include <deque> #include <deque>
#include <atomic> #include <atomic>
#include <limits>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "port/port.h" #include "port/port.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/compaction.h" #include "db/compaction.h"
#include "db/compaction_picker.h" #include "db/compaction_picker.h"
#include "db/column_family.h"
#include "db/log_reader.h"
namespace rocksdb { namespace rocksdb {
@ -41,10 +44,12 @@ class Iterator;
class LogBuffer; class LogBuffer;
class LookupKey; class LookupKey;
class MemTable; class MemTable;
class MergeContext;
class TableCache;
class Version; class Version;
class VersionSet; class VersionSet;
class MergeContext;
class ColumnFamilyData;
class ColumnFamilySet;
class TableCache;
// Return the smallest index i such that files[i]->largest >= key. // Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file. // Return files.size() if there is no such file.
@ -208,6 +213,7 @@ class Version {
friend class Compaction; friend class Compaction;
friend class VersionSet; friend class VersionSet;
friend class DBImpl; friend class DBImpl;
friend class ColumnFamilyData;
friend class CompactionPicker; friend class CompactionPicker;
friend class LevelCompactionPicker; friend class LevelCompactionPicker;
friend class UniversalCompactionPicker; friend class UniversalCompactionPicker;
@ -223,6 +229,7 @@ class Version {
// record results in files_by_size_. The largest files are listed first. // record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(); void UpdateFilesBySize();
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list Version* prev_; // Previous version in linked list
@ -268,7 +275,7 @@ class Version {
// used for debugging and logging purposes only. // used for debugging and logging purposes only.
uint64_t version_number_; uint64_t version_number_;
explicit Version(VersionSet* vset, uint64_t version_number = 0); Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
~Version(); ~Version();
@ -285,22 +292,29 @@ class Version {
class VersionSet { class VersionSet {
public: public:
VersionSet(const std::string& dbname, const Options* options, VersionSet(const std::string& dbname, const DBOptions* options,
const EnvOptions& storage_options, TableCache* table_cache, const EnvOptions& storage_options, Cache* table_cache);
const InternalKeyComparator*);
~VersionSet(); ~VersionSet();
// Apply *edit to the current version to form a new descriptor that // Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new // is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file. // current version. Will release *mu while actually writing to the file.
// column_family_options has to be set if edit is column family add
// REQUIRES: *mu is held on entry. // REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply() // REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu, Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit,
Directory* db_directory = nullptr, port::Mutex* mu, Directory* db_directory = nullptr,
bool new_descriptor_log = false); bool new_descriptor_log = false,
const ColumnFamilyOptions* column_family_options =
nullptr);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.
Status Recover(); Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families);
// Reads a manifest file and returns a list of column families in
// column_families.
static Status ListColumnFamilies(std::vector<std::string>* column_families,
const std::string& dbname, Env* env);
// Try to reduce the number of levels. This call is valid when // Try to reduce the number of levels. This call is valid when
// only one level from the new max level to the old // only one level from the new max level to the old
@ -316,15 +330,6 @@ class VersionSet {
const EnvOptions& storage_options, const EnvOptions& storage_options,
int new_levels); int new_levels);
// Return the current version.
Version* current() const { return current_; }
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
// Return the current manifest file number // Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; } uint64_t ManifestFileNumber() const { return manifest_file_number_; }
@ -358,37 +363,21 @@ class VersionSet {
// Mark the specified file number as used. // Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number); void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently // Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file. // being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; } uint64_t PrevLogNumber() const { return prev_log_number_; }
int NumberLevels() const { return num_levels_; } // Returns the minimum log number such that all
// log numbers less than or equal to it can be deleted
// Pick level and inputs for a new compaction. uint64_t MinLogNumber() const {
// Returns nullptr if there is no compaction to be done. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
// Otherwise returns a pointer to a heap-allocated object that for (auto cfd : *column_family_set_) {
// describes the compaction. Caller should delete the result. if (min_log_num > cfd->GetLogNumber()) {
Compaction* PickCompaction(LogBuffer* log_buffer); min_log_num = cfd->GetLogNumber();
}
// Return a compaction object for compacting the range [begin,end] in }
// the specified level. Returns nullptr if there is nothing in that return min_log_num;
// level that overlaps the specified range. Caller should delete }
// the result.
//
// The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range,
// compaction_end will be set to nullptr.
// Client is responsible for compaction_end storage -- when called,
// *compaction_end should point to valid InternalKey!
Compaction* CompactRange(int input_level,
int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
// Create an iterator that reads over the compaction inputs for "*c". // Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed. // The caller should delete the iterator when no longer needed.
@ -414,62 +403,53 @@ class VersionSet {
// pick the same files to compact. // pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c); bool VerifyCompactionFileConsistency(Compaction* c);
double MaxBytesForLevel(int level); Status GetMetadataForFile(uint64_t number, int* filelevel,
FileMetaData** metadata, ColumnFamilyData** cfd);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level);
void ReleaseCompactionFiles(Compaction* c, Status status);
Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData **metadata);
void GetLiveFilesMetaData( void GetLiveFilesMetaData(
std::vector<LiveFileMetaData> *metadata); std::vector<LiveFileMetaData> *metadata);
void GetObsoleteFiles(std::vector<FileMetaData*>* files); void GetObsoleteFiles(std::vector<FileMetaData*>* files);
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
private: private:
class Builder; class Builder;
struct ManifestWriter; struct ManifestWriter;
friend class Compaction;
friend class Version; friend class Version;
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
if (this->status->ok()) *this->status = s;
}
};
// Save current contents to *log // Save current contents to *log
Status WriteSnapshot(log::Writer* log); Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v); void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
bool ManifestContains(uint64_t manifest_file_number, bool ManifestContains(uint64_t manifest_file_number,
const std::string& record) const; const std::string& record) const;
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
VersionEdit* edit);
std::unique_ptr<ColumnFamilySet> column_family_set_;
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* const options_; const DBOptions* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_; uint64_t next_file_number_;
uint64_t manifest_file_number_; uint64_t manifest_file_number_;
uint64_t pending_manifest_file_number_; uint64_t pending_manifest_file_number_;
std::atomic<uint64_t> last_sequence_; std::atomic<uint64_t> last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
int num_levels_;
// Opened lazily // Opened lazily
unique_ptr<log::Writer> descriptor_log_; unique_ptr<log::Writer> descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
// generates a increasing version number for every new version // generates a increasing version number for every new version
uint64_t current_version_number_; uint64_t current_version_number_;
@ -493,8 +473,9 @@ class VersionSet {
VersionSet(const VersionSet&); VersionSet(const VersionSet&);
void operator=(const VersionSet&); void operator=(const VersionSet&);
void LogAndApplyHelper(Builder*b, Version* v, void LogAndApplyCFHelper(VersionEdit* edit);
VersionEdit* edit, port::Mutex* mu); void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v,
VersionEdit* edit, port::Mutex* mu);
}; };
} // namespace rocksdb } // namespace rocksdb

@ -15,6 +15,9 @@
// kTypeValue varstring varstring // kTypeValue varstring varstring
// kTypeMerge varstring varstring // kTypeMerge varstring varstring
// kTypeDeletion varstring // kTypeDeletion varstring
// kTypeColumnFamilyValue varint32 varstring varstring
// kTypeColumnFamilyMerge varint32 varstring varstring
// kTypeColumnFamilyDeletion varint32 varstring varstring
// varstring := // varstring :=
// len: varint32 // len: varint32
// data: uint8[len] // data: uint8[len]
@ -45,10 +48,20 @@ WriteBatch::~WriteBatch() { }
WriteBatch::Handler::~Handler() { } WriteBatch::Handler::~Handler() { }
void WriteBatch::Handler::Put(const Slice& key, const Slice& value) {
// you need to either implement Put or PutCF
throw std::runtime_error("Handler::Put not implemented!");
}
void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
throw std::runtime_error("Handler::Merge not implemented!"); throw std::runtime_error("Handler::Merge not implemented!");
} }
void WriteBatch::Handler::Delete(const Slice& key) {
// you need to either implement Delete or DeleteCF
throw std::runtime_error("Handler::Delete not implemented!");
}
void WriteBatch::Handler::LogData(const Slice& blob) { void WriteBatch::Handler::LogData(const Slice& blob) {
// If the user has not specified something to do with blobs, then we ignore // If the user has not specified something to do with blobs, then we ignore
// them. // them.
@ -76,31 +89,48 @@ Status WriteBatch::Iterate(Handler* handler) const {
input.remove_prefix(kHeader); input.remove_prefix(kHeader);
Slice key, value, blob; Slice key, value, blob;
int found = 0; int found = 0;
while (!input.empty() && handler->Continue()) { Status s;
while (s.ok() && !input.empty() && handler->Continue()) {
char tag = input[0]; char tag = input[0];
input.remove_prefix(1); input.remove_prefix(1);
uint32_t column_family = 0; // default
switch (tag) { switch (tag) {
case kTypeColumnFamilyValue:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Put");
}
// intentional fallthrough
case kTypeValue: case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) && if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) { GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value); s = handler->PutCF(column_family, key, value);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Put"); return Status::Corruption("bad WriteBatch Put");
} }
break; break;
case kTypeColumnFamilyDeletion:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Delete");
}
// intentional fallthrough
case kTypeDeletion: case kTypeDeletion:
if (GetLengthPrefixedSlice(&input, &key)) { if (GetLengthPrefixedSlice(&input, &key)) {
handler->Delete(key); s = handler->DeleteCF(column_family, key);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Delete"); return Status::Corruption("bad WriteBatch Delete");
} }
break; break;
case kTypeColumnFamilyMerge:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Merge");
}
// intentional fallthrough
case kTypeMerge: case kTypeMerge:
if (GetLengthPrefixedSlice(&input, &key) && if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) { GetLengthPrefixedSlice(&input, &value)) {
handler->Merge(key, value); s = handler->MergeCF(column_family, key, value);
found++; found++;
} else { } else {
return Status::Corruption("bad WriteBatch Merge"); return Status::Corruption("bad WriteBatch Merge");
@ -117,7 +147,10 @@ Status WriteBatch::Iterate(Handler* handler) const {
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
} }
} }
if (found != WriteBatchInternal::Count(this)) { if (!s.ok()) {
return s;
}
if (found != WriteBatchInternal::Count(this)) {
return Status::Corruption("WriteBatch has wrong count"); return Status::Corruption("WriteBatch has wrong count");
} else { } else {
return Status::OK(); return Status::OK();
@ -140,29 +173,76 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq); EncodeFixed64(&b->rep_[0], seq);
} }
void WriteBatch::Put(const Slice& key, const Slice& value) { void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeValue));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
void WriteBatch::Put(const SliceParts& key, const SliceParts& value) { void WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeValue));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSliceParts(&rep_, key); PutLengthPrefixedSliceParts(&rep_, key);
PutLengthPrefixedSliceParts(&rep_, value); PutLengthPrefixedSliceParts(&rep_, value);
} }
void WriteBatch::Delete(const Slice& key) { void WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeDeletion));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
} }
void WriteBatch::Merge(const Slice& key, const Slice& value) { void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeMerge)); if (column_family_id == 0) {
rep_.push_back(static_cast<char>(kTypeMerge));
} else {
rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
PutVarint32(&rep_, column_family_id);
}
PutLengthPrefixedSlice(&rep_, key); PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value); PutLengthPrefixedSlice(&rep_, value);
} }
@ -176,33 +256,70 @@ namespace {
class MemTableInserter : public WriteBatch::Handler { class MemTableInserter : public WriteBatch::Handler {
public: public:
SequenceNumber sequence_; SequenceNumber sequence_;
MemTable* mem_; ColumnFamilyMemTables* cf_mems_;
const Options* options_; bool recovery_;
uint64_t log_number_;
DBImpl* db_; DBImpl* db_;
const bool filter_deletes_; const bool dont_filter_deletes_;
MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
DB* db, const bool filter_deletes) bool recovery, uint64_t log_number, DB* db,
: sequence_(sequence), const bool dont_filter_deletes)
mem_(mem), : sequence_(sequence),
options_(opts), cf_mems_(cf_mems),
db_(reinterpret_cast<DBImpl*>(db)), recovery_(recovery),
filter_deletes_(filter_deletes) { log_number_(log_number),
assert(mem_); db_(reinterpret_cast<DBImpl*>(db)),
if (filter_deletes_) { dont_filter_deletes_(dont_filter_deletes) {
assert(options_); assert(cf_mems);
if (!dont_filter_deletes_) {
assert(db_); assert(db_);
} }
} }
virtual void Put(const Slice& key, const Slice& value) { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
if (!options_->inplace_update_support) { bool found = cf_mems_->Seek(column_family_id);
mem_->Add(sequence_, kTypeValue, key, value); if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
} else if (options_->inplace_callback == nullptr) { // if in recovery envoronment:
mem_->Update(sequence_, key, value); // * If column family was not found, it might mean that the WAL write
RecordTick(options_->statistics.get(), NUMBER_KEYS_UPDATED); // batch references to the column family that was dropped after the
// insert. We don't want to fail the whole write batch in that case -- we
// just ignore the update.
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column
// family already contains updates from this log. We can't apply updates
// twice because of update-in-place or merge workloads -- ignore the
// update
*s = Status::OK();
return false;
}
if (!found) {
assert(!recovery_);
// If the column family was not found in non-recovery enviornment
// (client's write code-path), we have to fail the write and return
// the failure status to the client.
*s = Status::InvalidArgument(
"Invalid column family specified in write batch");
return false;
}
return true;
}
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
if (!options->inplace_update_support) {
mem->Add(sequence_, kTypeValue, key, value);
} else if (options->inplace_callback == nullptr) {
mem->Update(sequence_, key, value);
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
} else { } else {
if (mem_->UpdateCallback(sequence_, key, value, *options_)) { if (mem->UpdateCallback(sequence_, key, value, *options)) {
} else { } else {
// key not found in memtable. Do sst get, update, add // key not found in memtable. Do sst get, update, add
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
@ -212,21 +329,26 @@ class MemTableInserter : public WriteBatch::Handler {
std::string prev_value; std::string prev_value;
std::string merged_value; std::string merged_value;
Status s = db_->Get(ropts, key, &prev_value);
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
Status s = db_->Get(ropts, cf_handle, key, &prev_value);
char* prev_buffer = const_cast<char*>(prev_value.c_str()); char* prev_buffer = const_cast<char*>(prev_value.c_str());
uint32_t prev_size = prev_value.size(); uint32_t prev_size = prev_value.size();
auto status = auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
options_->inplace_callback(s.ok() ? prev_buffer: nullptr, s.ok() ? &prev_size : nullptr,
s.ok() ? &prev_size: nullptr, value, &merged_value);
value, &merged_value);
if (status == UpdateStatus::UPDATED_INPLACE) { if (status == UpdateStatus::UPDATED_INPLACE) {
// prev_value is updated in-place with final value. // prev_value is updated in-place with final value.
mem_->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
} else if (status == UpdateStatus::UPDATED) { } else if (status == UpdateStatus::UPDATED) {
// merged_value contains the final value. // merged_value contains the final value.
mem_->Add(sequence_, kTypeValue, key, Slice(merged_value)); mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
RecordTick(options_->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
} }
} }
} }
@ -234,19 +356,28 @@ class MemTableInserter : public WriteBatch::Handler {
// sequence number. Even if the update eventually fails and does not result // sequence number. Even if the update eventually fails and does not result
// in memtable add/update. // in memtable add/update.
sequence_++; sequence_++;
return Status::OK();
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
bool perform_merge = false; bool perform_merge = false;
if (options_->max_successive_merges > 0 && db_ != nullptr) { if (options->max_successive_merges > 0 && db_ != nullptr) {
LookupKey lkey(key, sequence_); LookupKey lkey(key, sequence_);
// Count the number of successive merges at the head // Count the number of successive merges at the head
// of the key in the memtable // of the key in the memtable
size_t num_merges = mem_->CountSuccessiveMergeEntries(lkey); size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
if (num_merges >= options_->max_successive_merges) { if (num_merges >= options->max_successive_merges) {
perform_merge = true; perform_merge = true;
} }
} }
@ -262,62 +393,78 @@ class MemTableInserter : public WriteBatch::Handler {
ReadOptions read_options; ReadOptions read_options;
read_options.snapshot = &read_from_snapshot; read_options.snapshot = &read_from_snapshot;
db_->Get(read_options, key, &get_value); auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
db_->Get(read_options, cf_handle, key, &get_value);
Slice get_value_slice = Slice(get_value); Slice get_value_slice = Slice(get_value);
// 2) Apply this merge // 2) Apply this merge
auto merge_operator = options_->merge_operator.get(); auto merge_operator = options->merge_operator.get();
assert(merge_operator); assert(merge_operator);
std::deque<std::string> operands; std::deque<std::string> operands;
operands.push_front(value.ToString()); operands.push_front(value.ToString());
std::string new_value; std::string new_value;
if (!merge_operator->FullMerge(key, if (!merge_operator->FullMerge(key, &get_value_slice, operands,
&get_value_slice, &new_value, options->info_log.get())) {
operands,
&new_value,
options_->info_log.get())) {
// Failed to merge! // Failed to merge!
RecordTick(options_->statistics.get(), NUMBER_MERGE_FAILURES); RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
// Store the delta in memtable // Store the delta in memtable
perform_merge = false; perform_merge = false;
} else { } else {
// 3) Add value to memtable // 3) Add value to memtable
mem_->Add(sequence_, kTypeValue, key, new_value); mem->Add(sequence_, kTypeValue, key, new_value);
} }
} }
if (!perform_merge) { if (!perform_merge) {
// Add merge operator to memtable // Add merge operator to memtable
mem_->Add(sequence_, kTypeMerge, key, value); mem->Add(sequence_, kTypeMerge, key, value);
} }
sequence_++; sequence_++;
return Status::OK();
} }
virtual void Delete(const Slice& key) {
if (filter_deletes_) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions();
if (!dont_filter_deletes_ && options->filter_deletes) {
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_; read_from_snapshot.number_ = sequence_;
ReadOptions ropts; ReadOptions ropts;
ropts.snapshot = &read_from_snapshot; ropts.snapshot = &read_from_snapshot;
std::string value; std::string value;
if (!db_->KeyMayExist(ropts, key, &value)) { auto cf_handle = cf_mems_->GetColumnFamilyHandle();
RecordTick(options_->statistics.get(), NUMBER_FILTERED_DELETES); if (cf_handle == nullptr) {
return; cf_handle = db_->DefaultColumnFamily();
}
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
return Status::OK();
} }
} }
mem_->Add(sequence_, kTypeDeletion, key, Slice()); mem->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++; sequence_++;
return Status::OK();
} }
}; };
} // namespace } // namespace
Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, Status WriteBatchInternal::InsertInto(const WriteBatch* b,
const Options* opts, DB* db, ColumnFamilyMemTables* memtables,
const bool filter_deletes) { bool recovery, uint64_t log_number,
MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, DB* db, const bool dont_filter_deletes) {
filter_deletes); MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
recovery, log_number, db, dont_filter_deletes);
return b->Iterate(&inserter); return b->Iterate(&inserter);
} }

@ -17,6 +17,49 @@ namespace rocksdb {
class MemTable; class MemTable;
class ColumnFamilyMemTables {
public:
virtual ~ColumnFamilyMemTables() {}
virtual bool Seek(uint32_t column_family_id) = 0;
// returns true if the update to memtable should be ignored
// (useful when recovering from log whose updates have already
// been processed)
virtual uint64_t GetLogNumber() const = 0;
virtual MemTable* GetMemTable() const = 0;
virtual const Options* GetOptions() const = 0;
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
};
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
public:
ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options)
: ok_(false), mem_(mem), options_(options) {}
bool Seek(uint32_t column_family_id) override {
ok_ = (column_family_id == 0);
return ok_;
}
uint64_t GetLogNumber() const override { return 0; }
MemTable* GetMemTable() const override {
assert(ok_);
return mem_;
}
const Options* GetOptions() const override {
assert(ok_);
return options_;
}
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
private:
bool ok_;
MemTable* mem_;
const Options* const options_;
};
// WriteBatchInternal provides static methods for manipulating a // WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface. // WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal { class WriteBatchInternal {
@ -45,11 +88,21 @@ class WriteBatchInternal {
static void SetContents(WriteBatch* batch, const Slice& contents); static void SetContents(WriteBatch* batch, const Slice& contents);
// Inserts batch entries into memtable // Inserts batch entries into memtable
// Drops deletes in batch if filter_del is set to true and // If dont_filter_deletes is false AND options.filter_deletes is true,
// db->KeyMayExist returns false // then --> Drops deletes in batch if db->KeyMayExist returns false
static Status InsertInto(const WriteBatch* batch, MemTable* memtable, // If recovery == true, this means InsertInto is executed on a recovery
const Options* opts, DB* db = nullptr, // code-path. WriteBatch referencing a dropped column family can be
const bool filter_del = false); // found on a recovery code-path and should be ignored (recovery should not
// fail). Additionally, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number
// However, if recovery == false, any WriteBatch referencing
// non-existing column family will return a failure. Also, log_number is
// ignored in that case
static Status InsertInto(const WriteBatch* batch,
ColumnFamilyMemTables* memtables,
bool recovery = false, uint64_t log_number = 0,
DB* db = nullptr,
const bool dont_filter_deletes = true);
static void Append(WriteBatch* dst, const WriteBatch* src); static void Append(WriteBatch* dst, const WriteBatch* src);
}; };

@ -11,6 +11,7 @@
#include <memory> #include <memory>
#include "db/memtable.h" #include "db/memtable.h"
#include "db/column_family.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
@ -27,7 +28,8 @@ static std::string PrintContents(WriteBatch* b) {
MemTable* mem = new MemTable(cmp, options); MemTable* mem = new MemTable(cmp, options);
mem->Ref(); mem->Ref();
std::string state; std::string state;
Status s = WriteBatchInternal::InsertInto(b, mem, &options); ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
Iterator* iter = mem->NewIterator(); Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
@ -144,17 +146,37 @@ TEST(WriteBatchTest, Append) {
namespace { namespace {
struct TestHandler : public WriteBatch::Handler { struct TestHandler : public WriteBatch::Handler {
std::string seen; std::string seen;
virtual void Put(const Slice& key, const Slice& value) { virtual Status PutCF(uint32_t column_family_id, const Slice& key,
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; const Slice& value) {
if (column_family_id == 0) {
seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
} else {
seen += "PutCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ", " + value.ToString() + ")";
}
return Status::OK();
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; const Slice& value) {
if (column_family_id == 0) {
seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
} else {
seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ", " + value.ToString() + ")";
}
return Status::OK();
} }
virtual void LogData(const Slice& blob) { virtual void LogData(const Slice& blob) {
seen += "LogData(" + blob.ToString() + ")"; seen += "LogData(" + blob.ToString() + ")";
} }
virtual void Delete(const Slice& key) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
seen += "Delete(" + key.ToString() + ")"; if (column_family_id == 0) {
seen += "Delete(" + key.ToString() + ")";
} else {
seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
key.ToString() + ")";
}
return Status::OK();
} }
}; };
} }
@ -194,21 +216,23 @@ TEST(WriteBatchTest, Continue) {
struct Handler : public TestHandler { struct Handler : public TestHandler {
int num_seen = 0; int num_seen = 0;
virtual void Put(const Slice& key, const Slice& value) { virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
++num_seen; ++num_seen;
TestHandler::Put(key, value); return TestHandler::PutCF(column_family_id, key, value);
} }
virtual void Merge(const Slice& key, const Slice& value) { virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
++num_seen; ++num_seen;
TestHandler::Merge(key, value); return TestHandler::MergeCF(column_family_id, key, value);
} }
virtual void LogData(const Slice& blob) { virtual void LogData(const Slice& blob) {
++num_seen; ++num_seen;
TestHandler::LogData(blob); TestHandler::LogData(blob);
} }
virtual void Delete(const Slice& key) { virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
++num_seen; ++num_seen;
TestHandler::Delete(key); return TestHandler::DeleteCF(column_family_id, key);
} }
virtual bool Continue() override { virtual bool Continue() override {
return num_seen < 3; return num_seen < 3;
@ -256,6 +280,42 @@ TEST(WriteBatchTest, PutGatherSlices) {
ASSERT_EQ(3, batch.Count()); ASSERT_EQ(3, batch.Count());
} }
namespace {
class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
public:
ColumnFamilyHandleImplDummy(int id)
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
uint32_t GetID() const override { return id_; }
private:
uint32_t id_;
};
} // namespace anonymous
TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
WriteBatch batch;
ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
batch.Put(&zero, Slice("foo"), Slice("bar"));
batch.Put(&two, Slice("twofoo"), Slice("bar2"));
batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
batch.Delete(&eight, Slice("eightfoo"));
batch.Merge(&three, Slice("threethree"), Slice("3three"));
batch.Put(&zero, Slice("foo"), Slice("bar"));
batch.Merge(Slice("omom"), Slice("nom"));
TestHandler handler;
batch.Iterate(&handler);
ASSERT_EQ(
"Put(foo, bar)"
"PutCF(2, twofoo, bar2)"
"PutCF(8, eightfoo, bar8)"
"DeleteCF(8, eightfoo)"
"MergeCF(3, threethree, 3three)"
"Put(foo, bar)"
"Merge(omom, nom)",
handler.seen);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -243,6 +243,7 @@ extern void rocksdb_options_set_paranoid_checks(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*); extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
@ -275,6 +276,8 @@ extern void rocksdb_options_set_expanded_compaction_factor(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_max_grandparent_overlap_factor( extern void rocksdb_options_set_max_grandparent_overlap_factor(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
rocksdb_options_t*, int* level_values, size_t num_levels);
extern void rocksdb_options_enable_statistics(rocksdb_options_t*); extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int); extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
@ -330,10 +333,14 @@ extern void rocksdb_options_set_block_size_deviation(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_advise_random_on_open( extern void rocksdb_options_set_advise_random_on_open(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_access_hint_on_compaction_start(
rocksdb_options_t*, int);
extern void rocksdb_options_set_use_adaptive_mutex( extern void rocksdb_options_set_use_adaptive_mutex(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_bytes_per_sync( extern void rocksdb_options_set_bytes_per_sync(
rocksdb_options_t*, uint64_t); rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_verify_checksums_in_compaction(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_filter_deletes( extern void rocksdb_options_set_filter_deletes(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_max_sequential_skip_in_iterations( extern void rocksdb_options_set_max_sequential_skip_in_iterations(
@ -348,6 +355,7 @@ extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*); extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t); extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n);
extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec);
@ -360,6 +368,16 @@ extern void rocksdb_options_set_memtable_prefix_bloom_probes(
rocksdb_options_t*, uint32_t); rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_max_successive_merges( extern void rocksdb_options_set_max_successive_merges(
rocksdb_options_t*, size_t); rocksdb_options_t*, size_t);
extern void rocksdb_options_set_min_partial_merge_operands(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_bloom_locality(
rocksdb_options_t*, uint32_t);
extern void rocksdb_options_set_allow_thread_local(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_support(
rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_inplace_update_num_locks(
rocksdb_options_t*, size_t);
enum { enum {
rocksdb_no_compression = 0, rocksdb_no_compression = 0,

@ -13,6 +13,7 @@
#include <stdio.h> #include <stdio.h>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <string>
#include <unordered_map> #include <unordered_map>
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
@ -23,8 +24,24 @@ namespace rocksdb {
using std::unique_ptr; using std::unique_ptr;
class ColumnFamilyHandle {
public:
virtual ~ColumnFamilyHandle() {}
};
extern const std::string default_column_family_name;
struct ColumnFamilyDescriptor {
std::string name;
ColumnFamilyOptions options;
ColumnFamilyDescriptor()
: name(default_column_family_name), options(ColumnFamilyOptions()) {}
ColumnFamilyDescriptor(const std::string& name,
const ColumnFamilyOptions& options)
: name(name), options(options) {}
};
// Update Makefile if you change these // Update Makefile if you change these
static const int kMajorVersion = 2; static const int kMajorVersion = 3;
static const int kMinorVersion = 0; static const int kMinorVersion = 0;
struct Options; struct Options;
@ -87,33 +104,80 @@ class DB {
// that modify data, like put/delete, will return error. // that modify data, like put/delete, will return error.
// If the db is opened in read only mode, then no compactions // If the db is opened in read only mode, then no compactions
// will happen. // will happen.
// TODO(icanadi): implement OpenForReadOnly that specifies column families.
// User can open DB in read-only mode even if not specifying all column
// families
static Status OpenForReadOnly(const Options& options, static Status OpenForReadOnly(const Options& options,
const std::string& name, DB** dbptr, const std::string& name, DB** dbptr,
bool error_if_log_file_exist = false); bool error_if_log_file_exist = false);
// Open DB with column families.
// db_options specify database specific options
// column_families is the vector of all column families you'd like to open,
// containing column family name and options. The default column family name
// is 'default'.
// If everything is OK, handles will on return be the same size
// as column_families --- handles[i] will be a handle that you
// will use to operate on column family column_family[i]
static Status Open(const DBOptions& db_options, const std::string& name,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr);
// ListColumnFamilies will open the DB specified by argument name
// and return the list of all column families in that DB
// through column_families argument. The ordering of
// column families in column_families is unspecified.
static Status ListColumnFamilies(const DBOptions& db_options,
const std::string& name,
std::vector<std::string>* column_families);
DB() { } DB() { }
virtual ~DB(); virtual ~DB();
// Create a column_family and return the handle of column family
// through the argument handle.
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle** handle);
// Drop a column family specified by column_family handle. This call
// only records a drop record in the manifest and prevents the column
// family from flushing and compacting.
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
// Set the database entry for "key" to "value". // Set the database entry for "key" to "value".
// Returns OK on success, and a non-OK status on error. // Returns OK on success, and a non-OK status on error.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options, virtual Status Put(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0; const Slice& value) = 0;
Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Put(options, DefaultColumnFamily(), key, value);
}
// Remove the database entry (if any) for "key". Returns OK on // Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key" // success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database. // did not exist in the database.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family,
const Slice& key) = 0;
Status Delete(const WriteOptions& options, const Slice& key) {
return Delete(options, DefaultColumnFamily(), key);
}
// Merge the database entry for "key" with "value". Returns OK on success, // Merge the database entry for "key" with "value". Returns OK on success,
// and a non-OK status on error. The semantics of this operation is // and a non-OK status on error. The semantics of this operation is
// determined by the user provided merge_operator when opening DB. // determined by the user provided merge_operator when opening DB.
// Note: consider setting options.sync = true. // Note: consider setting options.sync = true.
virtual Status Merge(const WriteOptions& options, virtual Status Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) = 0; const Slice& value) = 0;
Status Merge(const WriteOptions& options, const Slice& key,
const Slice& value) {
return Merge(options, DefaultColumnFamily(), key, value);
}
// Apply the specified updates to the database. // Apply the specified updates to the database.
// Returns OK on success, non-OK on failure. // Returns OK on success, non-OK on failure.
@ -128,8 +192,11 @@ class DB {
// //
// May return some other Status on an error. // May return some other Status on an error.
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) = 0; std::string* value) = 0;
Status Get(const ReadOptions& options, const Slice& key, std::string* value) {
return Get(options, DefaultColumnFamily(), key, value);
}
// If keys[i] does not exist in the database, then the i'th returned // If keys[i] does not exist in the database, then the i'th returned
// status will be one for which Status::IsNotFound() is true, and // status will be one for which Status::IsNotFound() is true, and
@ -141,9 +208,17 @@ class DB {
// Similarly, the number of returned statuses will be the number of keys. // Similarly, the number of returned statuses will be the number of keys.
// Note: keys will not be "de-duplicated". Duplicate keys will return // Note: keys will not be "de-duplicated". Duplicate keys will return
// duplicate values in order. // duplicate values in order.
virtual std::vector<Status> MultiGet(const ReadOptions& options, virtual std::vector<Status> MultiGet(
const std::vector<Slice>& keys, const ReadOptions& options,
std::vector<std::string>* values) = 0; const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
std::vector<Status> MultiGet(const ReadOptions& options,
const std::vector<Slice>& keys,
std::vector<std::string>* values) {
return MultiGet(options, std::vector<ColumnFamilyHandle*>(
keys.size(), DefaultColumnFamily()),
keys, values);
}
// If the key definitely does not exist in the database, then this method // If the key definitely does not exist in the database, then this method
// returns false, else true. If the caller wants to obtain value when the key // returns false, else true. If the caller wants to obtain value when the key
@ -153,14 +228,17 @@ class DB {
// to make this lighter weight is to avoid doing any IOs. // to make this lighter weight is to avoid doing any IOs.
// Default implementation here returns true and sets 'value_found' to false // Default implementation here returns true and sets 'value_found' to false
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found = nullptr) {
bool* value_found = nullptr) {
if (value_found != nullptr) { if (value_found != nullptr) {
*value_found = false; *value_found = false;
} }
return true; return true;
} }
bool KeyMayExist(const ReadOptions& options, const Slice& key,
std::string* value, bool* value_found = nullptr) {
return KeyMayExist(options, DefaultColumnFamily(), key, value, value_found);
}
// Return a heap-allocated iterator over the contents of the database. // Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must // The result of NewIterator() is initially invalid (caller must
@ -168,7 +246,18 @@ class DB {
// //
// Caller should delete the iterator when it is no longer needed. // Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted. // The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0; virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) = 0;
Iterator* NewIterator(const ReadOptions& options) {
return NewIterator(options, DefaultColumnFamily());
}
// Returns iterators from a consistent database state across multiple
// column families. Iterators are heap allocated and need to be deleted
// before the db is deleted
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) = 0;
// Return a handle to the current DB state. Iterators created with // Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB // this handle will all observe a stable snapshot of the current DB
@ -194,7 +283,11 @@ class DB {
// about the internal operation of the DB. // about the internal operation of the DB.
// "rocksdb.sstables" - returns a multi-line string that describes all // "rocksdb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents. // of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0; virtual bool GetProperty(ColumnFamilyHandle* column_family,
const Slice& property, std::string* value) = 0;
bool GetProperty(const Slice& property, std::string* value) {
return GetProperty(DefaultColumnFamily(), property, value);
}
// For each i in [0,n-1], store in "sizes[i]", the approximate // For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)". // file system space used by keys in "[range[i].start .. range[i].limit)".
@ -204,8 +297,12 @@ class DB {
// sizes will be one-tenth the size of the corresponding user data size. // sizes will be one-tenth the size of the corresponding user data size.
// //
// The results may not include the sizes of recently written data. // The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n, virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* range, int n,
uint64_t* sizes) = 0; uint64_t* sizes) = 0;
void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
}
// Compact the underlying storage for the key range [*begin,*end]. // Compact the underlying storage for the key range [*begin,*end].
// The actual compaction interval might be superset of [*begin, *end]. // The actual compaction interval might be superset of [*begin, *end].
@ -224,19 +321,32 @@ class DB {
// hosting all the files. In this case, client could set reduce_level // hosting all the files. In this case, client could set reduce_level
// to true, to move the files back to the minimum level capable of holding // to true, to move the files back to the minimum level capable of holding
// the data set or a given level (specified by non-negative target_level). // the data set or a given level (specified by non-negative target_level).
virtual Status CompactRange(const Slice* begin, const Slice* end, virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, bool reduce_level = false,
int target_level = -1) = 0; int target_level = -1) = 0;
Status CompactRange(const Slice* begin, const Slice* end,
bool reduce_level = false, int target_level = -1) {
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
target_level);
}
// Number of levels used for this DB. // Number of levels used for this DB.
virtual int NumberLevels() = 0; virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
// Maximum level to which a new compacted memtable is pushed if it // Maximum level to which a new compacted memtable is pushed if it
// does not create overlap. // does not create overlap.
virtual int MaxMemCompactionLevel() = 0; virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
int MaxMemCompactionLevel() {
return MaxMemCompactionLevel(DefaultColumnFamily());
}
// Number of files in level-0 that would stop writes. // Number of files in level-0 that would stop writes.
virtual int Level0StopWriteTrigger() = 0; virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
int Level0StopWriteTrigger() {
return Level0StopWriteTrigger(DefaultColumnFamily());
}
// Get DB name -- the exact same name that was provided as an argument to // Get DB name -- the exact same name that was provided as an argument to
// DB::Open() // DB::Open()
@ -246,10 +356,18 @@ class DB {
virtual Env* GetEnv() const = 0; virtual Env* GetEnv() const = 0;
// Get DB Options that we use // Get DB Options that we use
virtual const Options& GetOptions() const = 0; virtual const Options& GetOptions(ColumnFamilyHandle* column_family)
const = 0;
const Options& GetOptions() const {
return GetOptions(DefaultColumnFamily());
}
// Flush all mem-table data. // Flush all mem-table data.
virtual Status Flush(const FlushOptions& options) = 0; virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) = 0;
Status Flush(const FlushOptions& options) {
return Flush(options, DefaultColumnFamily());
}
// Prevent file deletions. Compactions will continue to occur, // Prevent file deletions. Compactions will continue to occur,
// but no obsolete files will be deleted. Calling this multiple // but no obsolete files will be deleted. Calling this multiple
@ -279,9 +397,12 @@ class DB {
// Setting flush_memtable to true does Flush before recording the live files. // Setting flush_memtable to true does Flush before recording the live files.
// Setting flush_memtable to false is useful when we don't want to wait for // Setting flush_memtable to false is useful when we don't want to wait for
// flush which may have to wait for compaction to complete taking an // flush which may have to wait for compaction to complete taking an
// indeterminate time. But this will have to use GetSortedWalFiles after // indeterminate time.
// GetLiveFiles to compensate for memtables missed in this snapshot due to the //
// absence of Flush, by WAL files to recover the database consistently later // In case you have multiple column families, even if flush_memtable is true,
// you still need to call GetSortedWalFiles after GetLiveFiles to compensate
// for new data that arrived to already-flushed column families while other
// column families were flushing
virtual Status GetLiveFiles(std::vector<std::string>&, virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size, uint64_t* manifest_file_size,
bool flush_memtable = true) = 0; bool flush_memtable = true) = 0;
@ -319,7 +440,14 @@ class DB {
// be set properly // be set properly
virtual Status GetDbIdentity(std::string& identity) = 0; virtual Status GetDbIdentity(std::string& identity) = 0;
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) = 0; // Returns default column family handle
virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) = 0;
Status GetPropertiesOfAllTables(TablePropertiesCollection* props) {
return GetPropertiesOfAllTables(DefaultColumnFamily(), props);
}
private: private:
// No copying allowed // No copying allowed

@ -34,7 +34,7 @@ class Slice;
class WritableFile; class WritableFile;
class RandomRWFile; class RandomRWFile;
class Directory; class Directory;
struct Options; struct DBOptions;
using std::unique_ptr; using std::unique_ptr;
using std::shared_ptr; using std::shared_ptr;
@ -47,7 +47,7 @@ struct EnvOptions {
EnvOptions(); EnvOptions();
// construct from Options // construct from Options
explicit EnvOptions(const Options& options); explicit EnvOptions(const DBOptions& options);
// If true, then allow caching of data in environment buffers // If true, then allow caching of data in environment buffers
bool use_os_buffer = true; bool use_os_buffer = true;

@ -45,6 +45,8 @@ class LookupKey;
class Slice; class Slice;
class SliceTransform; class SliceTransform;
typedef void* KeyHandle;
class MemTableRep { class MemTableRep {
public: public:
// KeyComparator provides a means to compare keys, which are internal keys // KeyComparator provides a means to compare keys, which are internal keys
@ -62,11 +64,19 @@ class MemTableRep {
virtual ~KeyComparator() { } virtual ~KeyComparator() { }
}; };
explicit MemTableRep(Arena* arena) : arena_(arena) {}
// Allocate a buf of len size for storing key. The idea is that a specific
// memtable representation knows its underlying data structure better. By
// allowing it to allocate memory, it can possibly put correlated stuff
// in consecutive memory area to make processor prefetching more efficient.
virtual KeyHandle Allocate(const size_t len, char** buf);
// Insert key into the collection. (The caller will pack key and value into a // Insert key into the collection. (The caller will pack key and value into a
// single buffer and pass that in as the parameter to Insert) // single buffer and pass that in as the parameter to Insert).
// REQUIRES: nothing that compares equal to key is currently in the // REQUIRES: nothing that compares equal to key is currently in the
// collection. // collection.
virtual void Insert(const char* key) = 0; virtual void Insert(KeyHandle handle) = 0;
// Returns true iff an entry that compares equal to key is in the collection. // Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const = 0; virtual bool Contains(const char* key) const = 0;
@ -153,6 +163,8 @@ class MemTableRep {
// When *key is an internal key concatenated with the value, returns the // When *key is an internal key concatenated with the value, returns the
// user key. // user key.
virtual Slice UserKey(const char* key) const; virtual Slice UserKey(const char* key) const;
Arena* arena_;
}; };
// This is the base class for all factories that are used by RocksDB to create // This is the base class for all factories that are used by RocksDB to create

@ -72,8 +72,9 @@ enum UpdateStatus { // Return status For inplace update callback
UPDATED = 2, // No inplace update. Merged value set UPDATED = 2, // No inplace update. Merged value set
}; };
// Options to control the behavior of a database (passed to DB::Open) struct Options;
struct Options {
struct ColumnFamilyOptions {
// ------------------- // -------------------
// Parameters that affect behavior // Parameters that affect behavior
@ -130,38 +131,6 @@ struct Options {
// Default: a factory that doesn't provide any object // Default: a factory that doesn't provide any object
std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2; std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// If any of the writes to the database fails (Put, Delete, Merge, Write),
// the database will switch to read-only mode and fail all other
// Write operations.
// Default: true
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored
// in the same directory as the DB contents if info_log is nullptr.
// Default: nullptr
shared_ptr<Logger> info_log;
InfoLogLevel info_log_level;
// ------------------- // -------------------
// Parameters that affect performance // Parameters that affect performance
@ -193,15 +162,6 @@ struct Options {
// individual write buffers. Default: 1 // individual write buffers. Default: 1
int min_write_buffer_number_to_merge; int min_write_buffer_number_to_merge;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set. Value -1 means
// files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
//
// Default: 5000
int max_open_files;
// Control over blocks (user data is stored in a set of blocks, and // Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk). // a block is the unit of reading from disk).
@ -369,93 +329,12 @@ struct Options {
// stop building a single file in a level->level+1 compaction. // stop building a single file in a level->level+1 compaction.
int max_grandparent_overlap_factor; int max_grandparent_overlap_factor;
// If non-null, then we should collect metrics about database operations
// Statistics objects should not be shared between DB instances as
// it does not use any locks to prevent concurrent updates.
shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced
// to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a
// sync to the OS to flush all dirty buffesrs to stable storage.
// Default: false
bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 that can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about
// db deploy stats is written out.
// -1 indicates no logging at all.
// Default value is 1800 (half an hour).
int db_stats_log_interval;
// This specifies the info LOG dir.
// If it is empty, the log files will be in the same dir as data.
// If it is non empty, the log files will be in the specified dir,
// and the db data dir's absolute path will be used as the log file
// name's prefix.
std::string db_log_dir;
// This specifies the absolute dir path for write-ahead logs (WAL).
// If it is empty, the log files will be in the same dir as data,
// dbname is used as the data dir by default
// If it is non empty, the log files will be in kept the specified dir.
// When destroying the db,
// all log files in wal_dir and the dir itself is deleted
std::string wal_dir;
// Disable compaction triggered by seek. // Disable compaction triggered by seek.
// With bloomfilter and fast storage, a miss on one level // With bloomfilter and fast storage, a miss on one level
// is very cheap if the file handle is cached in table cache // is very cheap if the file handle is cached in table cache
// (which is true if max_open_files is large). // (which is true if max_open_files is large).
bool disable_seek_compaction; bool disable_seek_compaction;
// The periodicity when obsolete files get deleted. The default
// value is 6 hours. The files that get out of scope by compaction
// process will still get automatically delete on every compaction,
// regardless of this setting
uint64_t delete_obsolete_files_period_micros;
// Maximum number of concurrent background jobs, submitted to
// the default LOW priority thread pool
// Default: 1
int max_background_compactions;
// Maximum number of concurrent background memtable flush jobs, submitted to
// the HIGH priority thread pool.
// By default, all background jobs (major compaction and memtable flush) go
// to the LOW priority pool. If this option is set to a positive number,
// memtable flush jobs will be submitted to the HIGH priority pool.
// It is important when the same Env is shared by multiple db instances.
// Without a separate pool, long running major compaction jobs could
// potentially block memtable flush jobs of other db instances, leading to
// unnecessary Put stalls.
// Default: 1
int max_background_flushes;
// Specify the maximal size of the info log file. If the log file
// is larger than `max_log_file_size`, a new info log file will
// be created.
// If max_log_file_size == 0, all logs will be written to one
// log file.
size_t max_log_file_size;
// Time for the info log file to roll (in seconds).
// If specified with non-zero value, log file will be rolled
// if it has been active longer than `log_file_time_to_roll`.
// Default: 0 (disabled)
size_t log_file_time_to_roll;
// Maximal info log files to be kept.
// Default: 1000
size_t keep_log_file_num;
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
// soft_rate_limit. This is ignored when == 0.0. // soft_rate_limit. This is ignored when == 0.0.
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
@ -473,32 +352,14 @@ struct Options {
// Default: 1000 // Default: 1000
unsigned int rate_limit_delay_max_milliseconds; unsigned int rate_limit_delay_max_milliseconds;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.
uint64_t max_manifest_file_size;
// Disable block cache. If this is set to true, // Disable block cache. If this is set to true,
// then no block cache should be used, and the block_cache should // then no block cache should be used, and the block_cache should
// point to a nullptr object. // point to a nullptr object.
// Default: false // Default: false
bool no_block_cache; bool no_block_cache;
// Number of shards used for table cache. // size of one block in arena memory allocation.
int table_cache_numshardbits; // If <= 0, a proper value is automatically calculated (usually 1/10 of
// During data eviction of table's LRU cache, it would be inefficient
// to strictly follow LRU because this piece of memory will not really
// be released unless its refcount falls to zero. Instead, make two
// passes: the first pass will release items with refcount = 1,
// and if not enough space releases after scanning the number of
// elements specified by this parameter, we will remove items in LRU
// order.
int table_cache_remove_scan_count_limit;
// Size of one block in arena memory allocation.
//
// If <= 0, a proper value is automatically calculated (usually about 1/10 of
// writer_buffer_size). // writer_buffer_size).
// //
// There are two additonal restriction of the The specified size: // There are two additonal restriction of the The specified size:
@ -512,71 +373,14 @@ struct Options {
// Default: 0 // Default: 0
size_t arena_block_size; size_t arena_block_size;
// Create an Options object with default values for all fields.
Options();
void Dump(Logger* log) const;
// Set appropriate parameters for bulk loading.
// The reason that this is a function that returns "this" instead of a
// constructor is to enable chaining of multiple similar calls in the future.
//
// All data will be in level 0 without any automatic compaction.
// It's recommended to manually call CompactRange(NULL, NULL) before reading
// from the database, because otherwise the read can be very slow.
Options* PrepareForBulkLoad();
// Disable automatic compactions. Manual compactions can still // Disable automatic compactions. Manual compactions can still
// be issued on this database. // be issued on this column family
bool disable_auto_compactions; bool disable_auto_compactions;
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
// WAL files will be checked every 10 min and if total size is greater
// then WAL_size_limit_MB, they will be deleted starting with the
// earliest until size_limit is met. All empty files will be deleted.
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
// are older than WAL_ttl_seconds will be deleted.
// 4. If both are not 0, WAL files will be checked every 10 min and both
// checks will be performed with ttl being first.
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
// Purge duplicate/deleted keys when a memtable is flushed to storage. // Purge duplicate/deleted keys when a memtable is flushed to storage.
// Default: true // Default: true
bool purge_redundant_kvs_while_flush; bool purge_redundant_kvs_while_flush;
// Data being read from file storage may be buffered in the OS
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: false
bool allow_mmap_writes;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
// Skip log corruption error on recovery (If client is ok with
// losing most recent changes)
// Default: false
bool skip_log_error_on_recovery;
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
// Default: 3600 (1 hour)
unsigned int stats_dump_period_sec;
// This is used to close a block before it reaches the configured // This is used to close a block before it reaches the configured
// 'block_size'. If the percentage of free space in the current block is less // 'block_size'. If the percentage of free space in the current block is less
// than this specified number and adding a new record to the block will // than this specified number and adding a new record to the block will
@ -585,45 +389,17 @@ struct Options {
// Default is 10. // Default is 10.
int block_size_deviation; int block_size_deviation;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum {
NONE,
NORMAL,
SEQUENTIAL,
WILLNEED
} access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up
// wasting spin time.
// Default: false
bool use_adaptive_mutex;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
// The compaction style. Default: kCompactionStyleLevel // The compaction style. Default: kCompactionStyleLevel
CompactionStyle compaction_style; CompactionStyle compaction_style;
// The options needed to support Universal Style compactions
CompactionOptionsUniversal compaction_options_universal;
// If true, compaction will verify checksum on every read that happens // If true, compaction will verify checksum on every read that happens
// as part of compaction // as part of compaction
// Default: true // Default: true
bool verify_checksums_in_compaction; bool verify_checksums_in_compaction;
// The options needed to support Universal Style compactions
CompactionOptionsUniversal compaction_options_universal;
// Use KeyMayExist API to filter deletes when this is true. // Use KeyMayExist API to filter deletes when this is true.
// If KeyMayExist returns false, i.e. the key definitely does not exist, then // If KeyMayExist returns false, i.e. the key definitely does not exist, then
// the delete is a noop. KeyMayExist only incurs in-memory look up. // the delete is a noop. KeyMayExist only incurs in-memory look up.
@ -653,7 +429,7 @@ struct Options {
// Default: emtpy vector -- no user-defined statistics collection will be // Default: emtpy vector -- no user-defined statistics collection will be
// performed. // performed.
typedef std::vector<std::shared_ptr<TablePropertiesCollector>> typedef std::vector<std::shared_ptr<TablePropertiesCollector>>
TablePropertiesCollectors; TablePropertiesCollectors;
TablePropertiesCollectors table_properties_collectors; TablePropertiesCollectors table_properties_collectors;
// Allows thread-safe inplace updates. // Allows thread-safe inplace updates.
@ -750,9 +526,266 @@ struct Options {
// Default: 2 // Default: 2
uint32_t min_partial_merge_operands; uint32_t min_partial_merge_operands;
// Create ColumnFamilyOptions with default values for all fields
ColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
explicit ColumnFamilyOptions(const Options& options);
void Dump(Logger* log) const;
};
struct DBOptions {
// If true, the database will be created if it is missing.
// Default: false
bool create_if_missing;
// If true, an error is raised if the database already exists.
// Default: false
bool error_if_exists;
// If true, the implementation will do aggressive checking of the
// data it is processing and will stop early if it detects any
// errors. This may have unforeseen ramifications: for example, a
// corruption of one DB entry may cause a large number of entries to
// become unreadable or for the entire DB to become unopenable.
// If any of the writes to the database fails (Put, Delete, Merge, Write),
// the database will switch to read-only mode and fail all other
// Write operations.
// Default: true
bool paranoid_checks;
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc.
// Default: Env::Default()
Env* env;
// Any internal progress/error information generated by the db will
// be written to info_log if it is non-nullptr, or to a file stored
// in the same directory as the DB contents if info_log is nullptr.
// Default: nullptr
shared_ptr<Logger> info_log;
InfoLogLevel info_log_level;
// Number of open files that can be used by the DB. You may need to
// increase this if your database has a large working set. Value -1 means
// files opened are always kept open. You can estimate number of files based
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
// Default: 5000
int max_open_files;
// If non-null, then we should collect metrics about database operations
// Statistics objects should not be shared between DB instances as
// it does not use any locks to prevent concurrent updates.
shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced
// to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a
// sync to the OS to flush all dirty buffesrs to stable storage.
// Default: false
bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 that can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about
// db deploy stats is written out.
// -1 indicates no logging at all.
// Default value is 1800 (half an hour).
int db_stats_log_interval;
// This specifies the info LOG dir.
// If it is empty, the log files will be in the same dir as data.
// If it is non empty, the log files will be in the specified dir,
// and the db data dir's absolute path will be used as the log file
// name's prefix.
std::string db_log_dir;
// This specifies the absolute dir path for write-ahead logs (WAL).
// If it is empty, the log files will be in the same dir as data,
// dbname is used as the data dir by default
// If it is non empty, the log files will be in kept the specified dir.
// When destroying the db,
// all log files in wal_dir and the dir itself is deleted
std::string wal_dir;
// The periodicity when obsolete files get deleted. The default
// value is 6 hours. The files that get out of scope by compaction
// process will still get automatically delete on every compaction,
// regardless of this setting
uint64_t delete_obsolete_files_period_micros;
// Maximum number of concurrent background compaction jobs, submitted to
// the default LOW priority thread pool.
// If you're increasing this, also consider increasing number of threads in
// LOW priority thread pool. For more information, see
// Env::SetBackgroundThreads
// Default: 1
int max_background_compactions;
// Maximum number of concurrent background memtable flush jobs, submitted to
// the HIGH priority thread pool.
//
// By default, all background jobs (major compaction and memtable flush) go
// to the LOW priority pool. If this option is set to a positive number,
// memtable flush jobs will be submitted to the HIGH priority pool.
// It is important when the same Env is shared by multiple db instances.
// Without a separate pool, long running major compaction jobs could
// potentially block memtable flush jobs of other db instances, leading to
// unnecessary Put stalls.
//
// If you're increasing this, also consider increasing number of threads in
// HIGH priority thread pool. For more information, see
// Env::SetBackgroundThreads
// Default: 1
int max_background_flushes;
// Specify the maximal size of the info log file. If the log file
// is larger than `max_log_file_size`, a new info log file will
// be created.
// If max_log_file_size == 0, all logs will be written to one
// log file.
size_t max_log_file_size;
// Time for the info log file to roll (in seconds).
// If specified with non-zero value, log file will be rolled
// if it has been active longer than `log_file_time_to_roll`.
// Default: 0 (disabled)
size_t log_file_time_to_roll;
// Maximal info log files to be kept.
// Default: 1000
size_t keep_log_file_num;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.
uint64_t max_manifest_file_size;
// Number of shards used for table cache.
int table_cache_numshardbits;
// During data eviction of table's LRU cache, it would be inefficient
// to strictly follow LRU because this piece of memory will not really
// be released unless its refcount falls to zero. Instead, make two
// passes: the first pass will release items with refcount = 1,
// and if not enough space releases after scanning the number of
// elements specified by this parameter, we will remove items in LRU
// order.
int table_cache_remove_scan_count_limit;
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
// WAL files will be checked every 10 min and if total size is greater
// then WAL_size_limit_MB, they will be deleted starting with the
// earliest until size_limit is met. All empty files will be deleted.
// 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
// WAL files will be checked every WAL_ttl_secondsi / 2 and those that
// are older than WAL_ttl_seconds will be deleted.
// 4. If both are not 0, WAL files will be checked every 10 min and both
// checks will be performed with ttl being first.
uint64_t WAL_ttl_seconds;
uint64_t WAL_size_limit_MB;
// Number of bytes to preallocate (via fallocate) the manifest
// files. Default is 4mb, which is reasonable to reduce random IO
// as well as prevent overallocation for mounts that preallocate
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size;
// Data being read from file storage may be buffered in the OS
// Default: true
bool allow_os_buffer;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: false
bool allow_mmap_writes;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
// Skip log corruption error on recovery (If client is ok with
// losing most recent changes)
// Default: false
bool skip_log_error_on_recovery;
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
// Default: 3600 (1 hour)
unsigned int stats_dump_period_sec;
// If set true, will hint the underlying file system that the file
// access pattern is random, when a sst file is opened.
// Default: true
bool advise_random_on_open;
// Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction.
// Default: NORMAL
enum {
NONE,
NORMAL,
SEQUENTIAL,
WILLNEED
} access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up
// wasting spin time.
// Default: false
bool use_adaptive_mutex;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
uint64_t bytes_per_sync;
// Allow RocksDB to use thread local storage to optimize performance. // Allow RocksDB to use thread local storage to optimize performance.
// Default: true // Default: true
bool allow_thread_local; bool allow_thread_local;
// Create DBOptions with default values for all fields
DBOptions();
// Create DBOptions from Options
explicit DBOptions(const Options& options);
void Dump(Logger* log) const;
};
// Options to control the behavior of a database (passed to DB::Open)
struct Options : public DBOptions, public ColumnFamilyOptions {
// Create an Options object with default values for all fields.
Options() :
DBOptions(),
ColumnFamilyOptions() {}
Options(const DBOptions& db_options,
const ColumnFamilyOptions& column_family_options)
: DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
void Dump(Logger* log) const;
// Set appropriate parameters for bulk loading.
// The reason that this is a function that returns "this" instead of a
// constructor is to enable chaining of multiple similar calls in the future.
//
// All data will be in level 0 without any automatic compaction.
// It's recommended to manually call CompactRange(NULL, NULL) before reading
// from the database, because otherwise the read can be very slow.
Options* PrepareForBulkLoad();
}; };
// //

@ -64,7 +64,11 @@ struct PerfContext {
uint64_t write_memtable_time; uint64_t write_memtable_time;
}; };
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
extern PerfContext perf_context;
#else
extern __thread PerfContext perf_context; extern __thread PerfContext perf_context;
#endif
} }

@ -31,6 +31,7 @@
namespace rocksdb { namespace rocksdb {
class Slice; class Slice;
class ColumnFamilyHandle;
struct SliceParts; struct SliceParts;
class WriteBatch { class WriteBatch {
@ -39,19 +40,34 @@ class WriteBatch {
~WriteBatch(); ~WriteBatch();
// Store the mapping "key->value" in the database. // Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value); void Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Put(const Slice& key, const Slice& value) {
Put(nullptr, key, value);
}
// Variant of Put() that gathers output like writev(2). The key and value // Variant of Put() that gathers output like writev(2). The key and value
// that will be written to the database are concatentations of arrays of // that will be written to the database are concatentations of arrays of
// slices. // slices.
void Put(const SliceParts& key, const SliceParts& value); void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value);
void Put(const SliceParts& key, const SliceParts& value) {
Put(nullptr, key, value);
}
// Merge "value" with the existing value of "key" in the database. // Merge "value" with the existing value of "key" in the database.
// "key->merge(existing, value)" // "key->merge(existing, value)"
void Merge(const Slice& key, const Slice& value); void Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
void Merge(const Slice& key, const Slice& value) {
Merge(nullptr, key, value);
}
// If the database contains a mapping for "key", erase it. Else do nothing. // If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key); void Delete(ColumnFamilyHandle* column_family, const Slice& key);
void Delete(const Slice& key) {
Delete(nullptr, key);
}
// Append a blob of arbitrary size to the records in this batch. The blob will // Append a blob of arbitrary size to the records in this batch. The blob will
// be stored in the transaction log but not in any other file. In particular, // be stored in the transaction log but not in any other file. In particular,
@ -72,14 +88,46 @@ class WriteBatch {
class Handler { class Handler {
public: public:
virtual ~Handler(); virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0; // default implementation will just call Put without column family for
// backwards compatibility. If the column family is not default,
// the function is noop
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
// Put() historically doesn't return status. We didn't want to be
// backwards incompatible so we didn't change the return status
// (this is a public API). We do an ordinary get and return Status::OK()
Put(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and PutCF not implemented");
}
virtual void Put(const Slice& key, const Slice& value);
// Merge and LogData are not pure virtual. Otherwise, we would break // Merge and LogData are not pure virtual. Otherwise, we would break
// existing clients of Handler on a source code level. The default // existing clients of Handler on a source code level. The default
// implementation of Merge simply throws a runtime exception. // implementation of Merge simply throws a runtime exception.
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) {
if (column_family_id == 0) {
Merge(key, value);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and MergeCF not implemented");
}
virtual void Merge(const Slice& key, const Slice& value); virtual void Merge(const Slice& key, const Slice& value);
// The default implementation of LogData does nothing. // The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob); virtual void LogData(const Slice& blob);
virtual void Delete(const Slice& key) = 0; virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
if (column_family_id == 0) {
Delete(key);
return Status::OK();
}
return Status::InvalidArgument(
"non-default column family and DeleteCF not implemented");
}
virtual void Delete(const Slice& key);
// Continue is called by WriteBatch::Iterate. If it returns false, // Continue is called by WriteBatch::Iterate. If it returns false,
// iteration is halted. Otherwise, it continues iterating. The default // iteration is halted. Otherwise, it continues iterating. The default
// implementation always returns true. // implementation always returns true.

@ -21,40 +21,49 @@ class StackableDB : public DB {
return db_; return db_;
} }
using DB::Put;
virtual Status Put(const WriteOptions& options, virtual Status Put(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override { const Slice& val) override {
return db_->Put(options, key, val); return db_->Put(options, column_family, key, val);
} }
using DB::Get;
virtual Status Get(const ReadOptions& options, virtual Status Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override { std::string* value) override {
return db_->Get(options, key, value); return db_->Get(options, column_family, key, value);
} }
virtual std::vector<Status> MultiGet(const ReadOptions& options, using DB::MultiGet;
const std::vector<Slice>& keys, virtual std::vector<Status> MultiGet(
std::vector<std::string>* values) const ReadOptions& options,
override { const std::vector<ColumnFamilyHandle*>& column_family,
return db_->MultiGet(options, keys, values); const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
return db_->MultiGet(options, column_family, keys, values);
} }
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value,
bool* value_found = nullptr) override { bool* value_found = nullptr) override {
return db_->KeyMayExist(options, key, value, value_found); return db_->KeyMayExist(options, column_family, key, value, value_found);
} }
virtual Status Delete(const WriteOptions& wopts, const Slice& key) override { using DB::Delete;
return db_->Delete(wopts, key); virtual Status Delete(const WriteOptions& wopts,
ColumnFamilyHandle* column_family,
const Slice& key) override {
return db_->Delete(wopts, column_family, key);
} }
using DB::Merge;
virtual Status Merge(const WriteOptions& options, virtual Status Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override { const Slice& value) override {
return db_->Merge(options, key, value); return db_->Merge(options, column_family, key, value);
} }
@ -63,10 +72,20 @@ class StackableDB : public DB {
return db_->Write(opts, updates); return db_->Write(opts, updates);
} }
virtual Iterator* NewIterator(const ReadOptions& opts) override { using DB::NewIterator;
return db_->NewIterator(opts); virtual Iterator* NewIterator(const ReadOptions& opts,
ColumnFamilyHandle* column_family) override {
return db_->NewIterator(opts, column_family);
}
virtual Status NewIterators(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
return db_->NewIterators(options, column_families, iterators);
} }
virtual const Snapshot* GetSnapshot() override { virtual const Snapshot* GetSnapshot() override {
return db_->GetSnapshot(); return db_->GetSnapshot();
} }
@ -75,32 +94,43 @@ class StackableDB : public DB {
return db_->ReleaseSnapshot(snapshot); return db_->ReleaseSnapshot(snapshot);
} }
virtual bool GetProperty(const Slice& property, std::string* value) using DB::GetProperty;
override { virtual bool GetProperty(ColumnFamilyHandle* column_family,
return db_->GetProperty(property, value); const Slice& property, std::string* value) override {
return db_->GetProperty(column_family, property, value);
} }
virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) using DB::GetApproximateSizes;
override { virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
return db_->GetApproximateSizes(r, n, sizes); const Range* r, int n,
uint64_t* sizes) override {
return db_->GetApproximateSizes(column_family, r, n, sizes);
} }
virtual Status CompactRange(const Slice* begin, const Slice* end, using DB::CompactRange;
virtual Status CompactRange(ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end,
bool reduce_level = false, bool reduce_level = false,
int target_level = -1) override { int target_level = -1) override {
return db_->CompactRange(begin, end, reduce_level, target_level); return db_->CompactRange(column_family, begin, end, reduce_level,
target_level);
} }
virtual int NumberLevels() override { using DB::NumberLevels;
return db_->NumberLevels(); virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
return db_->NumberLevels(column_family);
} }
virtual int MaxMemCompactionLevel() override { using DB::MaxMemCompactionLevel;
return db_->MaxMemCompactionLevel(); virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family)
override {
return db_->MaxMemCompactionLevel(column_family);
} }
virtual int Level0StopWriteTrigger() override { using DB::Level0StopWriteTrigger;
return db_->Level0StopWriteTrigger(); virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family)
override {
return db_->Level0StopWriteTrigger(column_family);
} }
virtual const std::string& GetName() const override { virtual const std::string& GetName() const override {
@ -111,12 +141,16 @@ class StackableDB : public DB {
return db_->GetEnv(); return db_->GetEnv();
} }
virtual const Options& GetOptions() const override { using DB::GetOptions;
return db_->GetOptions(); virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
override {
return db_->GetOptions(column_family);
} }
virtual Status Flush(const FlushOptions& fopts) override { using DB::Flush;
return db_->Flush(fopts); virtual Status Flush(const FlushOptions& fopts,
ColumnFamilyHandle* column_family) override {
return db_->Flush(fopts, column_family);
} }
virtual Status DisableFileDeletions() override { virtual Status DisableFileDeletions() override {
@ -148,8 +182,10 @@ class StackableDB : public DB {
return db_->GetDbIdentity(identity); return db_->GetDbIdentity(identity);
} }
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { using DB::GetPropertiesOfAllTables;
return db_->GetPropertiesOfAllTables(props); virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
TablePropertiesCollection* props) {
return db_->GetPropertiesOfAllTables(column_family, props);
} }
virtual Status GetUpdatesSince( virtual Status GetUpdatesSince(
@ -158,6 +194,10 @@ class StackableDB : public DB {
return db_->GetUpdatesSince(seq_number, iter, read_options); return db_->GetUpdatesSince(seq_number, iter, read_options);
} }
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
return db_->DefaultColumnFamily();
}
protected: protected:
DB* db_; DB* db_;
}; };

@ -208,7 +208,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
mem->Ref(); mem->Ref();
std::string state; std::string state;
rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, mem, &options); rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
rocksdb::Status s =
rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
rocksdb::Iterator* iter = mem->NewIterator(); rocksdb::Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {

@ -127,13 +127,6 @@ extern bool Snappy_GetUncompressedLength(const char* input, size_t length,
extern bool Snappy_Uncompress(const char* input_data, size_t input_length, extern bool Snappy_Uncompress(const char* input_data, size_t input_length,
char* output); char* output);
// ------------------ Miscellaneous -------------------
// If heap profiling is not supported, returns false.
// Else repeatedly calls (*func)(arg, data, n) and then returns true.
// The concatenation of all "data[0,n-1]" fragments is the heap profile.
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
} // namespace port } // namespace port
} // namespace rocksdb } // namespace rocksdb

@ -476,10 +476,6 @@ inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input,
return false; return false;
} }
inline bool GetHeapProfile(void (*func)(void *, const char *, int), void *arg) {
return false;
}
#define CACHE_LINE_SIZE 64U #define CACHE_LINE_SIZE 64U
} // namespace port } // namespace port

@ -45,7 +45,9 @@ namespace {
// The longest the prefix of the cache key used to identify blocks can be. // The longest the prefix of the cache key used to identify blocks can be.
// We are using the fact that we know for Posix files the unique ID is three // We are using the fact that we know for Posix files the unique ID is three
// varints. // varints.
const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; // For some reason, compiling for iOS complains that this variable is unused
const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
kMaxVarint64Length * 3 + 1;
// Read the block identified by "handle" from "file". // Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now. // The only relevant option is options.verify_checksums for now.
@ -105,7 +107,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
Statistics* statistics) { Statistics* statistics) {
auto cache_handle = block_cache->Lookup(key); auto cache_handle = block_cache->Lookup(key);
if (cache_handle != nullptr) { if (cache_handle != nullptr) {
BumpPerfCount(&perf_context.block_cache_hit_count); PERF_COUNTER_ADD(block_cache_hit_count, 1);
// overall cache hit // overall cache hit
RecordTick(statistics, BLOCK_CACHE_HIT); RecordTick(statistics, BLOCK_CACHE_HIT);
// block-type specific cache hit // block-type specific cache hit

@ -46,6 +46,9 @@ class FilterBlockBuilder {
bool SamePrefix(const Slice &key1, const Slice &key2) const; bool SamePrefix(const Slice &key1, const Slice &key2) const;
void GenerateFilter(); void GenerateFilter();
// important: all of these might point to invalid addresses
// at the time of destruction of this filter block. destructor
// should NOT dereference them.
const FilterPolicy* policy_; const FilterPolicy* policy_;
const SliceTransform* prefix_extractor_; const SliceTransform* prefix_extractor_;
bool whole_key_filtering_; bool whole_key_filtering_;

@ -125,12 +125,11 @@ Status ReadBlockContents(RandomAccessFile* file,
char* buf = new char[n + kBlockTrailerSize]; char* buf = new char[n + kBlockTrailerSize];
Slice contents; Slice contents;
StopWatchNano timer(env); PERF_TIMER_AUTO(block_read_time);
StartPerfTimer(&timer);
Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
BumpPerfCount(&perf_context.block_read_count); PERF_TIMER_MEASURE(block_read_time);
BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize); PERF_COUNTER_ADD(block_read_count, 1);
BumpPerfTime(&perf_context.block_read_time, &timer); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize);
if (!s.ok()) { if (!s.ok()) {
delete[] buf; delete[] buf;
@ -151,7 +150,7 @@ Status ReadBlockContents(RandomAccessFile* file,
s = Status::Corruption("block checksum mismatch"); s = Status::Corruption("block checksum mismatch");
return s; return s;
} }
BumpPerfTime(&perf_context.block_checksum_time, &timer); PERF_TIMER_MEASURE(block_checksum_time);
} }
// If the caller has requested that the block not be uncompressed // If the caller has requested that the block not be uncompressed
@ -175,7 +174,7 @@ Status ReadBlockContents(RandomAccessFile* file,
s = UncompressBlockContents(data, n, result); s = UncompressBlockContents(data, n, result);
delete[] buf; delete[] buf;
} }
BumpPerfTime(&perf_context.block_decompress_time, &timer); PERF_TIMER_STOP(block_decompress_time);
return s; return s;
} }

@ -25,16 +25,14 @@ namespace {
class MergingIterator : public Iterator { class MergingIterator : public Iterator {
public: public:
MergingIterator(Env* const env, const Comparator* comparator, MergingIterator(const Comparator* comparator, Iterator** children, int n)
Iterator** children, int n)
: comparator_(comparator), : comparator_(comparator),
children_(n), children_(n),
current_(nullptr), current_(nullptr),
use_heap_(true), use_heap_(true),
env_(env),
direction_(kForward), direction_(kForward),
maxHeap_(NewMaxIterHeap(comparator_)), maxHeap_(NewMaxIterHeap(comparator_)),
minHeap_ (NewMinIterHeap(comparator_)) { minHeap_(NewMinIterHeap(comparator_)) {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
children_[i].Set(children[i]); children_[i].Set(children[i]);
} }
@ -79,13 +77,13 @@ class MergingIterator : public Iterator {
// Invalidate the heap. // Invalidate the heap.
use_heap_ = false; use_heap_ = false;
IteratorWrapper* first_child = nullptr; IteratorWrapper* first_child = nullptr;
StopWatchNano child_seek_timer(env_, false); PERF_TIMER_DECLARE();
StopWatchNano min_heap_timer(env_, false);
for (auto& child : children_) { for (auto& child : children_) {
StartPerfTimer(&child_seek_timer); PERF_TIMER_START(seek_child_seek_time);
child.Seek(target); child.Seek(target);
BumpPerfTime(&perf_context.seek_child_seek_time, &child_seek_timer); PERF_TIMER_STOP(seek_child_seek_time);
BumpPerfCount(&perf_context.seek_child_seek_count); PERF_COUNTER_ADD(seek_child_seek_count, 1);
if (child.Valid()) { if (child.Valid()) {
// This child has valid key // This child has valid key
@ -97,26 +95,24 @@ class MergingIterator : public Iterator {
} else { } else {
// We have more than one children with valid keys. Initialize // We have more than one children with valid keys. Initialize
// the heap and put the first child into the heap. // the heap and put the first child into the heap.
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
ClearHeaps(); ClearHeaps();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer);
StartPerfTimer(&min_heap_timer);
minHeap_.push(first_child); minHeap_.push(first_child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
if (use_heap_) { if (use_heap_) {
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
minHeap_.push(&child); minHeap_.push(&child);
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} }
} }
} }
if (use_heap_) { if (use_heap_) {
// If heap is valid, need to put the smallest key to curent_. // If heap is valid, need to put the smallest key to curent_.
StartPerfTimer(&min_heap_timer); PERF_TIMER_START(seek_min_heap_time);
FindSmallest(); FindSmallest();
BumpPerfTime(&perf_context.seek_min_heap_time, &child_seek_timer); PERF_TIMER_STOP(seek_min_heap_time);
} else { } else {
// The heap is not valid, then the current_ iterator is the first // The heap is not valid, then the current_ iterator is the first
// one, or null if there is no first child. // one, or null if there is no first child.
@ -232,7 +228,6 @@ class MergingIterator : public Iterator {
// This flag is always true for reverse direction, as we always use heap for // This flag is always true for reverse direction, as we always use heap for
// the reverse iterating case. // the reverse iterating case.
bool use_heap_; bool use_heap_;
Env* const env_;
// Which direction is the iterator moving? // Which direction is the iterator moving?
enum Direction { enum Direction {
kForward, kForward,
@ -272,15 +267,14 @@ void MergingIterator::ClearHeaps() {
} }
} // namespace } // namespace
Iterator* NewMergingIterator(Env* const env, const Comparator* cmp, Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
Iterator** list, int n) {
assert(n >= 0); assert(n >= 0);
if (n == 0) { if (n == 0) {
return NewEmptyIterator(); return NewEmptyIterator();
} else if (n == 1) { } else if (n == 1) {
return list[0]; return list[0];
} else { } else {
return new MergingIterator(env, cmp, list, n); return new MergingIterator(cmp, list, n);
} }
} }

@ -23,8 +23,7 @@ class Env;
// key is present in K child iterators, it will be yielded K times. // key is present in K child iterators, it will be yielded K times.
// //
// REQUIRES: n >= 0 // REQUIRES: n >= 0
extern Iterator* NewMergingIterator(Env* const env, extern Iterator* NewMergingIterator(const Comparator* comparator,
const Comparator* comparator,
Iterator** children, int n); Iterator** children, int n);
} // namespace rocksdb } // namespace rocksdb

@ -81,10 +81,9 @@ class PlainTableIterator : public Iterator {
bool use_prefix_seek_; bool use_prefix_seek_;
uint32_t offset_; uint32_t offset_;
uint32_t next_offset_; uint32_t next_offset_;
Slice key_; IterKey key_;
Slice value_; Slice value_;
Status status_; Status status_;
std::string tmp_str_;
// No copying allowed // No copying allowed
PlainTableIterator(const PlainTableIterator&) = delete; PlainTableIterator(const PlainTableIterator&) = delete;
void operator=(const Iterator&) = delete; void operator=(const Iterator&) = delete;
@ -720,9 +719,7 @@ void PlainTableIterator::Next() {
status_ = table_->Next(&next_offset_, &parsed_key, &value_); status_ = table_->Next(&next_offset_, &parsed_key, &value_);
if (status_.ok()) { if (status_.ok()) {
// Make a copy in this case. TODO optimize. // Make a copy in this case. TODO optimize.
tmp_str_.clear(); key_.SetInternalKey(parsed_key);
AppendInternalKey(&tmp_str_, parsed_key);
key_ = Slice(tmp_str_);
} else { } else {
offset_ = next_offset_ = table_->data_end_offset_; offset_ = next_offset_ = table_->data_end_offset_;
} }
@ -735,7 +732,7 @@ void PlainTableIterator::Prev() {
Slice PlainTableIterator::key() const { Slice PlainTableIterator::key() const {
assert(Valid()); assert(Valid());
return key_; return key_.GetKey();
} }
Slice PlainTableIterator::value() const { Slice PlainTableIterator::value() const {

@ -1554,7 +1554,8 @@ TEST(MemTableTest, Simple) {
batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k2"), std::string("v2"));
batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("k3"), std::string("v3"));
batch.Put(std::string("largekey"), std::string("vlarge")); batch.Put(std::string("largekey"), std::string("vlarge"));
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, memtable, &options).ok()); ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options);
ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok());
Iterator* iter = memtable->NewIterator(); Iterator* iter = memtable->NewIterator();
iter->SeekToFirst(); iter->SeekToFirst();

@ -0,0 +1,71 @@
TMP_DIR="/tmp/rocksdb-sanity-test"
if [ "$#" -lt 2 ]; then
echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
commit_new=`echo "$recent_commits" | head -n1`
commit_old=`echo "$recent_commits" | tail -n1`
echo "the most recent commits are:"
echo "$recent_commits"
else
commit_new=$1
commit_old=$2
fi
if [ ! -d $TMP_DIR ]; then
mkdir $TMP_DIR
fi
dir_new="${TMP_DIR}/${commit_new}"
dir_old="${TMP_DIR}/${commit_old}"
function makestuff() {
echo "make clean"
make clean > /dev/null
echo "make db_sanity_test -j32"
make db_sanity_test -j32 > /dev/null
if [ $? -ne 0 ]; then
echo "[ERROR] Failed to perform 'make db_sanity_test'"
exit 1
fi
}
rm -r -f $dir_new
rm -r -f $dir_old
echo "Running db sanity check with commits $commit_new and $commit_old."
echo "============================================================="
echo "Making build $commit_new"
makestuff
mv db_sanity_test new_db_sanity_test
echo "Creating db based on the new commit --- $commit_new"
./new_db_sanity_test $dir_new create
echo "============================================================="
echo "Making build $commit_old"
makestuff
mv db_sanity_test old_db_sanity_test
echo "Creating db based on the old commit --- $commit_old"
./old_db_sanity_test $dir_old create
echo "============================================================="
echo "Verifying new db $dir_new using the old commit --- $commit_old"
./old_db_sanity_test $dir_new verify
if [ $? -ne 0 ]; then
echo "[ERROR] Verification of $dir_new using commit $commit_old failed."
exit 2
fi
echo "============================================================="
echo "Verifying old db $dir_old using the new commit --- $commit_new"
./new_db_sanity_test $dir_old verify
if [ $? -ne 0 ]; then
echo "[ERROR] Verification of $dir_old using commit $commit_new failed."
exit 2
fi
rm old_db_sanity_test
rm new_db_sanity_test
echo "Auto sanity test passed!"

@ -88,6 +88,7 @@ def main(argv):
--open_files=500000 --open_files=500000
--verify_checksum=1 --verify_checksum=1
--sync=0 --sync=0
--progress_reports=0
--disable_wal=0 --disable_wal=0
--disable_data_sync=1 --disable_data_sync=1
--target_file_size_base=2097152 --target_file_size_base=2097152

@ -101,6 +101,7 @@ def main(argv):
--open_files=500000 --open_files=500000
--verify_checksum=1 --verify_checksum=1
--sync=0 --sync=0
--progress_reports=0
--disable_wal=0 --disable_wal=0
--disable_data_sync=1 --disable_data_sync=1
--target_file_size_base=2097152 --target_file_size_base=2097152

@ -60,14 +60,16 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) {
return true; return true;
} }
DEFINE_uint64(seed, 2341234, "Seed for PRNG"); DEFINE_uint64(seed, 2341234, "Seed for PRNG");
static const bool FLAGS_seed_dummy = static const bool FLAGS_seed_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
DEFINE_int64(max_key, 1 * KB * KB * KB, DEFINE_int64(max_key, 1 * KB* KB,
"Max number of key/values to place in database"); "Max number of key/values to place in database");
DEFINE_int32(column_families, 10, "Number of column families");
DEFINE_bool(test_batches_snapshots, false, DEFINE_bool(test_batches_snapshots, false,
"If set, the test uses MultiGet(), MultiPut() and MultiDelete()" "If set, the test uses MultiGet(), Multiut() and MultiDelete()"
" which read/write/delete multiple keys in a batch. In this mode," " which read/write/delete multiple keys in a batch. In this mode,"
" we do not verify db content by comparing the content with the " " we do not verify db content by comparing the content with the "
"pre-allocated array. Instead, we do partial verification inside" "pre-allocated array. Instead, we do partial verification inside"
@ -95,7 +97,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(destroy_db_initially, true, DEFINE_bool(destroy_db_initially, true,
"Destroys the database dir before start if this is true"); "Destroys the database dir before start if this is true");
DEFINE_bool (verbose, false, "Verbose"); DEFINE_bool(verbose, false, "Verbose");
DEFINE_bool(progress_reports, true,
"If true, db_stress will report number of finished operations");
DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size,
"Number of bytes to buffer in memtable before compacting"); "Number of bytes to buffer in memtable before compacting");
@ -146,6 +151,10 @@ DEFINE_int32(max_background_compactions,
"The maximum number of concurrent background compactions " "The maximum number of concurrent background compactions "
"that can occur in parallel."); "that can occur in parallel.");
DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
"The maximum number of concurrent background flushes "
"that can occur in parallel.");
DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger" DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
" compaction in universal style"); " compaction in universal style");
@ -158,6 +167,11 @@ DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
DEFINE_int32(universal_max_size_amplification_percent, 0, DEFINE_int32(universal_max_size_amplification_percent, 0,
"The max size amplification for universal style compaction"); "The max size amplification for universal style compaction");
DEFINE_int32(clear_column_family_one_in, 1000000,
"With a chance of 1/N, delete a column family and then recreate "
"it again. If N == 0, never drop/create column families. "
"When test_batches_snapshots is true, this flag has no effect");
DEFINE_int64(cache_size, 2 * KB * KB * KB, DEFINE_int64(cache_size, 2 * KB * KB * KB,
"Number of bytes to use as a cache of uncompressed data."); "Number of bytes to use as a cache of uncompressed data.");
@ -170,8 +184,8 @@ static bool ValidateInt32Positive(const char* flagname, int32_t value) {
return true; return true;
} }
DEFINE_int32(reopen, 10, "Number of times database reopens"); DEFINE_int32(reopen, 10, "Number of times database reopens");
static const bool FLAGS_reopen_dummy = static const bool FLAGS_reopen_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive); google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. " DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
"Negative means use default settings."); "Negative means use default settings.");
@ -198,9 +212,9 @@ DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
DEFINE_int32(kill_random_test, 0, DEFINE_int32(kill_random_test, 0,
"If non-zero, kill at various points in source code with " "If non-zero, kill at various points in source code with "
"probability 1/this"); "probability 1/this");
static const bool FLAGS_kill_random_test_dummy = static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_kill_random_test, google::RegisterFlagValidator(&FLAGS_kill_random_test,
&ValidateInt32Positive); &ValidateInt32Positive);
extern int rocksdb_kill_odds; extern int rocksdb_kill_odds;
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
@ -226,42 +240,37 @@ static bool ValidateInt32Percent(const char* flagname, int32_t value) {
} }
DEFINE_int32(readpercent, 10, DEFINE_int32(readpercent, 10,
"Ratio of reads to total workload (expressed as a percentage)"); "Ratio of reads to total workload (expressed as a percentage)");
static const bool FLAGS_readpercent_dummy = static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
DEFINE_int32(prefixpercent, 20, DEFINE_int32(prefixpercent, 20,
"Ratio of prefix iterators to total workload (expressed as a" "Ratio of prefix iterators to total workload (expressed as a"
" percentage)"); " percentage)");
static const bool FLAGS_prefixpercent_dummy = static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
DEFINE_int32(writepercent, 45, DEFINE_int32(writepercent, 45,
" Ratio of deletes to total workload (expressed as a percentage)"); " Ratio of deletes to total workload (expressed as a percentage)");
static const bool FLAGS_writepercent_dummy = static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
DEFINE_int32(delpercent, 15, DEFINE_int32(delpercent, 15,
"Ratio of deletes to total workload (expressed as a percentage)"); "Ratio of deletes to total workload (expressed as a percentage)");
static const bool FLAGS_delpercent_dummy = static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload" DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
" (expressed as a percentage)"); " (expressed as a percentage)");
static const bool FLAGS_iterpercent_dummy = static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent); google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run"); DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
static const bool FLAGS_num_iterations_dummy = static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
DEFINE_bool(disable_seek_compaction, false, DEFINE_bool(disable_seek_compaction, false,
"Option to disable compation triggered by read."); "Option to disable compation triggered by read.");
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
"Option to delete obsolete files periodically"
"0 means that obsolete files are "
" deleted after every compaction run.");
enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
assert(ctype); assert(ctype);
@ -290,21 +299,21 @@ DEFINE_string(hdfs, "", "Name of hdfs environment");
// posix or hdfs environment // posix or hdfs environment
static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread."); DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
static const bool FLAGS_ops_per_thread_dummy = static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range); google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock"); DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
static const bool FLAGS_log2_keys_per_lock_dummy = static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock, google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
&ValidateUint32Range); &ValidateUint32Range);
DEFINE_int32(purge_redundant_percent, 50, DEFINE_int32(purge_redundant_percent, 50,
"Percentage of times we want to purge redundant keys in memory " "Percentage of times we want to purge redundant keys in memory "
"before flushing"); "before flushing");
static const bool FLAGS_purge_redundant_percent_dummy = static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_purge_redundant_percent, google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
&ValidateInt32Percent); &ValidateInt32Percent);
DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
" the delete if key not present"); " the delete if key not present");
@ -438,16 +447,18 @@ class Stats {
last_op_finish_ = now; last_op_finish_ = now;
} }
done_++; done_++;
if (done_ >= next_report_) { if (FLAGS_progress_reports) {
if (next_report_ < 1000) next_report_ += 100; if (done_ >= next_report_) {
else if (next_report_ < 5000) next_report_ += 500; if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 10000) next_report_ += 1000; else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 50000) next_report_ += 5000; else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 100000) next_report_ += 10000; else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 500000) next_report_ += 50000; else if (next_report_ < 100000) next_report_ += 10000;
else next_report_ += 100000; else if (next_report_ < 500000) next_report_ += 50000;
fprintf(stdout, "... finished %ld ops%30s\r", done_, ""); else next_report_ += 100000;
fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
}
} }
} }
@ -515,7 +526,7 @@ class Stats {
// State shared by all concurrent executions of the same benchmark. // State shared by all concurrent executions of the same benchmark.
class SharedState { class SharedState {
public: public:
static const uint32_t SENTINEL = 0xffffffff; static const uint32_t SENTINEL;
explicit SharedState(StressTest* stress_test) : explicit SharedState(StressTest* stress_test) :
cv_(&mu_), cv_(&mu_),
@ -531,28 +542,27 @@ class SharedState {
start_verify_(false), start_verify_(false),
stress_test_(stress_test) { stress_test_(stress_test) {
if (FLAGS_test_batches_snapshots) { if (FLAGS_test_batches_snapshots) {
key_locks_ = nullptr;
values_ = nullptr;
fprintf(stdout, "No lock creation because test_batches_snapshots set\n"); fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
return; return;
} }
values_ = new uint32_t[max_key_]; values_.resize(FLAGS_column_families);
for (long i = 0; i < max_key_; i++) {
values_[i] = SENTINEL; for (int i = 0; i < FLAGS_column_families; ++i) {
values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
} }
long num_locks = (max_key_ >> log2_keys_per_lock_); long num_locks = (max_key_ >> log2_keys_per_lock_);
if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) { if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
num_locks ++; num_locks++;
}
fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
key_locks_.resize(FLAGS_column_families);
for (int i = 0; i < FLAGS_column_families; ++i) {
key_locks_[i] = std::vector<port::Mutex>(num_locks);
} }
fprintf(stdout, "Creating %ld locks\n", num_locks);
key_locks_ = new port::Mutex[num_locks];
} }
~SharedState() { ~SharedState() {}
delete[] values_;
delete[] key_locks_;
}
port::Mutex* GetMutex() { port::Mutex* GetMutex() {
return &mu_; return &mu_;
@ -622,26 +632,36 @@ class SharedState {
return start_verify_; return start_verify_;
} }
port::Mutex* GetMutexForKey(long key) { port::Mutex* GetMutexForKey(int cf, long key) {
return &key_locks_[key >> log2_keys_per_lock_]; return &key_locks_[cf][key >> log2_keys_per_lock_];
} }
void Put(long key, uint32_t value_base) { void LockColumnFamily(int cf) {
values_[key] = value_base; for (auto& mutex : key_locks_[cf]) {
mutex.Lock();
}
} }
uint32_t Get(long key) const { void UnlockColumnFamily(int cf) {
return values_[key]; for (auto& mutex : key_locks_[cf]) {
mutex.Unlock();
}
} }
void Delete(long key) const { void ClearColumnFamily(int cf) {
values_[key] = SENTINEL; std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
} }
uint32_t GetSeed() const { void Put(int cf, long key, uint32_t value_base) {
return seed_; values_[cf][key] = value_base;
} }
uint32_t Get(int cf, long key) const { return values_[cf][key]; }
void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
uint32_t GetSeed() const { return seed_; }
private: private:
port::Mutex mu_; port::Mutex mu_;
port::CondVar cv_; port::CondVar cv_;
@ -657,11 +677,12 @@ class SharedState {
bool start_verify_; bool start_verify_;
StressTest* stress_test_; StressTest* stress_test_;
uint32_t *values_; std::vector<std::vector<uint32_t>> values_;
port::Mutex *key_locks_; std::vector<std::vector<port::Mutex>> key_locks_;
}; };
const uint32_t SharedState::SENTINEL = 0xffffffff;
// Per-thread state for concurrent executions of the same benchmark. // Per-thread state for concurrent executions of the same benchmark.
struct ThreadState { struct ThreadState {
uint32_t tid; // 0..n-1 uint32_t tid; // 0..n-1
@ -682,13 +703,14 @@ class StressTest {
public: public:
StressTest() StressTest()
: cache_(NewLRUCache(FLAGS_cache_size)), : cache_(NewLRUCache(FLAGS_cache_size)),
compressed_cache_(FLAGS_compressed_cache_size >= 0 ? compressed_cache_(FLAGS_compressed_cache_size >= 0
NewLRUCache(FLAGS_compressed_cache_size) : ? NewLRUCache(FLAGS_compressed_cache_size)
nullptr), : nullptr),
filter_policy_(FLAGS_bloom_bits >= 0 filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits) ? NewBloomFilterPolicy(FLAGS_bloom_bits)
: nullptr), : nullptr),
db_(nullptr), db_(nullptr),
new_column_family_name_(0),
num_times_reopened_(0) { num_times_reopened_(0) {
if (FLAGS_destroy_db_initially) { if (FLAGS_destroy_db_initially) {
std::vector<std::string> files; std::vector<std::string> files;
@ -703,6 +725,10 @@ class StressTest {
} }
~StressTest() { ~StressTest() {
for (auto cf : column_families_) {
delete cf;
}
column_families_.clear();
delete db_; delete db_;
delete filter_policy_; delete filter_policy_;
} }
@ -817,9 +843,9 @@ class StressTest {
// Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ... // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
// ("9"+K, "9"+V) in DB atomically i.e in a single batch. // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
// Also refer MultiGet. // Also refer MultiGet.
Status MultiPut(ThreadState* thread, Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
const WriteOptions& writeoptions, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& key, const Slice& value, size_t sz) { const Slice& value, size_t sz) {
std::string keys[10] = {"9", "8", "7", "6", "5", std::string keys[10] = {"9", "8", "7", "6", "5",
"4", "3", "2", "1", "0"}; "4", "3", "2", "1", "0"};
std::string values[10] = {"9", "8", "7", "6", "5", std::string values[10] = {"9", "8", "7", "6", "5",
@ -832,9 +858,9 @@ class StressTest {
values[i] += value.ToString(); values[i] += value.ToString();
value_slices[i] = values[i]; value_slices[i] = values[i];
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
batch.Merge(keys[i], value_slices[i]); batch.Merge(column_family, keys[i], value_slices[i]);
} else { } else {
batch.Put(keys[i], value_slices[i]); batch.Put(column_family, keys[i], value_slices[i]);
} }
} }
@ -852,9 +878,8 @@ class StressTest {
// Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K) // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
// in DB atomically i.e in a single batch. Also refer MultiGet. // in DB atomically i.e in a single batch. Also refer MultiGet.
Status MultiDelete(ThreadState* thread, Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
const WriteOptions& writeoptions, ColumnFamilyHandle* column_family, const Slice& key) {
const Slice& key) {
std::string keys[10] = {"9", "7", "5", "3", "1", std::string keys[10] = {"9", "7", "5", "3", "1",
"8", "6", "4", "2", "0"}; "8", "6", "4", "2", "0"};
@ -862,7 +887,7 @@ class StressTest {
Status s; Status s;
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
keys[i] += key.ToString(); keys[i] += key.ToString();
batch.Delete(keys[i]); batch.Delete(column_family, keys[i]);
} }
s = db_->Write(writeoptions, &batch); s = db_->Write(writeoptions, &batch);
@ -880,9 +905,9 @@ class StressTest {
// in the same snapshot, and verifies that all the values are of the form // in the same snapshot, and verifies that all the values are of the form
// "0"+V, "1"+V,..."9"+V. // "0"+V, "1"+V,..."9"+V.
// ASSUMES that MultiPut was used to put (K, V) into the DB. // ASSUMES that MultiPut was used to put (K, V) into the DB.
Status MultiGet(ThreadState* thread, Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& key, std::string* value) { std::string* value) {
std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
Slice key_slices[10]; Slice key_slices[10];
std::string values[10]; std::string values[10];
@ -892,7 +917,7 @@ class StressTest {
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
keys[i] += key.ToString(); keys[i] += key.ToString();
key_slices[i] = keys[i]; key_slices[i] = keys[i];
s = db_->Get(readoptionscopy, key_slices[i], value); s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
if (!s.ok() && !s.IsNotFound()) { if (!s.ok() && !s.IsNotFound()) {
fprintf(stderr, "get error: %s\n", s.ToString().c_str()); fprintf(stderr, "get error: %s\n", s.ToString().c_str());
values[i] = ""; values[i] = "";
@ -937,8 +962,8 @@ class StressTest {
// each series should be the same length, and it is verified for each // each series should be the same length, and it is verified for each
// index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V. // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
// ASSUMES that MultiPut was used to put (K, V) // ASSUMES that MultiPut was used to put (K, V)
Status MultiPrefixScan(ThreadState* thread, Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family,
const Slice& key) { const Slice& key) {
std::string prefixes[10] = {"0", "1", "2", "3", "4", std::string prefixes[10] = {"0", "1", "2", "3", "4",
"5", "6", "7", "8", "9"}; "5", "6", "7", "8", "9"};
@ -954,7 +979,7 @@ class StressTest {
readoptionscopy[i] = readoptions; readoptionscopy[i] = readoptions;
readoptionscopy[i].prefix_seek = true; readoptionscopy[i].prefix_seek = true;
readoptionscopy[i].snapshot = snapshot; readoptionscopy[i].snapshot = snapshot;
iters[i] = db_->NewIterator(readoptionscopy[i]); iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
iters[i]->Seek(prefix_slices[i]); iters[i]->Seek(prefix_slices[i]);
} }
@ -1012,15 +1037,14 @@ class StressTest {
// Given a key K, this creates an iterator which scans to K and then // Given a key K, this creates an iterator which scans to K and then
// does a random sequence of Next/Prev operations. // does a random sequence of Next/Prev operations.
Status MultiIterate(ThreadState* thread, Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
const ReadOptions& readoptions, ColumnFamilyHandle* column_family, const Slice& key) {
const Slice& key) {
Status s; Status s;
const Snapshot* snapshot = db_->GetSnapshot(); const Snapshot* snapshot = db_->GetSnapshot();
ReadOptions readoptionscopy = readoptions; ReadOptions readoptionscopy = readoptions;
readoptionscopy.snapshot = snapshot; readoptionscopy.snapshot = snapshot;
readoptionscopy.prefix_seek = FLAGS_prefix_size > 0; readoptionscopy.prefix_seek = FLAGS_prefix_size > 0;
unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy)); unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
iter->Seek(key); iter->Seek(key);
for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) { for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
@ -1075,15 +1099,50 @@ class StressTest {
} }
} }
if (!FLAGS_test_batches_snapshots &&
FLAGS_clear_column_family_one_in != 0) {
if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
// drop column family and then create it again (can't drop default)
int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
std::string new_name =
std::to_string(new_column_family_name_.fetch_add(1));
{
MutexLock l(thread->shared->GetMutex());
fprintf(
stdout,
"[CF %d] Dropping and recreating column family. new name: %s\n",
cf, new_name.c_str());
}
thread->shared->LockColumnFamily(cf);
Status s __attribute__((unused));
s = db_->DropColumnFamily(column_families_[cf]);
delete column_families_[cf];
assert(s.ok());
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
&column_families_[cf]);
column_family_names_[cf] = new_name;
thread->shared->ClearColumnFamily(cf);
assert(s.ok());
thread->shared->UnlockColumnFamily(cf);
}
}
long rand_key = thread->rand.Next() % max_key; long rand_key = thread->rand.Next() % max_key;
int rand_column_family = thread->rand.Next() % FLAGS_column_families;
std::string keystr = Key(rand_key); std::string keystr = Key(rand_key);
Slice key = keystr; Slice key = keystr;
int prob_op = thread->rand.Uniform(100); int prob_op = thread->rand.Uniform(100);
std::unique_ptr<MutexLock> l;
if (!FLAGS_test_batches_snapshots) {
l.reset(new MutexLock(
thread->shared->GetMutexForKey(rand_column_family, rand_key)));
}
auto column_family = column_families_[rand_column_family];
if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) { if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
// OPERATION read // OPERATION read
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
Status s = db_->Get(read_opts, key, &from_db); Status s = db_->Get(read_opts, column_family, key, &from_db);
if (s.ok()) { if (s.ok()) {
// found case // found case
thread->stats.AddGets(1, 1); thread->stats.AddGets(1, 1);
@ -1095,7 +1154,7 @@ class StressTest {
thread->stats.AddErrors(1); thread->stats.AddErrors(1);
} }
} else { } else {
MultiGet(thread, read_opts, key, &from_db); MultiGet(thread, read_opts, column_family, key, &from_db);
} }
} else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) { } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
// OPERATION prefix scan // OPERATION prefix scan
@ -1106,7 +1165,7 @@ class StressTest {
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
Slice prefix = Slice(key.data(), FLAGS_prefix_size); Slice prefix = Slice(key.data(), FLAGS_prefix_size);
read_opts.prefix_seek = true; read_opts.prefix_seek = true;
Iterator* iter = db_->NewIterator(read_opts); Iterator* iter = db_->NewIterator(read_opts, column_family);
int64_t count = 0; int64_t count = 0;
for (iter->Seek(prefix); for (iter->Seek(prefix);
iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
@ -1121,7 +1180,7 @@ class StressTest {
} }
delete iter; delete iter;
} else { } else {
MultiPrefixScan(thread, read_opts, key); MultiPrefixScan(thread, read_opts, column_family, key);
} }
} else if (prefixBound <= prob_op && prob_op < writeBound) { } else if (prefixBound <= prob_op && prob_op < writeBound) {
// OPERATION write // OPERATION write
@ -1129,42 +1188,36 @@ class StressTest {
size_t sz = GenerateValue(value_base, value, sizeof(value)); size_t sz = GenerateValue(value_base, value, sizeof(value));
Slice v(value, sz); Slice v(value, sz);
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
MutexLock l(thread->shared->GetMutexForKey(rand_key));
if (FLAGS_verify_before_write) { if (FLAGS_verify_before_write) {
std::string keystr2 = Key(rand_key); std::string keystr2 = Key(rand_key);
Slice k = keystr2; Slice k = keystr2;
Status s = db_->Get(read_opts, k, &from_db); Status s = db_->Get(read_opts, column_family, k, &from_db);
VerifyValue(rand_key, VerifyValue(rand_column_family, rand_key, read_opts,
read_opts, *(thread->shared), from_db, s, true);
*(thread->shared),
from_db,
s,
true);
} }
thread->shared->Put(rand_key, value_base); thread->shared->Put(rand_column_family, rand_key, value_base);
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
db_->Merge(write_opts, key, v); db_->Merge(write_opts, column_family, key, v);
} else { } else {
db_->Put(write_opts, key, v); db_->Put(write_opts, column_family, key, v);
} }
thread->stats.AddBytesForWrites(1, sz); thread->stats.AddBytesForWrites(1, sz);
} else { } else {
MultiPut(thread, write_opts, key, v, sz); MultiPut(thread, write_opts, column_family, key, v, sz);
} }
PrintKeyValue(rand_key, value, sz); PrintKeyValue(rand_column_family, rand_key, value, sz);
} else if (writeBound <= prob_op && prob_op < delBound) { } else if (writeBound <= prob_op && prob_op < delBound) {
// OPERATION delete // OPERATION delete
if (!FLAGS_test_batches_snapshots) { if (!FLAGS_test_batches_snapshots) {
MutexLock l(thread->shared->GetMutexForKey(rand_key)); thread->shared->Delete(rand_column_family, rand_key);
thread->shared->Delete(rand_key); db_->Delete(write_opts, column_family, key);
db_->Delete(write_opts, key);
thread->stats.AddDeletes(1); thread->stats.AddDeletes(1);
} else { } else {
MultiDelete(thread, write_opts, key); MultiDelete(thread, write_opts, column_family, key);
} }
} else { } else {
// OPERATION iterate // OPERATION iterate
MultiIterate(thread, read_opts, key); MultiIterate(thread, read_opts, column_family, key);
} }
thread->stats.FinishedSingleOp(); thread->stats.FinishedSingleOp();
} }
@ -1182,97 +1235,100 @@ class StressTest {
if (thread->tid == shared.GetNumThreads() - 1) { if (thread->tid == shared.GetNumThreads() - 1) {
end = max_key; end = max_key;
} }
for (size_t cf = 0; cf < column_families_.size(); ++cf) {
if (!thread->rand.OneIn(2)) { if (!thread->rand.OneIn(2)) {
options.prefix_seek = FLAGS_prefix_size > 0; // Use iterator to verify this range
// Use iterator to verify this range options.prefix_seek = FLAGS_prefix_size > 0;
unique_ptr<Iterator> iter(db_->NewIterator(options)); unique_ptr<Iterator> iter(
iter->Seek(Key(start)); db_->NewIterator(options, column_families_[cf]));
for (long i = start; i < end; i++) { iter->Seek(Key(start));
// TODO(ljin): update "long" to uint64_t for (long i = start; i < end; i++) {
// Reseek when the prefix changes // TODO(ljin): update "long" to uint64_t
if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) == 0) { // Reseek when the prefix changes
iter->Seek(Key(i)); if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
} 0) {
std::string from_db; iter->Seek(Key(i));
std::string keystr = Key(i); }
Slice k = keystr; std::string from_db;
Status s = iter->status(); std::string keystr = Key(i);
if (iter->Valid()) { Slice k = keystr;
if (iter->key().compare(k) > 0) { Status s = iter->status();
if (iter->Valid()) {
if (iter->key().compare(k) > 0) {
s = Status::NotFound(Slice());
} else if (iter->key().compare(k) == 0) {
from_db = iter->value().ToString();
iter->Next();
} else if (iter->key().compare(k) < 0) {
VerificationAbort("An out of range key was found", cf, i);
}
} else {
// The iterator found no value for the key in question, so do not
// move to the next item in the iterator
s = Status::NotFound(Slice()); s = Status::NotFound(Slice());
} else if (iter->key().compare(k) == 0) {
from_db = iter->value().ToString();
iter->Next();
} else if (iter->key().compare(k) < 0) {
VerificationAbort("An out of range key was found", i);
} }
} else { VerifyValue(cf, i, options, shared, from_db, s, true);
// The iterator found no value for the key in question, so do not if (from_db.length()) {
// move to the next item in the iterator PrintKeyValue(cf, i, from_db.data(), from_db.length());
s = Status::NotFound(Slice()); }
}
VerifyValue(i, options, shared, from_db, s, true);
if (from_db.length()) {
PrintKeyValue(i, from_db.data(), from_db.length());
} }
} } else {
} else { // Use Get to verify this range
// Use Get to verify this range for (long i = start; i < end; i++) {
for (long i = start; i < end; i++) { std::string from_db;
std::string from_db; std::string keystr = Key(i);
std::string keystr = Key(i); Slice k = keystr;
Slice k = keystr; Status s = db_->Get(options, column_families_[cf], k, &from_db);
Status s = db_->Get(options, k, &from_db); VerifyValue(cf, i, options, shared, from_db, s, true);
VerifyValue(i, options, shared, from_db, s, true); if (from_db.length()) {
if (from_db.length()) { PrintKeyValue(cf, i, from_db.data(), from_db.length());
PrintKeyValue(i, from_db.data(), from_db.length()); }
} }
} }
} }
} }
void VerificationAbort(std::string msg, long key) const { void VerificationAbort(std::string msg, int cf, long key) const {
fprintf(stderr, "Verification failed for key %ld: %s\n", fprintf(stderr, "Verification failed for column family %d key %ld: %s\n",
key, msg.c_str()); cf, key, msg.c_str());
exit(1); exit(1);
} }
void VerifyValue(long key, void VerifyValue(int cf, long key, const ReadOptions& opts,
const ReadOptions &opts, const SharedState& shared, const std::string& value_from_db,
const SharedState &shared, Status s, bool strict = false) const {
const std::string &value_from_db,
Status s,
bool strict=false) const {
// compare value_from_db with the value in the shared state // compare value_from_db with the value in the shared state
char value[100]; char value[100];
uint32_t value_base = shared.Get(key); uint32_t value_base = shared.Get(cf, key);
if (value_base == SharedState::SENTINEL && !strict) { if (value_base == SharedState::SENTINEL && !strict) {
return; return;
} }
if (s.ok()) { if (s.ok()) {
if (value_base == SharedState::SENTINEL) { if (value_base == SharedState::SENTINEL) {
VerificationAbort("Unexpected value found", key); VerificationAbort("Unexpected value found", cf, key);
} }
size_t sz = GenerateValue(value_base, value, sizeof(value)); size_t sz = GenerateValue(value_base, value, sizeof(value));
if (value_from_db.length() != sz) { if (value_from_db.length() != sz) {
VerificationAbort("Length of value read is not equal", key); VerificationAbort("Length of value read is not equal", cf, key);
} }
if (memcmp(value_from_db.data(), value, sz) != 0) { if (memcmp(value_from_db.data(), value, sz) != 0) {
VerificationAbort("Contents of value read don't match", key); VerificationAbort("Contents of value read don't match", cf, key);
} }
} else { } else {
if (value_base != SharedState::SENTINEL) { if (value_base != SharedState::SENTINEL) {
VerificationAbort("Value not found", key); VerificationAbort("Value not found", cf, key);
} }
} }
} }
static void PrintKeyValue(uint32_t key, const char *value, size_t sz) { static void PrintKeyValue(int cf, uint32_t key, const char* value,
if (!FLAGS_verbose) return; size_t sz) {
fprintf(stdout, "%u ==> (%u) ", key, (unsigned int)sz); if (!FLAGS_verbose) {
for (size_t i=0; i<sz; i++) { return;
}
fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
for (size_t i = 0; i < sz; i++) {
fprintf(stdout, "%X", value[i]); fprintf(stdout, "%X", value[i]);
} }
fprintf(stdout, "\n"); fprintf(stdout, "\n");
@ -1290,8 +1346,13 @@ class StressTest {
} }
void PrintEnv() const { void PrintEnv() const {
fprintf(stdout, "LevelDB version : %d.%d\n", fprintf(stdout, "RocksDB version : %d.%d\n", kMajorVersion,
kMajorVersion, kMinorVersion); kMinorVersion);
fprintf(stdout, "Column families : %d\n", FLAGS_column_families);
if (!FLAGS_test_batches_snapshots) {
fprintf(stdout, "Clear CFs one in : %d\n",
FLAGS_clear_column_family_one_in);
}
fprintf(stdout, "Number of threads : %d\n", FLAGS_threads); fprintf(stdout, "Number of threads : %d\n", FLAGS_threads);
fprintf(stdout, fprintf(stdout,
"Ops per thread : %lu\n", "Ops per thread : %lu\n",
@ -1368,43 +1429,41 @@ class StressTest {
void Open() { void Open() {
assert(db_ == nullptr); assert(db_ == nullptr);
Options options; options_.block_cache = cache_;
options.block_cache = cache_; options_.block_cache_compressed = compressed_cache_;
options.block_cache_compressed = compressed_cache_; options_.write_buffer_size = FLAGS_write_buffer_size;
options.write_buffer_size = FLAGS_write_buffer_size; options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.max_write_buffer_number = FLAGS_max_write_buffer_number; options_.min_write_buffer_number_to_merge =
options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge;
FLAGS_min_write_buffer_number_to_merge; options_.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_compactions = FLAGS_max_background_compactions; options_.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style = options_.compaction_style =
static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style); static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
options.block_size = FLAGS_block_size; options_.block_size = FLAGS_block_size;
options.filter_policy = filter_policy_; options_.filter_policy = filter_policy_;
options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size)); options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
options.max_open_files = FLAGS_open_files; options_.max_open_files = FLAGS_open_files;
options.statistics = dbstats; options_.statistics = dbstats;
options.env = FLAGS_env; options_.env = FLAGS_env;
options.disableDataSync = FLAGS_disable_data_sync; options_.disableDataSync = FLAGS_disable_data_sync;
options.use_fsync = FLAGS_use_fsync; options_.use_fsync = FLAGS_use_fsync;
options.allow_mmap_reads = FLAGS_mmap_read; options_.allow_mmap_reads = FLAGS_mmap_read;
rocksdb_kill_odds = FLAGS_kill_random_test; rocksdb_kill_odds = FLAGS_kill_random_test;
options.target_file_size_base = FLAGS_target_file_size_base; options_.target_file_size_base = FLAGS_target_file_size_base;
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
options.max_bytes_for_level_multiplier = options_.max_bytes_for_level_multiplier =
FLAGS_max_bytes_for_level_multiplier; FLAGS_max_bytes_for_level_multiplier;
options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
options.level0_slowdown_writes_trigger = options_.level0_slowdown_writes_trigger =
FLAGS_level0_slowdown_writes_trigger; FLAGS_level0_slowdown_writes_trigger;
options.level0_file_num_compaction_trigger = options_.level0_file_num_compaction_trigger =
FLAGS_level0_file_num_compaction_trigger; FLAGS_level0_file_num_compaction_trigger;
options.compression = FLAGS_compression_type_e; options_.compression = FLAGS_compression_type_e;
options.create_if_missing = true; options_.create_if_missing = true;
options.disable_seek_compaction = FLAGS_disable_seek_compaction; options_.disable_seek_compaction = FLAGS_disable_seek_compaction;
options.delete_obsolete_files_period_micros = options_.max_manifest_file_size = 10 * 1024;
FLAGS_delete_obsolete_files_period_micros; options_.filter_deletes = FLAGS_filter_deletes;
options.max_manifest_file_size = 1024;
options.filter_deletes = FLAGS_filter_deletes;
if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
fprintf(stderr, fprintf(stderr,
"prefix_size should be non-zero iff memtablerep == prefix_hash\n"); "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
@ -1412,51 +1471,107 @@ class StressTest {
} }
switch (FLAGS_rep_factory) { switch (FLAGS_rep_factory) {
case kHashSkipList: case kHashSkipList:
options.memtable_factory.reset(NewHashSkipListRepFactory()); options_.memtable_factory.reset(NewHashSkipListRepFactory());
break; break;
case kSkipList: case kSkipList:
// no need to do anything // no need to do anything
break; break;
case kVectorRep: case kVectorRep:
options.memtable_factory.reset(new VectorRepFactory()); options_.memtable_factory.reset(new VectorRepFactory());
break; break;
} }
static Random purge_percent(1000); // no benefit from non-determinism here static Random purge_percent(1000); // no benefit from non-determinism here
if (static_cast<int32_t>(purge_percent.Uniform(100)) < if (static_cast<int32_t>(purge_percent.Uniform(100)) <
FLAGS_purge_redundant_percent - 1) { FLAGS_purge_redundant_percent - 1) {
options.purge_redundant_kvs_while_flush = false; options_.purge_redundant_kvs_while_flush = false;
} }
if (FLAGS_use_merge) { if (FLAGS_use_merge) {
options.merge_operator = MergeOperators::CreatePutOperator(); options_.merge_operator = MergeOperators::CreatePutOperator();
} }
// set universal style compaction configurations, if applicable // set universal style compaction configurations, if applicable
if (FLAGS_universal_size_ratio != 0) { if (FLAGS_universal_size_ratio != 0) {
options.compaction_options_universal.size_ratio = options_.compaction_options_universal.size_ratio =
FLAGS_universal_size_ratio; FLAGS_universal_size_ratio;
} }
if (FLAGS_universal_min_merge_width != 0) { if (FLAGS_universal_min_merge_width != 0) {
options.compaction_options_universal.min_merge_width = options_.compaction_options_universal.min_merge_width =
FLAGS_universal_min_merge_width; FLAGS_universal_min_merge_width;
} }
if (FLAGS_universal_max_merge_width != 0) { if (FLAGS_universal_max_merge_width != 0) {
options.compaction_options_universal.max_merge_width = options_.compaction_options_universal.max_merge_width =
FLAGS_universal_max_merge_width; FLAGS_universal_max_merge_width;
} }
if (FLAGS_universal_max_size_amplification_percent != 0) { if (FLAGS_universal_max_size_amplification_percent != 0) {
options.compaction_options_universal.max_size_amplification_percent = options_.compaction_options_universal.max_size_amplification_percent =
FLAGS_universal_max_size_amplification_percent; FLAGS_universal_max_size_amplification_percent;
} }
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
Status s; Status s;
if (FLAGS_ttl == -1) { if (FLAGS_ttl == -1) {
s = DB::Open(options, FLAGS_db, &db_); std::vector<std::string> existing_column_families;
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
&existing_column_families); // ignore errors
if (!s.ok()) {
// DB doesn't exist
assert(existing_column_families.empty());
assert(column_family_names_.empty());
column_family_names_.push_back(default_column_family_name);
} else if (column_family_names_.empty()) {
// this is the first call to the function Open()
column_family_names_ = existing_column_families;
} else {
// this is a reopen. just assert that existing column_family_names are
// equivalent to what we remember
auto sorted_cfn = column_family_names_;
sort(sorted_cfn.begin(), sorted_cfn.end());
sort(existing_column_families.begin(), existing_column_families.end());
if (sorted_cfn != existing_column_families) {
fprintf(stderr,
"Expected column families differ from the existing:\n");
printf("Expected: {");
for (auto cf : sorted_cfn) {
printf("%s ", cf.c_str());
}
printf("}\n");
printf("Existing: {");
for (auto cf : existing_column_families) {
printf("%s ", cf.c_str());
}
printf("}\n");
}
assert(sorted_cfn == existing_column_families);
}
std::vector<ColumnFamilyDescriptor> cf_descriptors;
for (auto name : column_family_names_) {
if (name != default_column_family_name) {
new_column_family_name_ =
std::max(new_column_family_name_.load(), std::stoi(name) + 1);
}
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
}
s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
&column_families_, &db_);
if (s.ok()) {
while (s.ok() &&
column_families_.size() < (size_t)FLAGS_column_families) {
ColumnFamilyHandle* cf = nullptr;
std::string name = std::to_string(new_column_family_name_.load());
new_column_family_name_++;
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), name, &cf);
column_families_.push_back(cf);
column_family_names_.push_back(name);
}
}
assert(!s.ok() || column_families_.size() ==
static_cast<size_t>(FLAGS_column_families));
} else { } else {
s = UtilityDB::OpenTtlDB(options, FLAGS_db, &sdb_, FLAGS_ttl); StackableDB* sdb;
db_ = sdb_; s = UtilityDB::OpenTtlDB(options_, FLAGS_db, &sdb, FLAGS_ttl);
db_ = sdb;
} }
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -1465,13 +1580,11 @@ class StressTest {
} }
void Reopen() { void Reopen() {
// do not close the db. Just delete the lock file. This for (auto cf : column_families_) {
// simulates a crash-recovery kind of situation. delete cf;
if (FLAGS_ttl != -1) {
((DBWithTTL*) db_)->TEST_Destroy_DBWithTtl();
} else {
((DBImpl*) db_)->TEST_Destroy_DBImpl();
} }
column_families_.clear();
delete db_;
db_ = nullptr; db_ = nullptr;
num_times_reopened_++; num_times_reopened_++;
@ -1493,14 +1606,15 @@ class StressTest {
shared_ptr<Cache> compressed_cache_; shared_ptr<Cache> compressed_cache_;
const FilterPolicy* filter_policy_; const FilterPolicy* filter_policy_;
DB* db_; DB* db_;
StackableDB* sdb_; Options options_;
std::vector<ColumnFamilyHandle*> column_families_;
std::vector<std::string> column_family_names_;
std::atomic<int> new_column_family_name_;
int num_times_reopened_; int num_times_reopened_;
}; };
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {
google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
" [OPTIONS]..."); " [OPTIONS]...");

@ -81,7 +81,7 @@ Status CreateLoggerFromOptions(
const std::string& dbname, const std::string& dbname,
const std::string& db_log_dir, const std::string& db_log_dir,
Env* env, Env* env,
const Options& options, const DBOptions& options,
std::shared_ptr<Logger>* logger) { std::shared_ptr<Logger>* logger) {
std::string db_absolute_path; std::string db_absolute_path;
env->GetAbsolutePath(dbname, &db_absolute_path); env->GetAbsolutePath(dbname, &db_absolute_path);

@ -85,7 +85,7 @@ Status CreateLoggerFromOptions(
const std::string& dbname, const std::string& dbname,
const std::string& db_log_dir, const std::string& db_log_dir,
Env* env, Env* env,
const Options& options, const DBOptions& options,
std::shared_ptr<Logger>* logger); std::shared_ptr<Logger>* logger);
} // namespace rocksdb } // namespace rocksdb

@ -197,7 +197,7 @@ TEST(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
} }
TEST(AutoRollLoggerTest, CreateLoggerFromOptions) { TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
Options options; DBOptions options;
shared_ptr<Logger> logger; shared_ptr<Logger> logger;
// Normal logger // Normal logger

@ -314,24 +314,12 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) {
} }
static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
#ifdef __SSE4_2__ #ifdef __SSE4_2__
*l = _mm_crc32_u64(*l, LE_LOAD64(*p)); *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
*p += 8; *p += 8;
#else #else
Slow_CRC32(l, p); Slow_CRC32(l, p);
#endif #endif
}
// Detect if SS42 or not.
static bool isSSE42() {
#ifdef __GNUC__
uint32_t c_;
uint32_t d_;
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
return c_ & (1U << 20); // copied from CpuId.h in Folly.
#else
return false;
#endif
} }
template<void (*CRC32)(uint64_t*, uint8_t const**)> template<void (*CRC32)(uint64_t*, uint8_t const**)>
@ -377,6 +365,18 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return l ^ 0xffffffffu; return l ^ 0xffffffffu;
} }
// Detect if SS42 or not.
static bool isSSE42() {
#if defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
uint32_t c_;
uint32_t d_;
__asm__("cpuid" : "=c"(c_), "=d"(d_) : "a"(1) : "ebx");
return c_ & (1U << 20); // copied from CpuId.h in Folly.
#else
return false;
#endif
}
typedef uint32_t (*Function)(uint32_t, const char*, size_t); typedef uint32_t (*Function)(uint32_t, const char*, size_t);
static inline Function Choose_Extend() { static inline Function Choose_Extend() {

@ -3,6 +3,8 @@
// LICENSE file in the root directory of this source tree. An additional grant // LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory. // of patent rights can be found in the PATENTS file in the same directory.
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <gflags/gflags.h> #include <gflags/gflags.h>
@ -74,11 +76,12 @@ TEST(DynamicBloomTest, VaryingLengths) {
// Count number of filters that significantly exceed the false positive rate // Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0; int mediocre_filters = 0;
int good_filters = 0; int good_filters = 0;
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
fprintf(stderr, "bits_per_key: %d num_probes: %d\n", fprintf(stderr, "bits_per_key: %d num_probes: %d\n",
FLAGS_bits_per_key, FLAGS_num_probes); FLAGS_bits_per_key, num_probes);
for (uint32_t cl_per_block = 0; cl_per_block < FLAGS_num_probes; for (uint32_t cl_per_block = 0; cl_per_block < num_probes;
++cl_per_block) { ++cl_per_block) {
for (uint32_t num = 1; num <= 10000; num = NextNum(num)) { for (uint32_t num = 1; num <= 10000; num = NextNum(num)) {
uint32_t bloom_bits = 0; uint32_t bloom_bits = 0;
@ -88,7 +91,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
bloom_bits = std::max(num * FLAGS_bits_per_key, bloom_bits = std::max(num * FLAGS_bits_per_key,
cl_per_block * CACHE_LINE_SIZE * 8); cl_per_block * CACHE_LINE_SIZE * 8);
} }
DynamicBloom bloom(bloom_bits, cl_per_block, FLAGS_num_probes); DynamicBloom bloom(bloom_bits, cl_per_block, num_probes);
for (uint64_t i = 0; i < num; i++) { for (uint64_t i = 0; i < num; i++) {
bloom.Add(Key(i, buffer)); bloom.Add(Key(i, buffer));
ASSERT_TRUE(bloom.MayContain(Key(i, buffer))); ASSERT_TRUE(bloom.MayContain(Key(i, buffer)));
@ -127,6 +130,7 @@ TEST(DynamicBloomTest, VaryingLengths) {
TEST(DynamicBloomTest, perf) { TEST(DynamicBloomTest, perf) {
StopWatchNano timer(Env::Default()); StopWatchNano timer(Env::Default());
uint32_t num_probes = static_cast<uint32_t>(FLAGS_num_probes);
if (!FLAGS_enable_perf) { if (!FLAGS_enable_perf) {
return; return;
@ -134,9 +138,9 @@ TEST(DynamicBloomTest, perf) {
for (uint64_t m = 1; m <= 8; ++m) { for (uint64_t m = 1; m <= 8; ++m) {
const uint64_t num_keys = m * 8 * 1024 * 1024; const uint64_t num_keys = m * 8 * 1024 * 1024;
fprintf(stderr, "testing %luM keys\n", m * 8); fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8);
DynamicBloom std_bloom(num_keys * 10, 0, FLAGS_num_probes); DynamicBloom std_bloom(num_keys * 10, 0, num_probes);
timer.Start(); timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) { for (uint64_t i = 1; i <= num_keys; ++i) {
@ -144,7 +148,7 @@ TEST(DynamicBloomTest, perf) {
} }
uint64_t elapsed = timer.ElapsedNanos(); uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg add latency %lu\n", fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n",
elapsed / num_keys); elapsed / num_keys);
uint64_t count = 0; uint64_t count = 0;
@ -155,13 +159,13 @@ TEST(DynamicBloomTest, perf) {
} }
} }
elapsed = timer.ElapsedNanos(); elapsed = timer.ElapsedNanos();
fprintf(stderr, "standard bloom, avg query latency %lu\n", fprintf(stderr, "standard bloom, avg query latency %" PRIu64 "\n",
elapsed / count); elapsed / count);
ASSERT_TRUE(count == num_keys); ASSERT_TRUE(count == num_keys);
for (int cl_per_block = 1; cl_per_block <= FLAGS_num_probes; for (uint32_t cl_per_block = 1; cl_per_block <= num_probes;
++cl_per_block) { ++cl_per_block) {
DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, FLAGS_num_probes); DynamicBloom blocked_bloom(num_keys * 10, cl_per_block, num_probes);
timer.Start(); timer.Start();
for (uint64_t i = 1; i <= num_keys; ++i) { for (uint64_t i = 1; i <= num_keys; ++i) {
@ -169,7 +173,7 @@ TEST(DynamicBloomTest, perf) {
} }
uint64_t elapsed = timer.ElapsedNanos(); uint64_t elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg add latency %lu\n", fprintf(stderr, "blocked bloom(%d), avg add latency %" PRIu64 "\n",
cl_per_block, elapsed / num_keys); cl_per_block, elapsed / num_keys);
uint64_t count = 0; uint64_t count = 0;
@ -182,7 +186,7 @@ TEST(DynamicBloomTest, perf) {
} }
elapsed = timer.ElapsedNanos(); elapsed = timer.ElapsedNanos();
fprintf(stderr, "blocked bloom(%d), avg query latency %lu\n", fprintf(stderr, "blocked bloom(%d), avg query latency %" PRIu64 "\n",
cl_per_block, elapsed / count); cl_per_block, elapsed / count);
ASSERT_TRUE(count == num_keys); ASSERT_TRUE(count == num_keys);
} }

@ -231,7 +231,7 @@ EnvWrapper::~EnvWrapper() {
namespace { // anonymous namespace namespace { // anonymous namespace
void AssignEnvOptions(EnvOptions* env_options, const Options& options) { void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->use_os_buffer = options.allow_os_buffer; env_options->use_os_buffer = options.allow_os_buffer;
env_options->use_mmap_reads = options.allow_mmap_reads; env_options->use_mmap_reads = options.allow_mmap_reads;
env_options->use_mmap_writes = options.allow_mmap_writes; env_options->use_mmap_writes = options.allow_mmap_writes;
@ -249,12 +249,12 @@ EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
return env_options; return env_options;
} }
EnvOptions::EnvOptions(const Options& options) { EnvOptions::EnvOptions(const DBOptions& options) {
AssignEnvOptions(this, options); AssignEnvOptions(this, options);
} }
EnvOptions::EnvOptions() { EnvOptions::EnvOptions() {
Options options; DBOptions options;
AssignEnvOptions(this, options); AssignEnvOptions(this, options);
} }

@ -22,12 +22,6 @@ namespace {
typedef const char* Key; typedef const char* Key;
struct Node { struct Node {
explicit Node(const Key& k) :
key(k) {
}
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can // Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary. // add the appropriate barriers as necessary.
Node* Next() { Node* Next() {
@ -40,17 +34,19 @@ struct Node {
// pointer observes a fully initialized version of the inserted node. // pointer observes a fully initialized version of the inserted node.
next_.Release_Store(x); next_.Release_Store(x);
} }
// No-barrier variants that can be safely used in a few locations. // No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next() { Node* NoBarrier_Next() {
return reinterpret_cast<Node*>(next_.NoBarrier_Load()); return reinterpret_cast<Node*>(next_.NoBarrier_Load());
} }
void NoBarrier_SetNext(Node* x) { void NoBarrier_SetNext(Node* x) {
next_.NoBarrier_Store(x); next_.NoBarrier_Store(x);
} }
private: private:
port::AtomicPointer next_; port::AtomicPointer next_;
public:
char key[0];
}; };
class HashLinkListRep : public MemTableRep { class HashLinkListRep : public MemTableRep {
@ -58,7 +54,9 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena,
const SliceTransform* transform, size_t bucket_size); const SliceTransform* transform, size_t bucket_size);
virtual void Insert(const char* key) override; virtual KeyHandle Allocate(const size_t len, char** buf) override;
virtual void Insert(KeyHandle handle) override;
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -93,8 +91,6 @@ class HashLinkListRep : public MemTableRep {
const SliceTransform* transform_; const SliceTransform* transform_;
const MemTableRep::KeyComparator& compare_; const MemTableRep::KeyComparator& compare_;
// immutable after construction
Arena* const arena_;
bool BucketContains(Node* head, const Slice& key) const; bool BucketContains(Node* head, const Slice& key) const;
@ -114,11 +110,6 @@ class HashLinkListRep : public MemTableRep {
return GetBucket(GetHash(slice)); return GetBucket(GetHash(slice));
} }
Node* NewNode(const Key& key) {
char* mem = arena_->AllocateAligned(sizeof(Node));
return new (mem) Node(key);
}
bool Equal(const Slice& a, const Key& b) const { bool Equal(const Slice& a, const Key& b) const {
return (compare_(b, a) == 0); return (compare_(b, a) == 0);
} }
@ -318,10 +309,10 @@ class HashLinkListRep : public MemTableRep {
HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, Arena* arena, const SliceTransform* transform,
size_t bucket_size) size_t bucket_size)
: bucket_size_(bucket_size), : MemTableRep(arena),
bucket_size_(bucket_size),
transform_(transform), transform_(transform),
compare_(compare), compare_(compare) {
arena_(arena) {
char* mem = arena_->AllocateAligned( char* mem = arena_->AllocateAligned(
sizeof(port::AtomicPointer) * bucket_size); sizeof(port::AtomicPointer) * bucket_size);
@ -335,15 +326,22 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
HashLinkListRep::~HashLinkListRep() { HashLinkListRep::~HashLinkListRep() {
} }
void HashLinkListRep::Insert(const char* key) { KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
assert(!Contains(key)); char* mem = arena_->AllocateAligned(sizeof(Node) + len);
Slice internal_key = GetLengthPrefixedSlice(key); Node* x = new (mem) Node();
*buf = x->key;
return static_cast<void*>(x);
}
void HashLinkListRep::Insert(KeyHandle handle) {
Node* x = static_cast<Node*>(handle);
assert(!Contains(x->key));
Slice internal_key = GetLengthPrefixedSlice(x->key);
auto transformed = GetPrefix(internal_key); auto transformed = GetPrefix(internal_key);
auto& bucket = buckets_[GetHash(transformed)]; auto& bucket = buckets_[GetHash(transformed)];
Node* head = static_cast<Node*>(bucket.Acquire_Load()); Node* head = static_cast<Node*>(bucket.Acquire_Load());
if (!head) { if (!head) {
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when // NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i]. // we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(nullptr); x->NoBarrier_SetNext(nullptr);
@ -372,9 +370,7 @@ void HashLinkListRep::Insert(const char* key) {
} }
// Our data structure does not allow duplicate insertion // Our data structure does not allow duplicate insertion
assert(cur == nullptr || !Equal(key, cur->key)); assert(cur == nullptr || !Equal(x->key, cur->key));
Node* x = NewNode(key);
// NoBarrier_SetNext() suffices since we will add a barrier when // NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i]. // we publish a pointer to "x" in prev[i].

@ -25,7 +25,7 @@ class HashSkipListRep : public MemTableRep {
const SliceTransform* transform, size_t bucket_size, const SliceTransform* transform, size_t bucket_size,
int32_t skiplist_height, int32_t skiplist_branching_factor); int32_t skiplist_height, int32_t skiplist_branching_factor);
virtual void Insert(const char* key) override; virtual void Insert(KeyHandle handle) override;
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -225,7 +225,8 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, Arena* arena, const SliceTransform* transform,
size_t bucket_size, int32_t skiplist_height, size_t bucket_size, int32_t skiplist_height,
int32_t skiplist_branching_factor) int32_t skiplist_branching_factor)
: bucket_size_(bucket_size), : MemTableRep(arena),
bucket_size_(bucket_size),
skiplist_height_(skiplist_height), skiplist_height_(skiplist_height),
skiplist_branching_factor_(skiplist_branching_factor), skiplist_branching_factor_(skiplist_branching_factor),
transform_(transform), transform_(transform),
@ -255,7 +256,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket(
return bucket; return bucket;
} }
void HashSkipListRep::Insert(const char* key) { void HashSkipListRep::Insert(KeyHandle handle) {
auto* key = static_cast<char*>(handle);
assert(!Contains(key)); assert(!Contains(key));
auto transformed = transform_->Transform(UserKey(key)); auto transformed = transform_->Transform(UserKey(key));
auto bucket = GetInitializedBucket(transformed); auto bucket = GetInitializedBucket(transformed);

@ -11,6 +11,7 @@
#include "db/filename.h" #include "db/filename.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/cache.h"
#include "util/coding.h" #include "util/coding.h"
#include <ctime> #include <ctime>
@ -152,6 +153,8 @@ LDBCommand* LDBCommand::SelectCommand(
return new DBLoaderCommand(cmdParams, option_map, flags); return new DBLoaderCommand(cmdParams, option_map, flags);
} else if (cmd == ManifestDumpCommand::Name()) { } else if (cmd == ManifestDumpCommand::Name()) {
return new ManifestDumpCommand(cmdParams, option_map, flags); return new ManifestDumpCommand(cmdParams, option_map, flags);
} else if (cmd == ListColumnFamiliesCommand::Name()) {
return new ListColumnFamiliesCommand(cmdParams, option_map, flags);
} else if (cmd == InternalDumpCommand::Name()) { } else if (cmd == InternalDumpCommand::Name()) {
return new InternalDumpCommand(cmdParams, option_map, flags); return new InternalDumpCommand(cmdParams, option_map, flags);
} else if (cmd == CheckConsistencyCommand::Name()) { } else if (cmd == CheckConsistencyCommand::Name()) {
@ -540,11 +543,10 @@ void ManifestDumpCommand::DoCommand() {
EnvOptions sopt; EnvOptions sopt;
std::string file(manifestfile); std::string file(manifestfile);
std::string dbname("dummy"); std::string dbname("dummy");
TableCache* tc = new TableCache(dbname, &options, sopt, 10); std::shared_ptr<Cache> tc(NewLRUCache(
const InternalKeyComparator* cmp = options.max_open_files - 10, options.table_cache_numshardbits,
new InternalKeyComparator(options.comparator); options.table_cache_remove_scan_count_limit));
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc.get());
VersionSet* versions = new VersionSet(dbname, &options, sopt, tc, cmp);
Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_); Status s = versions->DumpManifest(options, file, verbose_, is_key_hex_);
if (!s.ok()) { if (!s.ok()) {
printf("Error in processing file %s %s\n", manifestfile.c_str(), printf("Error in processing file %s %s\n", manifestfile.c_str(),
@ -557,6 +559,48 @@ void ManifestDumpCommand::DoCommand() {
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
void ListColumnFamiliesCommand::Help(string& ret) {
ret.append(" ");
ret.append(ListColumnFamiliesCommand::Name());
ret.append(" full_path_to_db_directory ");
ret.append("\n");
}
ListColumnFamiliesCommand::ListColumnFamiliesCommand(
const vector<string>& params, const map<string, string>& options,
const vector<string>& flags)
: LDBCommand(options, flags, false, {}) {
if (params.size() != 1) {
exec_state_ = LDBCommandExecuteResult::FAILED(
"dbname must be specified for the list_column_families command");
} else {
dbname_ = params[0];
}
}
void ListColumnFamiliesCommand::DoCommand() {
vector<string> column_families;
Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families);
if (!s.ok()) {
printf("Error in processing db %s %s\n", dbname_.c_str(),
s.ToString().c_str());
} else {
printf("Column families in %s: \n{", dbname_.c_str());
bool first = true;
for (auto cf : column_families) {
if (!first) {
printf(", ");
}
first = false;
printf("%s", cf.c_str());
}
printf("}\n");
}
}
// ----------------------------------------------------------------------------
string ReadableTime(int unixtime) { string ReadableTime(int unixtime) {
char time_buffer [80]; char time_buffer [80];
time_t rawtime = unixtime; time_t rawtime = unixtime;
@ -1018,19 +1062,26 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
int* levels) { int* levels) {
EnvOptions soptions; EnvOptions soptions;
TableCache tc(db_path_, &opt, soptions, 10); std::shared_ptr<Cache> tc(
NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits,
opt.table_cache_remove_scan_count_limit));
const InternalKeyComparator cmp(opt.comparator); const InternalKeyComparator cmp(opt.comparator);
VersionSet versions(db_path_, &opt, soptions, &tc, &cmp); VersionSet versions(db_path_, &opt, soptions, tc.get());
std::vector<ColumnFamilyDescriptor> dummy;
ColumnFamilyDescriptor dummy_descriptor(default_column_family_name,
ColumnFamilyOptions(opt));
dummy.push_back(dummy_descriptor);
// We rely the VersionSet::Recover to tell us the internal data structures // We rely the VersionSet::Recover to tell us the internal data structures
// in the db. And the Recover() should never do any change // in the db. And the Recover() should never do any change
// (like LogAndApply) to the manifest file. // (like LogAndApply) to the manifest file.
Status st = versions.Recover(); Status st = versions.Recover(dummy);
if (!st.ok()) { if (!st.ok()) {
return st; return st;
} }
int max = -1; int max = -1;
for (int i = 0; i < versions.NumberLevels(); i++) { auto default_cfd = versions.GetColumnFamilySet()->GetDefault();
if (versions.current()->NumLevelFiles(i)) { for (int i = 0; i < default_cfd->NumberLevels(); i++) {
if (default_cfd->current()->NumLevelFiles(i)) {
max = i; max = i;
} }
} }
@ -1075,7 +1126,6 @@ void ReduceDBLevelsCommand::DoCommand() {
CloseDB(); CloseDB();
EnvOptions soptions; EnvOptions soptions;
st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_);
if (!st.ok()) { if (!st.ok()) {
exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString());

@ -484,6 +484,23 @@ private:
static const string ARG_PATH; static const string ARG_PATH;
}; };
class ListColumnFamiliesCommand : public LDBCommand {
public:
static string Name() { return "list_column_families"; }
ListColumnFamiliesCommand(const vector<string>& params,
const map<string, string>& options,
const vector<string>& flags);
static void Help(string& ret);
virtual void DoCommand();
virtual bool NoDBOpen() { return true; }
private:
string dbname_;
};
class ReduceDBLevelsCommand : public LDBCommand { class ReduceDBLevelsCommand : public LDBCommand {
public: public:
static string Name() { return "reduce_levels"; } static string Name() { return "reduce_levels"; }

@ -64,6 +64,7 @@ public:
DBDumperCommand::Help(ret); DBDumperCommand::Help(ret);
DBLoaderCommand::Help(ret); DBLoaderCommand::Help(ret);
ManifestDumpCommand::Help(ret); ManifestDumpCommand::Help(ret);
ListColumnFamiliesCommand::Help(ret);
InternalDumpCommand::Help(ret); InternalDumpCommand::Help(ret);
fprintf(stderr, "%s\n", ret.c_str()); fprintf(stderr, "%s\n", ret.c_str());

@ -26,23 +26,17 @@
namespace rocksdb { namespace rocksdb {
Options::Options() ColumnFamilyOptions::ColumnFamilyOptions()
: comparator(BytewiseComparator()), : comparator(BytewiseComparator()),
merge_operator(nullptr), merge_operator(nullptr),
compaction_filter(nullptr), compaction_filter(nullptr),
compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>( compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
new DefaultCompactionFilterFactory())), new DefaultCompactionFilterFactory())),
compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()), compaction_filter_factory_v2(
create_if_missing(false), new DefaultCompactionFilterFactoryV2()),
error_if_exists(false),
paranoid_checks(true),
env(Env::Default()),
info_log(nullptr),
info_log_level(INFO),
write_buffer_size(4 << 20), write_buffer_size(4 << 20),
max_write_buffer_number(2), max_write_buffer_number(2),
min_write_buffer_number_to_merge(1), min_write_buffer_number_to_merge(1),
max_open_files(5000),
block_cache(nullptr), block_cache(nullptr),
block_cache_compressed(nullptr), block_cache_compressed(nullptr),
block_size(4096), block_size(4096),
@ -64,88 +58,252 @@ Options::Options()
expanded_compaction_factor(25), expanded_compaction_factor(25),
source_compaction_factor(1), source_compaction_factor(1),
max_grandparent_overlap_factor(10), max_grandparent_overlap_factor(10),
disable_seek_compaction(true),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
no_block_cache(false),
arena_block_size(0),
disable_auto_compactions(false),
purge_redundant_kvs_while_flush(true),
block_size_deviation(10),
compaction_style(kCompactionStyleLevel),
verify_checksums_in_compaction(true),
filter_deletes(false),
max_sequential_skip_in_iterations(8),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
table_factory(
std::shared_ptr<TableFactory>(new BlockBasedTableFactory())),
inplace_update_support(false),
inplace_update_num_locks(10000),
inplace_callback(nullptr),
memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6),
bloom_locality(0),
max_successive_merges(0),
min_partial_merge_operands(2) {
assert(memtable_factory.get() != nullptr);
}
// Constructs ColumnFamilyOptions by copying the column-family-scoped subset
// of fields out of a legacy monolithic Options object (the DB-wide fields go
// to DBOptions instead). The initializer order must mirror the member
// declaration order in options.h; do not reorder.
ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
: comparator(options.comparator),
merge_operator(options.merge_operator),
compaction_filter(options.compaction_filter),
compaction_filter_factory(options.compaction_filter_factory),
compaction_filter_factory_v2(options.compaction_filter_factory_v2),
write_buffer_size(options.write_buffer_size),
max_write_buffer_number(options.max_write_buffer_number),
min_write_buffer_number_to_merge(
options.min_write_buffer_number_to_merge),
block_cache(options.block_cache),
block_cache_compressed(options.block_cache_compressed),
block_size(options.block_size),
block_restart_interval(options.block_restart_interval),
compression(options.compression),
compression_per_level(options.compression_per_level),
compression_opts(options.compression_opts),
filter_policy(options.filter_policy),
prefix_extractor(options.prefix_extractor),
whole_key_filtering(options.whole_key_filtering),
num_levels(options.num_levels),
level0_file_num_compaction_trigger(
options.level0_file_num_compaction_trigger),
level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
level0_stop_writes_trigger(options.level0_stop_writes_trigger),
max_mem_compaction_level(options.max_mem_compaction_level),
target_file_size_base(options.target_file_size_base),
target_file_size_multiplier(options.target_file_size_multiplier),
max_bytes_for_level_base(options.max_bytes_for_level_base),
max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
max_bytes_for_level_multiplier_additional(
options.max_bytes_for_level_multiplier_additional),
expanded_compaction_factor(options.expanded_compaction_factor),
source_compaction_factor(options.source_compaction_factor),
max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
disable_seek_compaction(options.disable_seek_compaction),
soft_rate_limit(options.soft_rate_limit),
hard_rate_limit(options.hard_rate_limit),
rate_limit_delay_max_milliseconds(
options.rate_limit_delay_max_milliseconds),
no_block_cache(options.no_block_cache),
arena_block_size(options.arena_block_size),
disable_auto_compactions(options.disable_auto_compactions),
purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
block_size_deviation(options.block_size_deviation),
compaction_style(options.compaction_style),
verify_checksums_in_compaction(options.verify_checksums_in_compaction),
compaction_options_universal(options.compaction_options_universal),
filter_deletes(options.filter_deletes),
max_sequential_skip_in_iterations(
options.max_sequential_skip_in_iterations),
memtable_factory(options.memtable_factory),
table_factory(options.table_factory),
table_properties_collectors(options.table_properties_collectors),
inplace_update_support(options.inplace_update_support),
inplace_update_num_locks(options.inplace_update_num_locks),
inplace_callback(options.inplace_callback),
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
bloom_locality(options.bloom_locality),
max_successive_merges(options.max_successive_merges),
min_partial_merge_operands(options.min_partial_merge_operands) {
// A memtable factory is mandatory; a default-constructed Options always
// supplies one, so a null here indicates a corrupted source Options.
assert(memtable_factory.get() != nullptr);
}
DBOptions::DBOptions()
: create_if_missing(false),
error_if_exists(false),
paranoid_checks(true),
env(Env::Default()),
info_log(nullptr),
info_log_level(INFO),
max_open_files(5000),
statistics(nullptr),
disableDataSync(false), disableDataSync(false),
use_fsync(false), use_fsync(false),
db_stats_log_interval(1800), db_stats_log_interval(1800),
db_log_dir(""), db_log_dir(""),
wal_dir(""), wal_dir(""),
disable_seek_compaction(true),
delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL), delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
max_background_compactions(1), max_background_compactions(1),
max_background_flushes(1), max_background_flushes(1),
max_log_file_size(0), max_log_file_size(0),
log_file_time_to_roll(0), log_file_time_to_roll(0),
keep_log_file_num(1000), keep_log_file_num(1000),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
max_manifest_file_size(std::numeric_limits<uint64_t>::max()), max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
no_block_cache(false),
table_cache_numshardbits(4), table_cache_numshardbits(4),
table_cache_remove_scan_count_limit(16), table_cache_remove_scan_count_limit(16),
arena_block_size(0),
disable_auto_compactions(false),
WAL_ttl_seconds(0), WAL_ttl_seconds(0),
WAL_size_limit_MB(0), WAL_size_limit_MB(0),
manifest_preallocation_size(4 * 1024 * 1024), manifest_preallocation_size(4 * 1024 * 1024),
purge_redundant_kvs_while_flush(true),
allow_os_buffer(true), allow_os_buffer(true),
allow_mmap_reads(false), allow_mmap_reads(false),
allow_mmap_writes(false), allow_mmap_writes(false),
is_fd_close_on_exec(true), is_fd_close_on_exec(true),
skip_log_error_on_recovery(false), skip_log_error_on_recovery(false),
stats_dump_period_sec(3600), stats_dump_period_sec(3600),
block_size_deviation(10),
advise_random_on_open(true), advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL), access_hint_on_compaction_start(NORMAL),
use_adaptive_mutex(false), use_adaptive_mutex(false),
bytes_per_sync(0), bytes_per_sync(0),
compaction_style(kCompactionStyleLevel), allow_thread_local(true) {}
verify_checksums_in_compaction(true),
filter_deletes(false), DBOptions::DBOptions(const Options& options)
max_sequential_skip_in_iterations(8), : create_if_missing(options.create_if_missing),
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)), error_if_exists(options.error_if_exists),
table_factory( paranoid_checks(options.paranoid_checks),
std::shared_ptr<TableFactory>(new BlockBasedTableFactory())), env(options.env),
inplace_update_support(false), info_log(options.info_log),
inplace_update_num_locks(10000), info_log_level(options.info_log_level),
inplace_callback(nullptr), max_open_files(options.max_open_files),
memtable_prefix_bloom_bits(0), statistics(options.statistics),
memtable_prefix_bloom_probes(6), disableDataSync(options.disableDataSync),
bloom_locality(0), use_fsync(options.use_fsync),
max_successive_merges(0), db_stats_log_interval(options.db_stats_log_interval),
min_partial_merge_operands(2), db_log_dir(options.db_log_dir),
allow_thread_local(true) { wal_dir(options.wal_dir),
assert(memtable_factory.get() != nullptr); delete_obsolete_files_period_micros(
} options.delete_obsolete_files_period_micros),
max_background_compactions(options.max_background_compactions),
max_background_flushes(options.max_background_flushes),
max_log_file_size(options.max_log_file_size),
log_file_time_to_roll(options.log_file_time_to_roll),
keep_log_file_num(options.keep_log_file_num),
max_manifest_file_size(options.max_manifest_file_size),
table_cache_numshardbits(options.table_cache_numshardbits),
table_cache_remove_scan_count_limit(
options.table_cache_remove_scan_count_limit),
WAL_ttl_seconds(options.WAL_ttl_seconds),
WAL_size_limit_MB(options.WAL_size_limit_MB),
manifest_preallocation_size(options.manifest_preallocation_size),
allow_os_buffer(options.allow_os_buffer),
allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes),
is_fd_close_on_exec(options.is_fd_close_on_exec),
skip_log_error_on_recovery(options.skip_log_error_on_recovery),
stats_dump_period_sec(options.stats_dump_period_sec),
advise_random_on_open(options.advise_random_on_open),
access_hint_on_compaction_start(options.access_hint_on_compaction_start),
use_adaptive_mutex(options.use_adaptive_mutex),
bytes_per_sync(options.bytes_per_sync),
allow_thread_local(options.allow_thread_local) {}
static const char* const access_hints[] = { static const char* const access_hints[] = {
"NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
}; };
void void DBOptions::Dump(Logger* log) const {
Options::Dump(Logger* log) const
{
Log(log," Options.comparator: %s", comparator->Name());
Log(log," Options.merge_operator: %s",
merge_operator? merge_operator->Name() : "None");
Log(log," Options.compaction_filter: %s",
compaction_filter? compaction_filter->Name() : "None");
Log(log," Options.compaction_filter_factory: %s",
compaction_filter_factory->Name());
Log(log, " Options.compaction_filter_factory_v2: %s",
compaction_filter_factory_v2->Name());
Log(log," Options.memtable_factory: %s",
memtable_factory->Name());
Log(log," Options.table_factory: %s", table_factory->Name());
Log(log," Options.error_if_exists: %d", error_if_exists); Log(log," Options.error_if_exists: %d", error_if_exists);
Log(log," Options.create_if_missing: %d", create_if_missing); Log(log," Options.create_if_missing: %d", create_if_missing);
Log(log," Options.paranoid_checks: %d", paranoid_checks); Log(log," Options.paranoid_checks: %d", paranoid_checks);
Log(log," Options.env: %p", env); Log(log," Options.env: %p", env);
Log(log," Options.info_log: %p", info_log.get()); Log(log," Options.info_log: %p", info_log.get());
Log(log," Options.write_buffer_size: %zd", write_buffer_size);
Log(log," Options.max_write_buffer_number: %d", max_write_buffer_number);
Log(log," Options.max_open_files: %d", max_open_files); Log(log," Options.max_open_files: %d", max_open_files);
Log(log, " Options.disableDataSync: %d", disableDataSync);
Log(log, " Options.use_fsync: %d", use_fsync);
Log(log, " Options.max_log_file_size: %zu", max_log_file_size);
Log(log, "Options.max_manifest_file_size: %lu",
(unsigned long)max_manifest_file_size);
Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num);
Log(log, " Options.db_stats_log_interval: %d", db_stats_log_interval);
Log(log, " Options.allow_os_buffer: %d", allow_os_buffer);
Log(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Log(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
Log(log, " Options.db_log_dir: %s",
db_log_dir.c_str());
Log(log, " Options.wal_dir: %s",
wal_dir.c_str());
Log(log, " Options.table_cache_numshardbits: %d",
table_cache_numshardbits);
Log(log, " Options.table_cache_remove_scan_count_limit: %d",
table_cache_remove_scan_count_limit);
Log(log, " Options.delete_obsolete_files_period_micros: %lu",
(unsigned long)delete_obsolete_files_period_micros);
Log(log, " Options.max_background_compactions: %d",
max_background_compactions);
Log(log, " Options.max_background_flushes: %d",
max_background_flushes);
Log(log, " Options.WAL_ttl_seconds: %lu",
(unsigned long)WAL_ttl_seconds);
Log(log, " Options.WAL_size_limit_MB: %lu",
(unsigned long)WAL_size_limit_MB);
Log(log, " Options.manifest_preallocation_size: %zu",
manifest_preallocation_size);
Log(log, " Options.allow_os_buffer: %d",
allow_os_buffer);
Log(log, " Options.allow_mmap_reads: %d",
allow_mmap_reads);
Log(log, " Options.allow_mmap_writes: %d",
allow_mmap_writes);
Log(log, " Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec);
Log(log, " Options.skip_log_error_on_recovery: %d",
skip_log_error_on_recovery);
Log(log, " Options.stats_dump_period_sec: %u",
stats_dump_period_sec);
Log(log, " Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log, " Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
Log(log, " Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
Log(log, " Options.bytes_per_sync: %lu",
(unsigned long)bytes_per_sync);
} // DBOptions::Dump
void ColumnFamilyOptions::Dump(Logger* log) const {
Log(log, " Options.comparator: %s", comparator->Name());
Log(log, " Options.merge_operator: %s",
merge_operator ? merge_operator->Name() : "None");
Log(log, " Options.compaction_filter_factory: %s",
compaction_filter_factory->Name());
Log(log, " Options.compaction_filter_factory_v2: %s",
compaction_filter_factory_v2->Name());
Log(log, " Options.memtable_factory: %s", memtable_factory->Name());
Log(log, " Options.table_factory: %s", table_factory->Name());
Log(log, " Options.write_buffer_size: %zd", write_buffer_size);
Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
Log(log," Options.block_cache: %p", block_cache.get()); Log(log," Options.block_cache: %p", block_cache.get());
Log(log," Options.block_cache_compressed: %p", Log(log," Options.block_cache_compressed: %p",
block_cache_compressed.get()); block_cache_compressed.get());
@ -173,18 +331,6 @@ Options::Dump(Logger* log) const
prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
Log(log," Options.whole_key_filtering: %d", whole_key_filtering); Log(log," Options.whole_key_filtering: %d", whole_key_filtering);
Log(log," Options.num_levels: %d", num_levels); Log(log," Options.num_levels: %d", num_levels);
Log(log," Options.disableDataSync: %d", disableDataSync);
Log(log," Options.use_fsync: %d", use_fsync);
Log(log," Options.max_log_file_size: %zu", max_log_file_size);
Log(log,"Options.max_manifest_file_size: %lu",
(unsigned long)max_manifest_file_size);
Log(log," Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
Log(log," Options.keep_log_file_num: %zu", keep_log_file_num);
Log(log," Options.db_stats_log_interval: %d",
db_stats_log_interval);
Log(log," Options.allow_os_buffer: %d", allow_os_buffer);
Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads);
Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes);
Log(log," Options.min_write_buffer_number_to_merge: %d", Log(log," Options.min_write_buffer_number_to_merge: %d",
min_write_buffer_number_to_merge); min_write_buffer_number_to_merge);
Log(log," Options.purge_redundant_kvs_while_flush: %d", Log(log," Options.purge_redundant_kvs_while_flush: %d",
@ -223,26 +369,12 @@ Options::Dump(Logger* log) const
source_compaction_factor); source_compaction_factor);
Log(log," Options.max_grandparent_overlap_factor: %d", Log(log," Options.max_grandparent_overlap_factor: %d",
max_grandparent_overlap_factor); max_grandparent_overlap_factor);
Log(log," Options.db_log_dir: %s",
db_log_dir.c_str());
Log(log," Options.wal_dir: %s",
wal_dir.c_str());
Log(log," Options.disable_seek_compaction: %d", Log(log," Options.disable_seek_compaction: %d",
disable_seek_compaction); disable_seek_compaction);
Log(log," Options.no_block_cache: %d", Log(log," Options.no_block_cache: %d",
no_block_cache); no_block_cache);
Log(log," Options.table_cache_numshardbits: %d",
table_cache_numshardbits);
Log(log," Options.table_cache_remove_scan_count_limit: %d",
table_cache_remove_scan_count_limit);
Log(log," Options.arena_block_size: %zu", Log(log," Options.arena_block_size: %zu",
arena_block_size); arena_block_size);
Log(log," Options.delete_obsolete_files_period_micros: %lu",
(unsigned long)delete_obsolete_files_period_micros);
Log(log," Options.max_background_compactions: %d",
max_background_compactions);
Log(log," Options.max_background_flushes: %d",
max_background_flushes);
Log(log," Options.soft_rate_limit: %.2f", Log(log," Options.soft_rate_limit: %.2f",
soft_rate_limit); soft_rate_limit);
Log(log," Options.hard_rate_limit: %.2f", Log(log," Options.hard_rate_limit: %.2f",
@ -251,36 +383,10 @@ Options::Dump(Logger* log) const
rate_limit_delay_max_milliseconds); rate_limit_delay_max_milliseconds);
Log(log," Options.disable_auto_compactions: %d", Log(log," Options.disable_auto_compactions: %d",
disable_auto_compactions); disable_auto_compactions);
Log(log," Options.WAL_ttl_seconds: %lu",
(unsigned long)WAL_ttl_seconds);
Log(log," Options.WAL_size_limit_MB: %lu",
(unsigned long)WAL_size_limit_MB);
Log(log," Options.manifest_preallocation_size: %zu",
manifest_preallocation_size);
Log(log," Options.purge_redundant_kvs_while_flush: %d", Log(log," Options.purge_redundant_kvs_while_flush: %d",
purge_redundant_kvs_while_flush); purge_redundant_kvs_while_flush);
Log(log," Options.allow_os_buffer: %d",
allow_os_buffer);
Log(log," Options.allow_mmap_reads: %d",
allow_mmap_reads);
Log(log," Options.allow_mmap_writes: %d",
allow_mmap_writes);
Log(log," Options.is_fd_close_on_exec: %d",
is_fd_close_on_exec);
Log(log," Options.skip_log_error_on_recovery: %d",
skip_log_error_on_recovery);
Log(log," Options.stats_dump_period_sec: %u",
stats_dump_period_sec);
Log(log," Options.block_size_deviation: %d", Log(log," Options.block_size_deviation: %d",
block_size_deviation); block_size_deviation);
Log(log," Options.advise_random_on_open: %d",
advise_random_on_open);
Log(log," Options.access_hint_on_compaction_start: %s",
access_hints[access_hint_on_compaction_start]);
Log(log," Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
Log(log," Options.bytes_per_sync: %lu",
(unsigned long)bytes_per_sync);
Log(log," Options.filter_deletes: %d", Log(log," Options.filter_deletes: %d",
filter_deletes); filter_deletes);
Log(log, " Options.verify_checksums_in_compaction: %d", Log(log, " Options.verify_checksums_in_compaction: %d",
@ -317,8 +423,15 @@ Options::Dump(Logger* log) const
memtable_prefix_bloom_bits); memtable_prefix_bloom_bits);
Log(log, " Options.memtable_prefix_bloom_probes: %d", Log(log, " Options.memtable_prefix_bloom_probes: %d",
memtable_prefix_bloom_probes); memtable_prefix_bloom_probes);
Log(log, " Options.bloom_locality: %d",
bloom_locality);
Log(log, " Options.max_successive_merges: %zd", Log(log, " Options.max_successive_merges: %zd",
max_successive_merges); max_successive_merges);
} // ColumnFamilyOptions::Dump
void Options::Dump(Logger* log) const {
DBOptions::Dump(log);
ColumnFamilyOptions::Dump(log);
} // Options::Dump } // Options::Dump
// //

@ -9,12 +9,21 @@
namespace rocksdb { namespace rocksdb {
// by default, enable counts only #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
PerfLevel perf_level = kEnableCount; PerfLevel perf_level = kEnableCount;
// This is a dummy variable since some place references it
PerfContext perf_context;
#else
__thread PerfLevel perf_level = kEnableCount;
__thread PerfContext perf_context;
#endif
void SetPerfLevel(PerfLevel level) { perf_level = level; } void SetPerfLevel(PerfLevel level) {
perf_level = level;
}
void PerfContext::Reset() { void PerfContext::Reset() {
#if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
user_key_comparison_count = 0; user_key_comparison_count = 0;
block_cache_hit_count = 0; block_cache_hit_count = 0;
block_read_count = 0; block_read_count = 0;
@ -38,11 +47,15 @@ void PerfContext::Reset() {
find_next_user_entry_time = 0; find_next_user_entry_time = 0;
write_pre_and_post_process_time = 0; write_pre_and_post_process_time = 0;
write_memtable_time = 0; write_memtable_time = 0;
#endif
} }
#define OUTPUT(counter) #counter << " = " << counter << ", " #define OUTPUT(counter) #counter << " = " << counter << ", "
std::string PerfContext::ToString() const { std::string PerfContext::ToString() const {
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
return "";
#else
std::ostringstream ss; std::ostringstream ss;
ss << OUTPUT(user_key_comparison_count) ss << OUTPUT(user_key_comparison_count)
<< OUTPUT(block_cache_hit_count) << OUTPUT(block_cache_hit_count)
@ -67,8 +80,7 @@ std::string PerfContext::ToString() const {
<< OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_pre_and_post_process_time)
<< OUTPUT(write_memtable_time); << OUTPUT(write_memtable_time);
return ss.str(); return ss.str();
#endif
} }
__thread PerfContext perf_context;
} }

@ -9,26 +9,80 @@
namespace rocksdb { namespace rocksdb {
extern enum PerfLevel perf_level; #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
inline void StartPerfTimer(StopWatchNano* timer) { #define PERF_TIMER_DECLARE()
if (perf_level >= PerfLevel::kEnableTime) { #define PERF_TIMER_START(metric)
timer->Start(); #define PERF_TIMER_AUTO(metric)
#define PERF_TIMER_MEASURE(metric)
#define PERF_TIMER_STOP(metric)
#define PERF_COUNTER_ADD(metric, value)
#else
extern __thread PerfLevel perf_level;
class PerfStepTimer {
public:
PerfStepTimer()
: enabled_(perf_level >= PerfLevel::kEnableTime),
env_(enabled_ ? Env::Default() : nullptr),
start_(0) {
} }
}
inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { void Start() {
if (perf_level >= PerfLevel::kEnableCount) { if (enabled_) {
*count += delta; start_ = env_->NowNanos();
}
} }
}
inline void BumpPerfTime(uint64_t* time, void Measure(uint64_t* metric) {
StopWatchNano* timer, if (start_) {
bool reset = true) { uint64_t now = env_->NowNanos();
if (perf_level >= PerfLevel::kEnableTime) { *metric += now - start_;
*time += timer->ElapsedNanos(reset); start_ = now;
}
} }
}
void Stop(uint64_t* metric) {
if (start_) {
*metric += env_->NowNanos() - start_;
start_ = 0;
}
}
private:
const bool enabled_;
Env* const env_;
uint64_t start_;
};
// Declare the local timer object to be used later on
#define PERF_TIMER_DECLARE() \
PerfStepTimer perf_step_timer;
// Set start time of the timer
#define PERF_TIMER_START(metric) \
perf_step_timer.Start();
// Declare and set start time of the timer
#define PERF_TIMER_AUTO(metric) \
PerfStepTimer perf_step_timer; \
perf_step_timer.Start();
// Update metric with time elapsed since last START. start time is reset
// to current timestamp.
#define PERF_TIMER_MEASURE(metric) \
perf_step_timer.Measure(&(perf_context.metric));
// Update metric with time elapsed since last START. But start time is not set.
#define PERF_TIMER_STOP(metric) \
perf_step_timer.Stop(&(perf_context.metric));
// Increase metric value
#define PERF_COUNTER_ADD(metric, value) \
perf_context.metric += value;
#endif
} }

@ -13,13 +13,13 @@ class SkipListRep : public MemTableRep {
SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_; SkipList<const char*, const MemTableRep::KeyComparator&> skip_list_;
public: public:
explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena)
: skip_list_(compare, arena) { : MemTableRep(arena), skip_list_(compare, arena) {
} }
// Insert key into the list. // Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list. // REQUIRES: nothing that compares equal to key is currently in the list.
virtual void Insert(const char* key) override { virtual void Insert(KeyHandle handle) override {
skip_list_.Insert(key); skip_list_.Insert(static_cast<char*>(handle));
} }
// Returns true iff an entry that compares equal to key is in the list. // Returns true iff an entry that compares equal to key is in the list.

@ -0,0 +1,62 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "util/sync_point.h"
namespace rocksdb {
// Returns the process-wide SyncPoint singleton. The function-local static is
// constructed on first use and initialization is thread-safe under C++11.
SyncPoint* SyncPoint::GetInstance() {
  static SyncPoint instance;
  return &instance;
}
void SyncPoint::LoadDependency(const std::vector<Dependency>& dependencies) {
successors_.clear();
predecessors_.clear();
cleared_points_.clear();
for (const auto& dependency : dependencies) {
successors_[dependency.predecessor].push_back(dependency.successor);
predecessors_[dependency.successor].push_back(dependency.predecessor);
}
}
// Returns true iff every registered predecessor of 'point' has already been
// executed, i.e. appears in cleared_points_. A point with no registered
// predecessors trivially returns true.
// REQUIRES: mutex_ held by the caller (Process() invokes this under lock).
// Note: operator[] inserts an empty vector for unknown points, which is
// harmless since it behaves like "no predecessors".
bool SyncPoint::PredecessorsAllCleared(const std::string& point) {
for (const auto& pred : predecessors_[point]) {
if (cleared_points_.count(pred) == 0) {
return false;
}
}
return true;
}
// Turns sync-point processing on; until this is called, Process() is a no-op.
void SyncPoint::EnableProcessing() {
  std::lock_guard<std::mutex> guard(mutex_);
  enabled_ = true;
}
// Turns sync-point processing back off; subsequent Process() calls return
// immediately without blocking.
void SyncPoint::DisableProcessing() {
  std::lock_guard<std::mutex> guard(mutex_);
  enabled_ = false;
}
// Forgets which sync points have been passed through, so a test can rerun
// the same dependency graph from a clean slate.
void SyncPoint::ClearTrace() {
  std::lock_guard<std::mutex> guard(mutex_);
  cleared_points_.clear();
}
// Entry point invoked by TEST_SYNC_POINT. Blocks the calling thread until
// every predecessor of 'point' (per LoadDependency) has been executed, then
// records 'point' as executed and wakes all other waiters.
void SyncPoint::Process(const std::string& point) {
std::unique_lock<std::mutex> lock(mutex_);
// Fast path: sync points are disabled by default, so release builds of the
// tests and unrelated code pay only the lock acquisition.
if (!enabled_) return;
// Standard condition-variable wait loop: the lock is released while
// waiting and re-acquired before the predicate is re-checked.
while (!PredecessorsAllCleared(point)) {
cv_.wait(lock);
}
// Mark this point executed and wake every waiter so each can re-evaluate
// its own predecessor set.
cleared_points_.insert(point);
cv_.notify_all();
}
} // namespace rocksdb

@ -0,0 +1,79 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_set>
#include <unordered_map>
#include <vector>
namespace rocksdb {
// This class provides facility to reproduce race conditions deterministically
// in unit tests.
// Developer could specify sync points in the codebase via TEST_SYNC_POINT.
// Each sync point represents a position in the execution stream of a thread.
// In the unit test, 'Happens After' relationship among sync points could be
// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of
// threads execution.
// Refer to (DBTest,TransactionLogIteratorRace), for an exmaple use case.
// Singleton registry of sync points and their happens-after dependencies.
// All public methods are thread-safe; see sync_point.cc for details.
class SyncPoint {
public:
// Returns the process-wide singleton instance.
static SyncPoint* GetInstance();
// A single happens-after edge: 'successor' may not proceed past its sync
// point until 'predecessor' has been executed.
struct Dependency {
std::string predecessor;
std::string successor;
};
// call once at the beginning of a test to setup the dependency between
// sync points
void LoadDependency(const std::vector<Dependency>& dependencies);
// enable sync point processing (disabled on startup)
void EnableProcessing();
// disable sync point processing
void DisableProcessing();
// remove the execution trace of all sync points
void ClearTrace();
// triggered by TEST_SYNC_POINT, blocking execution until all predecessors
// are executed.
void Process(const std::string& point);
// TODO: it might be useful to provide a function that blocks until all
// sync points are cleared.
private:
// Returns true iff all predecessors of 'point' have executed.
// REQUIRES: mutex_ held.
bool PredecessorsAllCleared(const std::string& point);
// successor/predecessor map loaded from LoadDependency
std::unordered_map<std::string, std::vector<std::string>> successors_;
std::unordered_map<std::string, std::vector<std::string>> predecessors_;
// mutex_ guards all state below and above; cv_ is signalled whenever a
// point is cleared so blocked threads can re-check their predecessors.
std::mutex mutex_;
std::condition_variable cv_;
// sync points that have been passed through
std::unordered_set<std::string> cleared_points_;
// processing is off until EnableProcessing() is called
bool enabled_ = false;
};
} // namespace rocksdb
// Use TEST_SYNC_POINT to specify sync points inside code base.
// Sync points can have happens-after depedency on other sync points,
// configured at runtime via SyncPoint::LoadDependency. This could be
// utilized to re-produce race conditions between threads.
// See TransactionLogIteratorRace in db_test.cc for an example use case.
// TEST_SYNC_POINT is no op in release build.
#ifdef NDEBUG
#define TEST_SYNC_POINT(x)
#else
#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x)
#endif

@ -16,6 +16,7 @@
#include "util/autovector.h" #include "util/autovector.h"
#include "port/port_posix.h" #include "port/port_posix.h"
#include "util/thread_local.h"
namespace rocksdb { namespace rocksdb {

@ -30,7 +30,7 @@ class VectorRep : public MemTableRep {
// single buffer and pass that in as the parameter to Insert) // single buffer and pass that in as the parameter to Insert)
// REQUIRES: nothing that compares equal to key is currently in the // REQUIRES: nothing that compares equal to key is currently in the
// collection. // collection.
virtual void Insert(const char* key) override; virtual void Insert(KeyHandle handle) override;
// Returns true iff an entry that compares equal to key is in the collection. // Returns true iff an entry that compares equal to key is in the collection.
virtual bool Contains(const char* key) const override; virtual bool Contains(const char* key) const override;
@ -106,7 +106,8 @@ class VectorRep : public MemTableRep {
const KeyComparator& compare_; const KeyComparator& compare_;
}; };
void VectorRep::Insert(const char* key) { void VectorRep::Insert(KeyHandle handle) {
auto* key = static_cast<char*>(handle);
assert(!Contains(key)); assert(!Contains(key));
WriteLock l(&rwlock_); WriteLock l(&rwlock_);
assert(!immutable_); assert(!immutable_);
@ -134,7 +135,8 @@ size_t VectorRep::ApproximateMemoryUsage() {
} }
VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count)
: bucket_(new Bucket()), : MemTableRep(arena),
bucket_(new Bucket()),
immutable_(false), immutable_(false),
sorted_(false), sorted_(false),
compare_(compare) { bucket_.get()->reserve(count); } compare_(compare) { bucket_.get()->reserve(count); }

@ -44,7 +44,9 @@ class DummyDB : public StackableDB {
return options_.env; return options_.env;
} }
virtual const Options& GetOptions() const override { using DB::GetOptions;
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const
override {
return options_; return options_;
} }
@ -68,6 +70,10 @@ class DummyDB : public StackableDB {
return Status::OK(); return Status::OK();
} }
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
return nullptr;
}
class DummyLogFile : public LogFile { class DummyLogFile : public LogFile {
public: public:
/* implicit */ /* implicit */
@ -345,7 +351,7 @@ class BackupableDBTest {
options_.wal_dir = dbname_; options_.wal_dir = dbname_;
// set up backup db options // set up backup db options
CreateLoggerFromOptions(dbname_, backupdir_, env_, CreateLoggerFromOptions(dbname_, backupdir_, env_,
Options(), &logger_); DBOptions(), &logger_);
backupable_options_.reset(new BackupableDBOptions( backupable_options_.reset(new BackupableDBOptions(
backupdir_, test_backup_env_.get(), true, logger_.get(), true)); backupdir_, test_backup_env_.get(), true, logger_.get(), true));
@ -425,6 +431,19 @@ class BackupableDBTest {
} }
} }
void DeleteLogFiles() {
std::vector<std::string> delete_logs;
env_->GetChildren(dbname_, &delete_logs);
for (auto f : delete_logs) {
uint64_t number;
FileType type;
bool ok = ParseFileName(f, &number, &type);
if (ok && type == kLogFile) {
env_->DeleteFile(dbname_ + "/" + f);
}
}
}
// files // files
std::string dbname_; std::string dbname_;
std::string backupdir_; std::string backupdir_;
@ -721,10 +740,11 @@ TEST(BackupableDBTest, FailOverwritingBackups) {
// create backups 1, 2, 3, 4, 5 // create backups 1, 2, 3, 4, 5
OpenBackupableDB(true); OpenBackupableDB(true);
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
FillDB(db_.get(), 100 * i, 100 * (i + 1));
ASSERT_OK(db_->CreateNewBackup(true));
CloseBackupableDB(); CloseBackupableDB();
DeleteLogFiles();
OpenBackupableDB(false); OpenBackupableDB(false);
FillDB(db_.get(), 100 * i, 100 * (i + 1));
ASSERT_OK(db_->CreateNewBackup(true));
} }
CloseBackupableDB(); CloseBackupableDB();
@ -826,7 +846,7 @@ TEST(BackupableDBTest, RateLimiting) {
auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
backupable_options_->backup_rate_limit; backupable_options_->backup_rate_limit;
ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time);
ASSERT_LT(backup_time, 1.5 * rate_limited_backup_time); ASSERT_LT(backup_time, 2.5 * rate_limited_backup_time);
CloseBackupableDB(); CloseBackupableDB();
@ -838,7 +858,7 @@ TEST(BackupableDBTest, RateLimiting) {
auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
backupable_options_->restore_rate_limit; backupable_options_->restore_rate_limit;
ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time);
ASSERT_LT(restore_time, 1.5 * rate_limited_restore_time); ASSERT_LT(restore_time, 2.5 * rate_limited_restore_time);
AssertBackupConsistency(0, 0, 100000, 100010); AssertBackupConsistency(0, 0, 100000, 100010);
} }

@ -35,7 +35,7 @@ class GeoDBTest {
} }
}; };
const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault/"; const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
Options GeoDBTest::options = Options(); Options GeoDBTest::options = Options();
// Insert, Get and Remove // Insert, Get and Remove
@ -106,14 +106,14 @@ TEST(GeoDBTest, Search) {
std::vector<GeoObject> values; std::vector<GeoObject> values;
status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values); status = getdb()->SearchRadial(GeoPosition(46, 46), 200000, &values);
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
ASSERT_EQ(values.size(), 1); ASSERT_EQ(values.size(), 1U);
// search all objects centered at 46 degree latitude with // search all objects centered at 46 degree latitude with
// a radius of 2 kilometers. There should be none. // a radius of 2 kilometers. There should be none.
values.clear(); values.clear();
status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values); status = getdb()->SearchRadial(GeoPosition(46, 46), 2, &values);
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
ASSERT_EQ(values.size(), 0); ASSERT_EQ(values.size(), 0U);
} }
} // namespace rocksdb } // namespace rocksdb

@ -119,15 +119,16 @@ Status DBWithTTL::StripTS(std::string* str) {
return st; return st;
} }
Status DBWithTTL::Put(const WriteOptions& opt, const Slice& key, Status DBWithTTL::Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) { const Slice& val) {
WriteBatch batch; WriteBatch batch;
batch.Put(key, val); batch.Put(key, val);
return Write(opt, &batch); return Write(options, &batch);
} }
Status DBWithTTL::Get(const ReadOptions& options, Status DBWithTTL::Get(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) { std::string* value) {
Status st = db_->Get(options, key, value); Status st = db_->Get(options, key, value);
if (!st.ok()) { if (!st.ok()) {
@ -140,18 +141,18 @@ Status DBWithTTL::Get(const ReadOptions& options,
return StripTS(value); return StripTS(value);
} }
std::vector<Status> DBWithTTL::MultiGet(const ReadOptions& options, std::vector<Status> DBWithTTL::MultiGet(
const std::vector<Slice>& keys, const ReadOptions& options,
std::vector<std::string>* values) { const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys, std::vector<std::string>* values) {
return std::vector<Status>(keys.size(), return std::vector<Status>(keys.size(),
Status::NotSupported("MultiGet not\ Status::NotSupported("MultiGet not\
supported with TTL")); supported with TTL"));
} }
bool DBWithTTL::KeyMayExist(const ReadOptions& options, bool DBWithTTL::KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value, bool* value_found) {
bool* value_found) {
bool ret = db_->KeyMayExist(options, key, value, value_found); bool ret = db_->KeyMayExist(options, key, value, value_found);
if (ret && value != nullptr && value_found != nullptr && *value_found) { if (ret && value != nullptr && value_found != nullptr && *value_found) {
if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) { if (!SanityCheckTimestamp(*value).ok() || !StripTS(value).ok()) {
@ -161,12 +162,12 @@ bool DBWithTTL::KeyMayExist(const ReadOptions& options,
return ret; return ret;
} }
Status DBWithTTL::Merge(const WriteOptions& opt, Status DBWithTTL::Merge(const WriteOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) { const Slice& value) {
WriteBatch batch; WriteBatch batch;
batch.Merge(key, value); batch.Merge(key, value);
return Write(opt, &batch); return Write(options, &batch);
} }
Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) { Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
@ -208,12 +209,9 @@ Status DBWithTTL::Write(const WriteOptions& opts, WriteBatch* updates) {
} }
} }
Iterator* DBWithTTL::NewIterator(const ReadOptions& opts) { Iterator* DBWithTTL::NewIterator(const ReadOptions& opts,
return new TtlIterator(db_->NewIterator(opts)); ColumnFamilyHandle* column_family) {
} return new TtlIterator(db_->NewIterator(opts, column_family));
void DBWithTTL::TEST_Destroy_DBWithTtl() {
((DBImpl*) db_)->TEST_Destroy_DBImpl();
} }
} // namespace rocksdb } // namespace rocksdb

@ -23,30 +23,39 @@ class DBWithTTL : public StackableDB {
virtual ~DBWithTTL(); virtual ~DBWithTTL();
virtual Status Put(const WriteOptions& o, const Slice& key, using StackableDB::Put;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& val) override; const Slice& val) override;
virtual Status Get(const ReadOptions& options, const Slice& key, using StackableDB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) override; std::string* value) override;
using StackableDB::MultiGet;
virtual std::vector<Status> MultiGet( virtual std::vector<Status> MultiGet(
const ReadOptions& options, const std::vector<Slice>& keys, const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_family,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override; std::vector<std::string>* values) override;
using StackableDB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options, virtual bool KeyMayExist(const ReadOptions& options,
const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value, std::string* value,
bool* value_found = nullptr) override; bool* value_found = nullptr) override;
virtual Status Merge(const WriteOptions& options, const Slice& key, using StackableDB::Merge;
virtual Status Merge(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override; const Slice& value) override;
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
virtual Iterator* NewIterator(const ReadOptions& opts) override; using StackableDB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& opts,
// Simulate a db crash, no elegant closing of database. ColumnFamilyHandle* column_family) override;
void TEST_Destroy_DBWithTtl();
virtual DB* GetBaseDB() { virtual DB* GetBaseDB() {
return db_; return db_;

Loading…
Cancel
Save