From ddb8039e3aa1fe99d4a17c21cc970831258d1bb8 Mon Sep 17 00:00:00 2001 From: Naveen Date: Mon, 18 Aug 2014 14:03:46 -0700 Subject: [PATCH 001/829] RocksDB static build Make file changes to download and build the dependencies .Load the shared library when RocksDB is initialized --- Makefile | 32 +++++++++++++++- java/org/rocksdb/NativeLibraryLoader.java | 46 +++++++++++++++++++++++ java/org/rocksdb/Options.java | 3 ++ java/org/rocksdb/RocksDB.java | 24 ++++++++++-- 4 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 java/org/rocksdb/NativeLibraryLoader.java diff --git a/Makefile b/Makefile index 1bd202bc9..fd7c8c7d1 100644 --- a/Makefile +++ b/Makefile @@ -175,7 +175,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg + dbg rocksdbjavastatic rocksdbjava all: $(LIBRARY) $(PROGRAMS) $(TESTS) @@ -480,6 +480,36 @@ ROCKSDBJNILIB = librocksdbjni.jnilib JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif + +rocksdbjavastatic: + #build zlib + curl -O http://zlib.net/zlib-1.2.8.tar.gz + tar xvzf zlib-1.2.8.tar.gz + cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make + cp zlib-1.2.8/libz.a . + rm -rf zlib-1.2.8.tar.gz zlib-1.2.8 + + #build bzip + curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz + tar xvzf bzip2-1.0.6.tar.gz + cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' + cp bzip2-1.0.6/libbz2.a . + rm -rf bzip2-1.0.6.tar.gz bzip2-1.0.6 + + #build snappy + curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz + tar xvzf snappy-1.1.1.tar.gz + cd snappy-1.1.1 && ./configure --with-pic --enable-static + cd snappy-1.1.1 && make + cp snappy-1.1.1/.libs/libsnappy.a . + rm -rf snappy-1.1.1 snappy-1.1.1.tar.gz + OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j + cd java;$(MAKE) java; + rm -f ./java/$(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a + cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) + + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 cd java;$(MAKE) java; diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java new file mode 100644 index 000000000..f49f54488 --- /dev/null +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -0,0 +1,46 @@ +package org.rocksdb; + +import java.io.*; + +public class NativeLibraryLoader +{ + + private static String sharedLibraryName = "librocksdbjni.so"; + private static String tempFilePrefix = "librocksdbjni"; + private static String tempFileSuffix = ".so"; + /** + * Private constructor - this class will never be instanced + */ + private NativeLibraryLoader() { + } + + public static void loadLibraryFromJar() throws IOException { + + File temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + temp.deleteOnExit(); + + if (!temp.exists()) { + throw new FileNotFoundException("File " + temp.getAbsolutePath() + " does not exist."); + } + + byte[] buffer = new byte[1024]; + int readBytes; + + InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName); + if (is == null) { + throw new FileNotFoundException(sharedLibraryName + " was not found inside JAR."); + } + + OutputStream os = new FileOutputStream(temp); + try { + while ((readBytes = is.read(buffer)) != -1) { + os.write(buffer, 0, readBytes); + } + } finally { + os.close(); + is.close(); + } + + System.load(temp.getAbsolutePath()); + } +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 95f994606..420bfebba 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -13,6 +13,9 @@ package org.rocksdb; * native resources will be released as part of the process. */ public class Options extends RocksObject { + static{ + RocksDB.loadLibrary(); + } static final long DEFAULT_CACHE_SIZE = 8 << 20; static final int DEFAULT_NUM_SHARD_BITS = -1; /** diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index c7b06cc6d..bd9a4f648 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -11,6 +11,7 @@ import java.util.HashMap; import java.io.Closeable; import java.io.IOException; import org.rocksdb.util.Environment; +import org.rocksdb.NativeLibraryLoader; /** * A RocksDB is a persistent ordered map from keys to values. It is safe for @@ -18,16 +19,24 @@ import org.rocksdb.util.Environment; * All methods of this class could potentially throw RocksDBException, which * indicates sth wrong at the rocksdb library side and the call failed. */ -public class RocksDB extends RocksObject { +public class RocksDB extends org.rocksdb.RocksObject +{ + public static final int NOT_FOUND = -1; private static final String[] compressionLibs_ = { "snappy", "z", "bzip2", "lz4", "lz4hc"}; + static { + loadLibrary(); + } + + /** * Loads the necessary library files. * Calling this method twice will have no effect. */ - public static synchronized void loadLibrary() { + public static synchronized void loadLibrary() + { // loading possibly necessary libraries. for (String lib : compressionLibs_) { try { @@ -36,8 +45,15 @@ public class RocksDB extends RocksObject { // since it may be optional, we ignore its loading failure here. } } - // However, if any of them is required. We will see error here. 
- System.loadLibrary("rocksdbjni"); + + try + { + NativeLibraryLoader.loadLibraryFromJar(); + } + catch (IOException e) + { + e.printStackTrace(); + } } /** From 343e98a7d13391ab95dd0b64c1422da597926a96 Mon Sep 17 00:00:00 2001 From: Naveen Date: Mon, 18 Aug 2014 14:08:10 -0700 Subject: [PATCH 002/829] Reverting import change --- java/org/rocksdb/RocksDB.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index bd9a4f648..6825bf3c5 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -19,7 +19,7 @@ import org.rocksdb.NativeLibraryLoader; * All methods of this class could potentially throw RocksDBException, which * indicates sth wrong at the rocksdb library side and the call failed. */ -public class RocksDB extends org.rocksdb.RocksObject +public class RocksDB extends RocksObject { public static final int NOT_FOUND = -1; From f09329cb017a4d401b341880304a48aacc5d0841 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Sun, 31 Aug 2014 00:54:15 -0700 Subject: [PATCH 003/829] Fix candidate file comparison when using path ids --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 9cb09d719..16622758f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -631,7 +631,7 @@ bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, } else if (first.file_name < second.file_name) { return false; } else { - return (first.path_id > first.path_id); + return (first.path_id > second.path_id); } } }; // namespace From dff2b1a8f89211b97eea45cf6ce776b06388bc9d Mon Sep 17 00:00:00 2001 From: Wankai Zhang Date: Mon, 1 Sep 2014 22:54:10 +0800 Subject: [PATCH 004/829] typo improvement --- table/block_based_table_builder.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 37b6f86fc..ecb176a97 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -133,12 +133,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_.Add(*last_key_in_current_block, handle_encoding); } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { index_blocks->index_block_contents = index_block_builder_.Finish(); return Status::OK(); } - virtual size_t EstimatedSize() const { + virtual size_t EstimatedSize() const override { return index_block_builder_.CurrentSizeEstimate(); } @@ -175,14 +175,14 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const Comparator* comparator, const SliceTransform* hash_key_extractor) : IndexBuilder(comparator), - primary_index_builder(comparator), + primary_index_builder_(comparator), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { ++current_restart_index_; - primary_index_builder.AddIndexEntry(last_key_in_current_block, + primary_index_builder_.AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); } @@ -213,9 +213,9 @@ class HashIndexBuilder : public IndexBuilder { } } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { FlushPendingPrefix(); - primary_index_builder.Finish(index_blocks); + primary_index_builder_.Finish(index_blocks); 
index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( @@ -223,8 +223,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const { - return primary_index_builder.EstimatedSize() + prefix_block_.size() + + virtual size_t EstimatedSize() const override { + return primary_index_builder_.EstimatedSize() + prefix_block_.size() + prefix_meta_block_.size(); } @@ -237,7 +237,7 @@ class HashIndexBuilder : public IndexBuilder { PutVarint32(&prefix_meta_block_, pending_block_num_); } - ShortenedIndexBuilder primary_index_builder; + ShortenedIndexBuilder primary_index_builder_; const SliceTransform* hash_key_extractor_; // stores a sequence of prefixes From 7dcadb1d37c91a2418d3df3f308ebd256ca4238b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 2 Sep 2014 08:34:54 -0700 Subject: [PATCH 005/829] Don't let flush preempt compaction in certain cases Summary: I have an application configured with 16 background threads. Write rates are high. L0->L1 compactions is very slow and it limits the concurrency of the system. While it's happening, other 15 threads are idle. However, when there is a need of a flush, that one thread busy with L0->L1 is doing flush, instead of any other 15 threads that are just sitting there. This diff prevents that. If there are threads that are idle, we don't let flush preempt compaction. Test Plan: Will run stress test Reviewers: ljin, sdong, yhchiang Reviewed By: sdong, yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D22299 --- db/db_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index 16622758f..9900ff2bb 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2581,6 +2581,10 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, DeletionState& deletion_state, LogBuffer* log_buffer) { + if (options_.max_background_flushes > 0) { + // flush thread will take care of this + return 0; + } if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { const uint64_t imm_start = env_->NowMicros(); mutex_.Lock(); From 990df99a610def4378be69465a407703b441b19b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 2 Sep 2014 10:50:15 -0700 Subject: [PATCH 006/829] Fix ios compile Summary: We need to set contbuild for this :) Test Plan: compiles Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22701 --- db/internal_stats.cc | 2 ++ table/block_builder.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 34eb99781..3142d13b3 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -257,9 +257,11 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, cfd_->imm()->current()->GetTotalNumEntries() + current->GetEstimatedActiveKeys(); return true; +#ifndef ROCKSDB_LITE case kIsFileDeletionEnabled: *value = db->IsFileDeletionsEnabled(); return true; +#endif default: return false; } diff --git a/table/block_builder.h b/table/block_builder.h index eb7c49f7d..a63e7c795 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -50,7 +50,7 @@ class BlockBuilder { private: const int block_restart_interval_; - const Comparator* comparator_; + const Comparator* comparator_ __attribute__((unused)); // only used in assert std::string buffer_; // Destination buffer 
std::vector restarts_; // Restart points From 076bd01a29fbb404bc6902c0ce67a009750a103d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 2 Sep 2014 11:49:38 -0700 Subject: [PATCH 007/829] Fix compile Summary: gcc on our dev boxes is not happy about __attribute__((unused)) Test Plan: compiles now Reviewers: sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22707 --- table/block_based_table_builder.cc | 4 ++-- table/block_builder.cc | 6 +----- table/block_builder.h | 3 +-- table/block_test.cc | 5 ++--- table/meta_blocks.cc | 8 ++------ table/table_test.cc | 5 ++--- 6 files changed, 10 insertions(+), 21 deletions(-) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 0e5ea0a69..03f1e199c 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -116,7 +116,7 @@ class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const Comparator* comparator) : IndexBuilder(comparator), - index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + index_block_builder_(1 /* block_restart_interval == 1 */) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -420,7 +420,7 @@ struct BlockBasedTableBuilder::Rep { table_options(table_opt), internal_comparator(icomparator), file(f), - data_block(table_options.block_restart_interval, &internal_comparator), + data_block(table_options.block_restart_interval), internal_prefix_transform(options.prefix_extractor.get()), index_builder(CreateIndexBuilder( table_options.index_type, &internal_comparator, diff --git a/table/block_builder.cc b/table/block_builder.cc index 5bac54ae7..f8627743a 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -41,10 +41,8 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, - const Comparator* comparator) +BlockBuilder::BlockBuilder(int block_restart_interval) : block_restart_interval_(block_restart_interval), - comparator_(comparator), restarts_(), counter_(0), finished_(false) { @@ -96,8 +94,6 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { Slice last_key_piece(last_key_); assert(!finished_); assert(counter_ <= block_restart_interval_); - assert(buffer_.empty() // No values yet? - || comparator_->Compare(key, last_key_piece) > 0); size_t shared = 0; if (counter_ < block_restart_interval_) { // See how much sharing to do with previous string diff --git a/table/block_builder.h b/table/block_builder.h index a63e7c795..3b5b2b444 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -22,7 +22,7 @@ class BlockBuilder { BlockBuilder(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete; - BlockBuilder(int block_restart_interval, const Comparator* comparator); + explicit BlockBuilder(int block_restart_interval); // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); @@ -50,7 +50,6 @@ class BlockBuilder { private: const int block_restart_interval_; - const Comparator* comparator_ __attribute__((unused)); // only used in assert std::string buffer_; // Destination buffer std::vector restarts_; // Restart points diff --git a/table/block_test.cc b/table/block_test.cc index da01d6def..b36787f8f 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -76,7 +76,7 @@ TEST(BlockTest, SimpleTest) { std::vector keys; std::vector values; - BlockBuilder builder(16, ic.get()); + BlockBuilder builder(16); int num_records = 100000; GenerateRandomKVs(&keys, &values, 0, num_records); @@ -132,8 +132,7 @@ BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, const std::vector &values, const int prefix_group_size = 1) { - builder->reset( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())); + builder->reset(new BlockBuilder(1 /* restart interval */)); // Add only half of the keys for (size_t i = 0; i < keys.size(); ++i) { diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index a95f4c119..d9d0ed6c9 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -16,9 +16,7 @@ namespace rocksdb { MetaIndexBuilder::MetaIndexBuilder() - : meta_index_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { @@ -35,9 +33,7 @@ Slice MetaIndexBuilder::Finish() { } PropertyBlockBuilder::PropertyBlockBuilder() - : properties_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : properties_block_(new BlockBuilder(1 /* restart interval */)) {} void PropertyBlockBuilder::Add(const std::string& name, const std::string& val) { diff --git a/table/table_test.cc b/table/table_test.cc index 929cdf832..500abf48f 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -244,8 +244,7 @@ class BlockConstructor: public Constructor { const KVMap& data) { delete block_; block_ = nullptr; - BlockBuilder builder(table_options.block_restart_interval, - &internal_comparator); + BlockBuilder builder(table_options.block_restart_interval); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -1054,7 +1053,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. - BlockBuilder block_builder(1, options.comparator); + BlockBuilder block_builder(1); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } From 6614a484183033978918b75fde2d8b589a704d8e Mon Sep 17 00:00:00 2001 From: Torrie Fischer Date: Fri, 22 Aug 2014 15:28:58 -0700 Subject: [PATCH 008/829] Refactor PerfStepTimer to stop on destruct This eliminates the need to remember to call PERF_TIMER_STOP when a section has been timed. This allows more useful design with the perf timers and enables possible return value optimizations. 
Simplistic example: class Foo { public: Foo(int v) : m_v(v); private: int m_v; } Foo makeFrobbedFoo(int *errno) { *errno = 0; return Foo(); } Foo bar(int *errno) { PERF_TIMER_GUARD(some_timer); return makeFrobbedFoo(errno); } int main(int argc, char[] argv) { Foo f; int errno; f = bar(&errno); if (errno) return -1; return 0; } After bar() is called, perf_context.some_timer would be incremented as if Stop(&perf_context.some_timer) was called at the end, and the compiler is still able to produce optimizations on the return value from makeFrobbedFoo() through to main(). --- db/db_impl.cc | 32 +++++++++++++----------------- db/db_iter.cc | 29 ++++++++++++++++----------- db/memtable.cc | 3 +-- table/format.cc | 14 +++++++------ table/merger.cc | 17 +++++++--------- util/perf_context_imp.h | 44 ++++++++++++++++++++--------------------- 6 files changed, 70 insertions(+), 69 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 7774b796e..32a7d23f2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3375,7 +3375,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, stats_, DB_GET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -3399,6 +3399,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, // merge_operands will contain the sequence of merges in the latter case. LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); + if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { // Done RecordTick(stats_, MEMTABLE_HIT); @@ -3406,20 +3407,19 @@ Status DBImpl::GetImpl(const ReadOptions& options, // Done RecordTick(stats_, MEMTABLE_HIT); } else { - PERF_TIMER_START(get_from_output_files_time); - + PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(options, lkey, value, &s, &merge_context, value_found); - PERF_TIMER_STOP(get_from_output_files_time); RecordTick(stats_, MEMTABLE_MISS); } - PERF_TIMER_START(get_post_process_time); + { + PERF_TIMER_GUARD(get_post_process_time); - ReturnAndCleanupSuperVersion(cfd, sv); + ReturnAndCleanupSuperVersion(cfd, sv); - RecordTick(stats_, NUMBER_KEYS_READ); - RecordTick(stats_, BYTES_READ, value->size()); - PERF_TIMER_STOP(get_post_process_time); + RecordTick(stats_, NUMBER_KEYS_READ); + RecordTick(stats_, BYTES_READ, value->size()); + } return s; } @@ -3429,7 +3429,7 @@ std::vector DBImpl::MultiGet( const std::vector& keys, std::vector* values) { StopWatch sw(env_, stats_, DB_MULTIGET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); SequenceNumber snapshot; @@ -3505,7 +3505,7 @@ std::vector DBImpl::MultiGet( } // Post processing (decrement reference counts and record statistics) - PERF_TIMER_START(get_post_process_time); + PERF_TIMER_GUARD(get_post_process_time); autovector superversions_to_delete; // TODO(icanadi) do we need lock here or just around Cleanup()? 
@@ -3878,7 +3878,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } - PERF_TIMER_AUTO(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_pre_and_post_process_time); Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; @@ -4011,7 +4011,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t log_size = 0; if (!options.disableWAL) { - PERF_TIMER_START(write_wal_time); + PERF_TIMER_GUARD(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); status = log_->AddRecord(log_entry); total_log_size_ += log_entry.size(); @@ -4029,10 +4029,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = log_->file()->Sync(); } } - PERF_TIMER_STOP(write_wal_time); } if (status.ok()) { - PERF_TIMER_START(write_memtable_time); + PERF_TIMER_GUARD(write_memtable_time); status = WriteBatchInternal::InsertInto( updates, column_family_memtables_.get(), false, 0, this, false); @@ -4044,8 +4043,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into the memtable would result in a state that some write ops might // have succeeded in memtable but Status reports error for all writes. - PERF_TIMER_STOP(write_memtable_time); - SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); } PERF_TIMER_START(write_pre_and_post_process_time); @@ -4079,7 +4076,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { RecordTick(stats_, WRITE_TIMEDOUT); } - PERF_TIMER_STOP(write_pre_and_post_process_time); return status; } diff --git a/db/db_iter.cc b/db/db_iter.cc index 370ffd8cb..599a56a99 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -194,9 +194,8 @@ void DBIter::Next() { // NOTE: In between, saved_key_ can point to a user key that has // a delete marker inline void DBIter::FindNextUserEntry(bool skipping) { - PERF_TIMER_AUTO(find_next_user_entry_time); + PERF_TIMER_GUARD(find_next_user_entry_time); FindNextUserEntryInternal(skipping); - PERF_TIMER_STOP(find_next_user_entry_time); } // Actual implementation of DBIter::FindNextUserEntry() @@ -557,9 +556,12 @@ void DBIter::Seek(const Slice& target) { saved_key_.Clear(); // now savved_key is used to store internal key. 
saved_key_.SetInternalKey(target, sequence_); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->Seek(saved_key_.GetKey()); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->Seek(saved_key_.GetKey()); + } + if (iter_->Valid()) { direction_ = kForward; ClearSavedValue(); @@ -577,9 +579,12 @@ void DBIter::SeekToFirst() { } direction_ = kForward; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToFirst(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToFirst(); + } + if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -595,9 +600,11 @@ void DBIter::SeekToLast() { } direction_ = kReverse; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToLast(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToLast(); + } PrevInternal(); } diff --git a/db/memtable.cc b/db/memtable.cc index 523998c30..e9e7051c7 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -422,7 +422,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, // Avoiding recording stats for speed. return false; } - PERF_TIMER_AUTO(get_from_memtable_time); + PERF_TIMER_GUARD(get_from_memtable_time); Slice user_key = key.user_key(); bool found_final_value = false; @@ -452,7 +452,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - PERF_TIMER_STOP(get_from_memtable_time); PERF_COUNTER_ADD(get_from_memtable_count, 1); return found_final_value; } diff --git a/table/format.cc b/table/format.cc index a642965d5..46105247f 100644 --- a/table/format.cc +++ b/table/format.cc @@ -211,10 +211,13 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, Slice* contents, /* result of reading */ char* buf) { size_t n = static_cast(handle.size()); + Status s; + + { + PERF_TIMER_GUARD(block_read_time); + s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); + } - PERF_TIMER_AUTO(block_read_time); - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); - PERF_TIMER_MEASURE(block_read_time); PERF_COUNTER_ADD(block_read_count, 1); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); @@ -228,6 +231,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, // Check the crc of the type and the block contents const char* data = contents->data(); // Pointer to where Read put the data if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); uint32_t value = DecodeFixed32(data + n + 1); uint32_t actual = 0; switch (footer.checksum()) { @@ -247,7 +251,6 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, if (!s.ok()) { return s; } - PERF_TIMER_STOP(block_checksum_time); } return s; } @@ -265,7 +268,7 @@ Status DecompressBlock(BlockContents* result, size_t block_size, result->cachable = false; result->heap_allocated = false; - PERF_TIMER_AUTO(block_decompress_time); + PERF_TIMER_GUARD(block_decompress_time); rocksdb::CompressionType compression_type = static_cast(data[n]); // If the caller has requested that the block not be uncompressed @@ -295,7 +298,6 @@ Status DecompressBlock(BlockContents* result, size_t block_size, } else { s = UncompressBlockContents(data, n, result); } - PERF_TIMER_STOP(block_decompress_time); return s; } diff --git 
a/table/merger.cc b/table/merger.cc index 611480cec..a53376ceb 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -116,12 +116,12 @@ class MergingIterator : public Iterator { // Invalidate the heap. use_heap_ = false; IteratorWrapper* first_child = nullptr; - PERF_TIMER_DECLARE(); for (auto& child : children_) { - PERF_TIMER_START(seek_child_seek_time); - child.Seek(target); - PERF_TIMER_STOP(seek_child_seek_time); + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.Seek(target); + } PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { @@ -134,24 +134,21 @@ class MergingIterator : public Iterator { } else { // We have more than one children with valid keys. Initialize // the heap and put the first child into the heap. - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); ClearHeaps(); minHeap_.push(first_child); - PERF_TIMER_STOP(seek_min_heap_time); } } if (use_heap_) { - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); minHeap_.push(&child); - PERF_TIMER_STOP(seek_min_heap_time); } } } if (use_heap_) { // If heap is valid, need to put the smallest key to curent_. - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); FindSmallest(); - PERF_TIMER_STOP(seek_min_heap_time); } else { // The heap is not valid, then the current_ iterator is the first // one, or null if there is no first child. diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index dc4ae95e5..e39790105 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -11,11 +11,10 @@ namespace rocksdb { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) -#define PERF_TIMER_DECLARE() -#define PERF_TIMER_START(metric) -#define PERF_TIMER_AUTO(metric) +#define PERF_TIMER_GUARD(metric) #define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) +#define PERF_TIMER_START(metric) #define PERF_COUNTER_ADD(metric, value) #else @@ -24,10 +23,15 @@ extern __thread PerfLevel perf_level; class PerfStepTimer { public: - PerfStepTimer() + PerfStepTimer(uint64_t* metric) : enabled_(perf_level >= PerfLevel::kEnableTime), env_(enabled_ ? Env::Default() : nullptr), - start_(0) { + start_(0), + metric_(metric) { + } + + ~PerfStepTimer() { + Stop(); } void Start() { @@ -36,17 +40,17 @@ class PerfStepTimer { } } - void Measure(uint64_t* metric) { + void Measure() { if (start_) { uint64_t now = env_->NowNanos(); - *metric += now - start_; + *metric_ += now - start_; start_ = now; } } - void Stop(uint64_t* metric) { + void Stop() { if (start_) { - *metric += env_->NowNanos() - start_; + *metric_ += env_->NowNanos() - start_; start_ = 0; } } @@ -55,29 +59,25 @@ class PerfStepTimer { const bool enabled_; Env* const env_; uint64_t start_; + uint64_t* metric_; }; -// Declare the local timer object to be used later on -#define PERF_TIMER_DECLARE() \ - PerfStepTimer perf_step_timer; +// Stop the timer and update the metric +#define PERF_TIMER_STOP(metric) \ + perf_step_timer_ ## metric.Stop(); -// Set start time of the timer #define PERF_TIMER_START(metric) \ - perf_step_timer.Start(); + perf_step_timer_ ## metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_AUTO(metric) \ - PerfStepTimer perf_step_timer; \ - perf_step_timer.Start(); +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ + perf_step_timer_ ## metric.Start(); // Update metric with time elapsed since last START. start time is reset // to current timestamp. 
#define PERF_TIMER_MEASURE(metric) \ - perf_step_timer.Measure(&(perf_context.metric)); - -// Update metric with time elapsed since last START. But start time is not set. -#define PERF_TIMER_STOP(metric) \ - perf_step_timer.Stop(&(perf_context.metric)); + perf_step_timer_ ## metric.Measure(); // Increase metric value #define PERF_COUNTER_ADD(metric, value) \ From 8438a19360861070a3d5a86d0162c6160495618f Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Tue, 2 Sep 2014 12:25:58 -0700 Subject: [PATCH 009/829] fix dropping column family bug Summary: 1. db/db_impl.cc:2324 (DBImpl::BackgroundCompaction) should not raise bg_error_ when column family is dropped during compaction. Test Plan: 1. db_stress Reviewers: ljin, yhchiang, dhruba, igor, sdong Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22653 --- db/db_impl.cc | 2 +- include/rocksdb/status.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 9900ff2bb..f41d1d05d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2321,7 +2321,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, if (status.ok()) { // Done - } else if (shutting_down_.Acquire_Load()) { + } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b20689a77..d13ff9d81 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -96,7 +96,7 @@ class Status { // Returns true iff the status indicates Incomplete bool IsIncomplete() const { return code() == kIncomplete; } - // Returns true iff the status indicates Incomplete + // Returns true iff the status indicates Shutdown In progress bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } bool IsTimedOut() const { return code() == kTimedOut; } From 8ed70fc20918c645962c660ad7d4a7381f63e6dc Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Tue, 2 Sep 2014 13:21:59 -0700 Subject: [PATCH 010/829] add assert to db Put in db_stress test Summary: 1. assert db->Put to be true in db_stress 2. begin column family with name "1". Test Plan: 1. ./db_stress Reviewers: ljin, yhchiang, dhruba, sdong, igor Reviewed By: sdong, igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22659 --- tools/db_stress.cc | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index cffcb1c47..e9955953d 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -31,6 +31,7 @@ int main() { #include #include #include +#include #include #include "db/db_impl.h" #include "db/version_set.h" @@ -759,7 +760,7 @@ class StressTest { ? 
NewBloomFilterPolicy(FLAGS_bloom_bits) : nullptr), db_(nullptr), - new_column_family_name_(0), + new_column_family_name_(1), num_times_reopened_(0) { if (FLAGS_destroy_db_initially) { std::vector files; @@ -1217,12 +1218,20 @@ class StressTest { Status s __attribute__((unused)); s = db_->DropColumnFamily(column_families_[cf]); delete column_families_[cf]; - assert(s.ok()); + if (!s.ok()) { + fprintf(stderr, "dropping column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, &column_families_[cf]); column_family_names_[cf] = new_name; thread->shared->ClearColumnFamily(cf); - assert(s.ok()); + if (!s.ok()) { + fprintf(stderr, "creating column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } thread->shared->UnlockColumnFamily(cf); } } @@ -1297,10 +1306,15 @@ class StressTest { } } thread->shared->Put(rand_column_family, rand_key, value_base); + Status s; if (FLAGS_use_merge) { - db_->Merge(write_opts, column_family, key, v); + s = db_->Merge(write_opts, column_family, key, v); } else { - db_->Put(write_opts, column_family, key, v); + s = db_->Put(write_opts, column_family, key, v); + } + if (!s.ok()) { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + std::terminate(); } thread->stats.AddBytesForWrites(1, sz); } else { @@ -1311,8 +1325,12 @@ class StressTest { // OPERATION delete if (!FLAGS_test_batches_snapshots) { thread->shared->Delete(rand_column_family, rand_key); - db_->Delete(write_opts, column_family, key); + Status s = db_->Delete(write_opts, column_family, key); thread->stats.AddDeletes(1); + if (!s.ok()) { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } else { MultiDelete(thread, write_opts, column_family, key); } From a84234a61bdc6d8853fd8940dec5b8dcfd4845c7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 2 Sep 2014 13:29:05 -0700 Subject: [PATCH 011/829] Ignore missing column families Summary: Before this diff, whenever we Write to non-existing column family, Write() would fail. This diff adds an option to not fail a Write() when WriteBatch points to non-existing column family. MongoDB said this would be useful for them, since they might have a transaction updating an index that was dropped by another thread. This way, they don't have to worry about checking if all indexes are alive on every write. They don't care if they lose writes to dropped index. 
Test Plan: added a small unit test Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22143 --- db/column_family_test.cc | 6 ++++++ db/db_impl.cc | 15 +++++++++---- db/write_batch.cc | 44 +++++++++++++++++++-------------------- db/write_batch_internal.h | 18 ++++++++-------- include/rocksdb/options.h | 12 ++++++++++- 5 files changed, 58 insertions(+), 37 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index ac3435593..b96e66829 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; + batch.Put(handles_[0], Slice("existing"), Slice("column-family")); batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); ASSERT_OK(db_->Write(WriteOptions(), &batch)); DropColumnFamilies({1}); + WriteOptions woptions_ignore_missing_cf; + woptions_ignore_missing_cf.ignore_missing_column_families = true; + batch.Put(handles_[0], Slice("still here"), Slice("column-family")); + ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch)); + ASSERT_EQ("column-family", Get(0, "still here")); Status s = db_->Write(WriteOptions(), &batch); ASSERT_TRUE(s.IsInvalidArgument()); Close(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 36c0da12f..c22aa5809 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1301,14 +1301,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, WriteBatch batch; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); continue; } WriteBatchInternal::SetContents(&batch, record); + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- we + // just ignore the update. That's why we set ignore missing column families + // to true status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), true, log_number); + &batch, column_family_memtables_.get(), + true /* ignore missing column families */, log_number); MaybeIgnoreError(&status); if (!status.ok()) { @@ -4066,7 +4072,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { PERF_TIMER_GUARD(write_memtable_time); status = WriteBatchInternal::InsertInto( - updates, column_family_memtables_.get(), false, 0, this, false); + updates, column_family_memtables_.get(), + options.ignore_missing_column_families, 0, this, false); // A non-OK status here indicates iteration failure (either in-memory // writebatch corruption (very bad), or the client specified invalid // column family). This will later on trigger bg_error_. 
diff --git a/db/write_batch.cc b/db/write_batch.cc index fdc0e2c6e..bfa5e3f6f 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -299,17 +299,17 @@ class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; ColumnFamilyMemTables* cf_mems_; - bool recovery_; + bool ignore_missing_column_families_; uint64_t log_number_; DBImpl* db_; const bool dont_filter_deletes_; MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, - bool recovery, uint64_t log_number, DB* db, - const bool dont_filter_deletes) + bool ignore_missing_column_families, uint64_t log_number, + DB* db, const bool dont_filter_deletes) : sequence_(sequence), cf_mems_(cf_mems), - recovery_(recovery), + ignore_missing_column_families_(ignore_missing_column_families), log_number_(log_number), db_(reinterpret_cast(db)), dont_filter_deletes_(dont_filter_deletes) { @@ -321,12 +321,18 @@ class MemTableInserter : public WriteBatch::Handler { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { bool found = cf_mems_->Seek(column_family_id); - if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { - // if in recovery envoronment: - // * If column family was not found, it might mean that the WAL write - // batch references to the column family that was dropped after the - // insert. We don't want to fail the whole write batch in that case -- we - // just ignore the update. + if (!found) { + if (ignore_missing_column_families_) { + *s = Status::OK(); + } else { + *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + } + return false; + } + if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) { + // This is true only in recovery environment (log_number_ is always 0 in + // non-recovery, regular write code-path) // * If log_number_ < cf_mems_->GetLogNumber(), this means that column // family already contains updates from this log. We can't apply updates // twice because of update-in-place or merge workloads -- ignore the @@ -334,18 +340,8 @@ class MemTableInserter : public WriteBatch::Handler { *s = Status::OK(); return false; } - if (!found) { - assert(!recovery_); - // If the column family was not found in non-recovery enviornment - // (client's write code-path), we have to fail the write and return - // the failure status to the client. 
- *s = Status::InvalidArgument( - "Invalid column family specified in write batch"); - return false; - } return true; } - virtual Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) { Status seek_status; @@ -503,10 +499,12 @@ class MemTableInserter : public WriteBatch::Handler { Status WriteBatchInternal::InsertInto(const WriteBatch* b, ColumnFamilyMemTables* memtables, - bool recovery, uint64_t log_number, - DB* db, const bool dont_filter_deletes) { + bool ignore_missing_column_families, + uint64_t log_number, DB* db, + const bool dont_filter_deletes) { MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, - recovery, log_number, db, dont_filter_deletes); + ignore_missing_column_families, log_number, db, + dont_filter_deletes); return b->Iterate(&inserter); } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 9a191f4cb..615a47f5e 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -106,18 +106,18 @@ class WriteBatchInternal { // Inserts batch entries into memtable // If dont_filter_deletes is false AND options.filter_deletes is true, // then --> Drops deletes in batch if db->KeyMayExist returns false - // If recovery == true, this means InsertInto is executed on a recovery - // code-path. WriteBatch referencing a dropped column family can be - // found on a recovery code-path and should be ignored (recovery should not - // fail). Additionally, the memtable will be updated only if + // If ignore_missing_column_families == true. WriteBatch referencing + // non-existing column family should be ignored. + // However, if ignore_missing_column_families == false, any WriteBatch + // referencing non-existing column family will return a InvalidArgument() + // failure. + // + // If log_number is non-zero, the memtable will be updated only if // memtables->GetLogNumber() >= log_number - // However, if recovery == false, any WriteBatch referencing - // non-existing column family will return a failure. Also, log_number is - // ignored in that case static Status InsertInto(const WriteBatch* batch, ColumnFamilyMemTables* memtables, - bool recovery = false, uint64_t log_number = 0, - DB* db = nullptr, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, const bool dont_filter_deletes = true); static void Append(WriteBatch* dst, const WriteBatch* src); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 3569409c4..0ca303344 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -959,7 +959,17 @@ struct WriteOptions { // Default: 0 uint64_t timeout_hint_us; - WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {} + // If true and if user is trying to write to column families that don't exist + // (they were dropped), ignore the write (don't return an error). If there + // are multiple writes in a WriteBatch, other writes will succeed. 
+ // Default: false + bool ignore_missing_column_families; + + WriteOptions() + : sync(false), + disableWAL(false), + timeout_hint_us(0), + ignore_missing_column_families(false) {} }; // Options that control flush operations From 9b58c73c7c48769ef6f2e3a1e17ab33b9e28d43d Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 2 Sep 2014 14:42:23 -0700 Subject: [PATCH 012/829] call SanitizeDBOptionsByCFOptions() in the right place Summary: It only covers Open() with default column family right now Test Plan: make release Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22467 --- db/db_impl.cc | 17 +++++++++-------- db/simple_table_db_test.cc | 2 +- include/rocksdb/table.h | 2 +- table/adaptive_table_factory.h | 2 +- table/block_based_table_factory.h | 2 +- table/cuckoo_table_factory.h | 2 +- table/plain_table_factory.h | 2 +- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c22aa5809..7c65e9a61 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -290,8 +290,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { return result; } +namespace { + Status SanitizeDBOptionsByCFOptions( - DBOptions* db_opts, + const DBOptions* db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { @@ -303,7 +305,6 @@ Status SanitizeDBOptionsByCFOptions( return Status::OK(); } -namespace { CompressionType GetCompressionFlush(const Options& options) { // Compressing memtable flushes might not help unless the sequential load // optimization is used for leveled compaction. Otherwise the CPU and @@ -4802,11 +4803,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); - if (!s.ok()) { - return s; - } - s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a reference to @@ -4819,6 +4816,10 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { + Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); + if (!s.ok()) { + return s; + } if (db_options.db_paths.size() > 1) { for (auto& cfd : column_families) { if (cfd.options.compaction_style != kCompactionStyleUniversal) { @@ -4844,7 +4845,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname); - Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); if (s.ok()) { for (auto db_path : impl->options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 3a5809774..e88485070 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -556,7 +556,7 @@ public: WritableFile* file, CompressionType compression_type) const; - virtual Status SanitizeDBOptions(DBOptions* db_opts) const override { + virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { return 
Status::OK(); } diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 3a47ed939..0f8b41074 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -331,7 +331,7 @@ class TableFactory { // // If the function cannot find a way to sanitize the input DB Options, // a non-ok Status will be returned. - virtual Status SanitizeDBOptions(DBOptions* db_opts) const = 0; + virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0; // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index 571e07498..f119d97b1 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -43,7 +43,7 @@ class AdaptiveTableFactory : public TableFactory { override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { + Status SanitizeDBOptions(const DBOptions* db_opts) const override { if (db_opts->allow_mmap_reads == false) { return Status::NotSupported( "AdaptiveTable with allow_mmap_reads == false is not supported."); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 90282bf9d..d7045346a 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -45,7 +45,7 @@ class BlockBasedTableFactory : public TableFactory { WritableFile* file, CompressionType compression_type) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { + Status SanitizeDBOptions(const DBOptions* db_opts) const override { return Status::OK(); } diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 06f657d22..5799a7f23 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -55,7 +55,7 @@ class CuckooTableFactory : public TableFactory { CompressionType compression_type) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { + Status SanitizeDBOptions(const DBOptions* db_opts) const override { return Status::OK(); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 31e20b016..d1cf0cae6 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -169,7 +169,7 @@ class PlainTableFactory : public TableFactory { static const char kValueTypeSeqId0 = 0xFF; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { + Status SanitizeDBOptions(const DBOptions* db_opts) const override { if (db_opts->allow_mmap_reads == false) { return Status::NotSupported( "PlainTable with allow_mmap_reads == false is not supported."); From 19cc588b778826415db15f6336a5619897a69a73 Mon Sep 17 00:00:00 2001 From: wankai Date: Thu, 4 Sep 2014 00:44:49 +0800 Subject: [PATCH 013/829] change to filter_block std::unique_ptr support RAII --- table/block_based_table_builder.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 5d0fc9988..ddfbe74a6 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -401,7 +401,7 @@ struct BlockBasedTableBuilder::Rep { TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. 
- FilterBlockBuilder* filter_block; + std::unique_ptr filter_block; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -461,7 +461,6 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( BlockBasedTableBuilder::~BlockBasedTableBuilder() { assert(rep_->closed); // Catch errors where caller forgot to call Finish() - delete rep_->filter_block; delete rep_; } From 703c3eacd93802060af8a1a825d2061aa4a0c7b3 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 3 Sep 2014 17:01:34 -0700 Subject: [PATCH 014/829] comments about the BlockBasedTableOptions migration in Options Summary: as title Test Plan: none Reviewers: sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22737 --- include/rocksdb/options.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 0ca303344..11d976fb2 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -409,10 +409,24 @@ struct ColumnFamilyOptions { std::shared_ptr memtable_factory; // This is a factory that provides TableFactory objects. - // Default: a factory that provides a default implementation of - // Table and TableBuilder. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. std::shared_ptr table_factory; + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + // This option allows user to to collect their own interested statistics of // the tables. // Default: empty vector -- no user-defined statistics collection will be From 1b1d9619ff708fa72b834c247592fc591f993330 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 3 Sep 2014 17:03:30 -0700 Subject: [PATCH 015/829] update HISTORY.md Summary: as title Test Plan: no Reviewers: sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22761 --- HISTORY.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 420377cbf..c6c566ede 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,10 @@ # Rocksdb Change Log ### Unreleased + +----- Past Releases ----- + +## 3.5.0 (9/3/2014) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -11,10 +15,6 @@ ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. 
- ------ Past Releases ----- - - ## 3.4.0 (8/18/2014) ### New Features * Support Multiple DB paths in universal style compactions From 1785114a6fc1d5f80ea849671a8ad8038f2a010c Mon Sep 17 00:00:00 2001 From: wankai Date: Thu, 4 Sep 2014 09:10:13 +0800 Subject: [PATCH 016/829] delete unused Comparator --- table/block_builder.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/table/block_builder.h b/table/block_builder.h index 3b5b2b444..c01a23bea 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -15,15 +15,13 @@ namespace rocksdb { -class Comparator; - class BlockBuilder { public: BlockBuilder(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete; - + explicit BlockBuilder(int block_restart_interval); - + // Reset the contents as if the BlockBuilder was just constructed. void Reset(); From ef5b384729824a70806117832d198928cf4a5374 Mon Sep 17 00:00:00 2001 From: liuhuahang Date: Thu, 4 Sep 2014 22:04:37 +0800 Subject: [PATCH 017/829] fix a few compile warnings 1, const qualifiers on return types make no sense and will trigger a compile warning: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 2, class HistogramImpl has virtual functions and thus should have a virtual destructor 3, with some toolchain, the macro __STDC_FORMAT_MACROS is predefined and thus should be checked before define Change-Id: I69747a03bfae88671bfbb2637c80d17600159c99 Signed-off-by: liuhuahang --- db/dbformat.h | 2 +- db/snapshot.h | 2 +- table/block_prefix_index.cc | 4 ++-- table/block_prefix_index.h | 2 +- util/histogram.cc | 2 +- util/histogram.h | 6 ++++-- utilities/spatialdb/spatial_db.cc | 3 +++ 7 files changed, 13 insertions(+), 8 deletions(-) diff --git a/db/dbformat.h b/db/dbformat.h index eb5d8ed53..516a4693b 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -244,7 +244,7 @@ class IterKey { Slice GetKey() const { return Slice(key_, key_size_); } - const size_t Size() { return key_size_; } + size_t Size() { return key_size_; } void Clear() { key_size_ = 0; } diff --git a/db/snapshot.h b/db/snapshot.h index 2c2e3eac8..51fa556c8 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -71,7 +71,7 @@ class SnapshotList { } // get the sequence number of the most recent snapshot - const SequenceNumber GetNewest() { + SequenceNumber GetNewest() { if (empty()) { return 0; } diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index f06dcd9fe..d64b73b98 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -210,8 +210,8 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, + uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index 2afecadd2..662bc09aa 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -23,7 +23,7 @@ class BlockPrefixIndex { // the key, based on the prefix. // Returns the total number of relevant blocks, 0 means the key does // not exist. 
- const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + diff --git a/util/histogram.cc b/util/histogram.cc index 968769cef..0dbfba7d6 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -53,7 +53,7 @@ HistogramBucketMapper::HistogramBucketMapper() } } -const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { +size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { if (value >= maxBucketValue_) { return bucketValues_.size() - 1; } else if ( value >= minBucketValue_ ) { diff --git a/util/histogram.h b/util/histogram.h index d95588dc2..af3a019d8 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -23,10 +23,10 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - const size_t IndexForValue(const uint64_t value) const; + size_t IndexForValue(const uint64_t value) const; // number of buckets required. - const size_t BucketCount() const { + size_t BucketCount() const { return bucketValues_.size(); } @@ -65,6 +65,8 @@ class HistogramImpl { virtual double StandardDeviation() const; virtual void Data(HistogramData * const data) const; + virtual ~HistogramImpl() {} + private: // To be able to use HistogramImpl as thread local variable, its constructor // has to be static. That's why we're using manually values from BucketMapper diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 8b9e49bd4..21a111d3e 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -7,7 +7,10 @@ #include "rocksdb/utilities/spatial_db.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include From a4816269f1b194282cd8a3cbc203c549fc66bceb Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 4 Sep 2014 10:22:28 -0700 Subject: [PATCH 018/829] Relax backupable rate limiting test --- utilities/backupable/backupable_db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1d876cd50..a585d1a9c 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -916,7 +916,7 @@ TEST(BackupableDBTest, RateLimiting) { auto backup_time = env_->NowMicros() - start_backup; auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; - ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); CloseBackupableDB(); @@ -927,7 +927,7 @@ TEST(BackupableDBTest, RateLimiting) { CloseRestoreDB(); auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; - ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } From 51ea8890023b2e7e77757a16c10f19b6e9f78d63 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 4 Sep 2014 10:23:45 -0700 Subject: [PATCH 019/829] Fix travis builds Summary: Lots of travis builds are failing because on EnvPosixTest.RandomAccessUniqueID: https://travis-ci.org/facebook/rocksdb/builds/34400833 This is the result of their environment and not because of RocksDB's bug. 
Also note that RocksDB works correctly even though UniqueID feature is not present in the system (as it's the case with os x) Test Plan: OPT=-DTRAVIS make env_test && ./env_test Observed that offending tests are not being run Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22803 --- .travis.yml | 3 +-- util/env_test.cc | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 66f37a5d2..bcb852cf0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. - - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform -script: make check -j8 +script: OPT=-DTRAVIS make check -j8 notifications: email: false diff --git a/util/env_test.cc b/util/env_test.cc index c0d00ce94..1c4d0bba0 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -392,6 +392,9 @@ TEST(EnvPosixTest, DecreaseNumBgThreads) { } #ifdef OS_LINUX +// Travis doesn't support fallocate or getting unique ID from files for whatever +// reason. +#ifndef TRAVIS // To make sure the Env::GetUniqueId() related tests work correctly, The files // should be stored in regular storage like "hard disk" or "flash device". // Otherwise we cannot get the correct id. @@ -507,7 +510,7 @@ TEST(EnvPosixTest, AllocateTest) { // verify that preallocated blocks were deallocated on file close ASSERT_GT(st_blocks, f_stat.st_blocks); } -#endif +#endif // ROCKSDB_FALLOCATE_PRESENT // Returns true if any of the strings in ss are the prefix of another string. bool HasPrefix(const std::unordered_set& ss) { @@ -638,7 +641,8 @@ TEST(EnvPosixTest, InvalidateCache) { // Delete the file ASSERT_OK(env_->DeleteFile(fname)); } -#endif +#endif // not TRAVIS +#endif // OS_LINUX TEST(EnvPosixTest, PosixRandomRWFileTest) { EnvOptions soptions; From e0b99d4f5db70513b94ab91c594afea272493568 Mon Sep 17 00:00:00 2001 From: Raghav Pisolkar Date: Thu, 4 Sep 2014 10:48:24 -0700 Subject: [PATCH 020/829] created a new ReadOptions parameter 'iterate_upper_bound' --- HISTORY.md | 1 + db/c.cc | 7 ++ db/db_impl.cc | 6 +- db/db_iter.cc | 113 ++++++++++++++++++++----------- db/db_iter.h | 5 +- db/db_test.cc | 138 ++++++++++++++++++++++++++++++++++++++ include/rocksdb/c.h | 4 ++ include/rocksdb/options.h | 14 ++++ 8 files changed, 243 insertions(+), 45 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c6c566ede..922d3e2c9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,7 @@ * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. +* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries. 
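To make the new read option above concrete, here is a small sketch distilled from the DBIteratorBoundTest added later in this patch; the database handle and key values are placeholders, and the bound is exclusive, so the iterator simply becomes invalid once it is reached.

    #include <memory>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice.h"

    // Sketch: bounded forward scan. `db` is an already opened rocksdb::DB*.
    void ScanBelowBound(rocksdb::DB* db) {
      rocksdb::Slice upper_bound("foo2");   // exclusive; must outlive the iterator
      rocksdb::ReadOptions read_options;
      read_options.iterate_upper_bound = &upper_bound;

      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
      for (it->Seek("foo"); it->Valid(); it->Next()) {
        // Visits "foo" and "foo1"; Valid() turns false at "foo2", without
        // scanning delete markers or overwritten entries past the bound.
      }
    }

Because the ReadOptions only stores a pointer, the Slice passed as the bound has to stay alive for as long as the iterator is used.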
### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size diff --git a/db/c.cc b/db/c.cc index 3114f3500..9ea549646 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1844,6 +1844,13 @@ void rocksdb_readoptions_set_snapshot( opt->rep.snapshot = (snap ? snap->rep : nullptr); } +void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t* opt, + const char* key, size_t keylen) { + Slice prefix = Slice(key, keylen); + opt->rep.iterate_upper_bound = &prefix; +} + void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); diff --git a/db/db_impl.cc b/db/db_impl.cc index 7c65e9a61..f18bb2141 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3677,7 +3677,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // TODO(ljin): remove tailing iterator auto iter = new ForwardIterator(this, options, cfd); return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber); + kMaxSequenceNumber, options.iterate_upper_bound); // return new TailingIterator(env_, this, options, cfd); #endif } else { @@ -3733,7 +3733,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), snapshot); + env_, *cfd->options(), cfd->user_comparator(), + snapshot, options.iterate_upper_bound); + Iterator* internal_iter = NewInternalIterator(options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); diff --git a/db/db_iter.cc b/db/db_iter.cc index 599a56a99..bfdcd4edb 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -59,7 +59,8 @@ class DBIter: public Iterator { }; DBIter(Env* env, const Options& options, const Comparator* cmp, - Iterator* iter, SequenceNumber s, bool arena_mode) + Iterator* iter, SequenceNumber s, bool arena_mode, + const Slice* iterate_upper_bound = nullptr) : arena_mode_(arena_mode), env_(env), logger_(options.info_log.get()), @@ -70,9 +71,10 @@ class DBIter: public Iterator { direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics.get()) { + statistics_(options.statistics.get()), + iterate_upper_bound_(iterate_upper_bound) { RecordTick(statistics_, NO_ITERATORS); - has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); + prefix_extractor_ = options.prefix_extractor.get(); max_skip_ = options.max_sequential_skip_in_iterations; } virtual ~DBIter() { @@ -132,7 +134,7 @@ class DBIter: public Iterator { } } - bool has_prefix_extractor_; + const SliceTransform* prefix_extractor_; bool arena_mode_; Env* const env_; Logger* logger_; @@ -149,6 +151,7 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + const Slice* iterate_upper_bound_; // No copying allowed DBIter(const DBIter&); @@ -207,36 +210,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { uint64_t num_skipped = 0; do { ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { - num_skipped++; // skip this entry - PERF_COUNTER_ADD(internal_key_skipped_count, 1); - } else { - skipping = false; - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this 
key since - // they are hidden by this deletion. - saved_key_.SetKey(ikey.user_key); - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - break; - case kTypeValue: - valid_ = true; - saved_key_.SetKey(ikey.user_key); - return; - case kTypeMerge: - // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); - current_entry_is_merged_ = true; - valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; - default: - assert(false); - break; + + if (ParseKey(&ikey)) { + if (iterate_upper_bound_ != nullptr && + ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + break; + } + + if (ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. + saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; + } } } } @@ -398,6 +409,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeDeletion: operands.clear(); last_not_merge_type = kTypeDeletion; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: assert(user_merge_operator_ != nullptr); @@ -407,6 +419,7 @@ bool DBIter::FindValueForCurrentKey() { assert(false); } + PERF_COUNTER_ADD(internal_key_skipped_count, 1); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); iter_->Prev(); ++num_skipped; @@ -553,6 +566,20 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); + // total ordering is not guaranteed if prefix_extractor is set + // hence prefix based seeks will not give correct results + if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound_) || + !prefix_extractor_->InDomain(target) || + prefix_extractor_->Transform(*iterate_upper_bound_).compare( + prefix_extractor_->Transform(target)) != 0) { + status_ = Status::InvalidArgument("read_options.iterate_*_bound " + " and seek target need to have the same prefix."); + valid_ = false; + return; + } + } + saved_key_.Clear(); // now savved_key is used to store internal key. saved_key_.SetInternalKey(target, sequence_); @@ -574,7 +601,7 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; @@ -595,7 +622,7 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. 
- if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; @@ -612,9 +639,10 @@ void DBIter::SeekToLast() { Iterator* NewDBIterator(Env* env, const Options& options, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { + const SequenceNumber& sequence, + const Slice* iterate_upper_bound) { return new DBIter(env, options, user_key_comparator, internal_iter, sequence, - false); + false, iterate_upper_bound); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -643,13 +671,16 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence) { + const SequenceNumber& sequence, + const Slice* iterate_upper_bound) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) - DBIter(env, options, user_key_comparator, nullptr, sequence, true); + DBIter* db_iter = new (mem) DBIter(env, options, user_key_comparator, + nullptr, sequence, true, iterate_upper_bound); + iter->SetDBIter(db_iter); + return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index cb9840324..ffea34fa9 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -27,7 +27,8 @@ extern Iterator* NewDBIterator( const Options& options, const Comparator *user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + const SequenceNumber& sequence, + const Slice* iterate_upper_bound = nullptr); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -68,6 +69,6 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. 
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence); + const SequenceNumber& sequence, const Slice* iterate_upper_bound = nullptr); } // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index 6295f5921..0b0365211 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -7743,6 +7743,144 @@ TEST(DBTest, TableOptionsSanitizeTest) { ASSERT_TRUE(TryReopen(&options).IsNotSupported()); } +TEST(DBTest, DBIteratorBoundTest) { + Options options; + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last expected entry + ro.iterate_upper_bound = new Slice("foo2"); + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... 
+ ASSERT_TRUE(!iter->Valid()); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + ro.iterate_upper_bound = new Slice("g1"); + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(&options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); + + // now testing with iterate_bound + ro.iterate_upper_bound = new Slice("c"); + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c54e6707f..e4b1bb753 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -698,6 +698,10 @@ extern void rocksdb_readoptions_set_fill_cache( extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, + const char* key, + size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 11d976fb2..fbb3b6ddb 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -903,6 +903,18 @@ struct ReadOptions { // ! DEPRECATED // const Slice* prefix; + // "iterate_upper_bound" defines the extent upto which the forward iterator + // can returns entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If iterator_extractor is not null, the Seek target + // and iterator_upper_bound need to have the same prefix. + // This is because ordering is not guaranteed outside of prefix domain. + // There is no lower bound on the iterator. 
If needed, that can be easily + // implemented + // + // Default: nullptr + const Slice* iterate_upper_bound; + // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. @@ -926,6 +938,7 @@ struct ReadOptions { : verify_checksums(true), fill_cache(true), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -933,6 +946,7 @@ struct ReadOptions { : verify_checksums(cksum), fill_cache(cache), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} From 5665e5e285c25c1674567f747df92c131037d2dc Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 4 Sep 2014 16:18:36 -0700 Subject: [PATCH 021/829] introduce ImmutableOptions Summary: As a preparation to support updating some options dynamically, I'd like to first introduce ImmutableOptions, which is a subset of Options that cannot be changed during the course of a DB lifetime without restart. ColumnFamily will keep both Options and ImmutableOptions. Any component below ColumnFamily should only take ImmutableOptions in their constructor. Other options should be taken from APIs, which will be allowed to adjust dynamically. I am yet to make changes to memtable and other related classes to take ImmutableOptions in their ctor. That can be done in a seprate diff as this one is already pretty big. Test Plan: make all check Reviewers: yhchiang, igor, sdong Reviewed By: sdong Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D22545 --- db/builder.cc | 40 +++++++------ db/builder.h | 11 +++- db/column_family.cc | 15 ++--- db/column_family.h | 13 ++-- db/db_impl.cc | 37 ++++++------ db/db_impl.h | 2 +- db/plain_table_db_test.cc | 22 +++---- db/repair.cc | 23 ++++--- db/simple_table_db_test.cc | 69 +++++++++++---------- db/table_cache.cc | 43 +++++++------- db/table_cache.h | 11 ++-- db/table_properties_collector_test.cc | 3 +- include/rocksdb/immutable_options.h | 62 +++++++++++++++++++ include/rocksdb/table.h | 23 ++++--- table/adaptive_table_factory.cc | 18 +++--- table/adaptive_table_factory.h | 24 ++++---- table/block_based_table_builder.cc | 49 ++++++++------- table/block_based_table_builder.h | 6 +- table/block_based_table_factory.cc | 13 ++-- table/block_based_table_factory.h | 17 +++--- table/block_based_table_reader.cc | 67 +++++++++++---------- table/block_based_table_reader.h | 5 +- table/cuckoo_table_factory.cc | 18 +++--- table/cuckoo_table_factory.h | 7 ++- table/cuckoo_table_reader.cc | 6 +- table/cuckoo_table_reader.h | 5 +- table/cuckoo_table_reader_test.cc | 15 +++-- table/filter_block.cc | 9 +-- table/filter_block.h | 5 +- table/filter_block_test.cc | 18 +++--- table/plain_table_builder.cc | 29 ++++----- table/plain_table_builder.h | 4 +- table/plain_table_factory.cc | 14 +++-- table/plain_table_factory.h | 21 +++---- table/plain_table_index.cc | 8 +-- table/plain_table_index.h | 8 +-- table/plain_table_reader.cc | 51 ++++++++-------- table/plain_table_reader.h | 10 ++-- table/table_reader_bench.cc | 8 ++- table/table_test.cc | 86 ++++++++++++++++++--------- tools/sst_dump.cc | 6 +- util/options.cc | 21 +++++++ 42 files changed, 554 insertions(+), 368 deletions(-) create mode 100644 include/rocksdb/immutable_options.h diff --git a/db/builder.cc b/db/builder.cc index 1084f0413..2c5094370 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,21 
+26,24 @@ namespace rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const Options& options, +TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) { - return options.table_factory->NewTableBuilder(options, internal_comparator, - file, compression_type); + const CompressionType compression_type, + const CompressionOptions& compression_opts) { + return ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } -Status BuildTable(const std::string& dbname, Env* env, const Options& options, - const EnvOptions& soptions, TableCache* table_cache, +Status BuildTable(const std::string& dbname, Env* env, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority) { Status s; meta->fd.file_size = 0; @@ -50,23 +53,24 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, // If the sequence number of the smallest entry in the memtable is // smaller than the most recent snapshot, then we do not trigger // removal of duplicate/deleted keys as part of this builder. - bool purge = options.purge_redundant_kvs_while_flush; + bool purge = ioptions.purge_redundant_kvs_while_flush; if (earliest_seqno_in_memtable <= newest_snapshot) { purge = false; } - std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { unique_ptr file; - s = env->NewWritableFile(fname, &file, soptions); + s = env->NewWritableFile(fname, &file, env_options); if (!s.ok()) { return s; } file->SetIOPriority(io_priority); - TableBuilder* builder = - NewTableBuilder(options, internal_comparator, file.get(), compression); + TableBuilder* builder = NewTableBuilder( + ioptions, internal_comparator, file.get(), + compression, compression_opts); // the first key is the smallest key Slice key = iter->key(); @@ -75,8 +79,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, meta->largest_seqno = meta->smallest_seqno; MergeHelper merge(internal_comparator.user_comparator(), - options.merge_operator.get(), options.info_log.get(), - options.min_partial_merge_operands, + ioptions.merge_operator, ioptions.info_log, + ioptions.min_partial_merge_operands, true /* internal key corruption is not ok */); if (purge) { @@ -196,12 +200,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, delete builder; // Finish and check for file errors - if (s.ok() && !options.disableDataSync) { - if (options.use_fsync) { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + if (s.ok() && !ioptions.disable_data_sync) { + if (ioptions.use_fsync) { + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Fsync(); } else { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Sync(); } } @@ -211,7 +215,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { // Verify that the table is usable - 
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + Iterator* it = table_cache->NewIterator(ReadOptions(), env_options, internal_comparator, meta->fd); s = it->status(); delete it; diff --git a/db/builder.h b/db/builder.h index f57501abd..cf3ebd1ae 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -26,8 +27,10 @@ class TableBuilder; class WritableFile; extern TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + const ImmutableCFOptions& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder( // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. extern Status BuildTable(const std::string& dbname, Env* env, - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& options, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority = Env::IO_HIGH); } // namespace rocksdb diff --git a/db/column_family.cc b/db/column_family.cc index b1c9ba7e8..7e06c9bd7 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -178,7 +178,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), @@ -188,6 +188,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, dropped_(false), internal_comparator_(options.comparator), options_(*db_options, SanitizeOptions(&internal_comparator_, options)), + ioptions_(options_), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -204,7 +205,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, if (dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(options_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); if (options_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(&options_, &internal_comparator_)); @@ -306,7 +307,7 @@ void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { } const EnvOptions* ColumnFamilyData::soptions() const { - return &(column_family_set_->storage_options_); + return &(column_family_set_->env_options_); } void ColumnFamilyData::SetCurrent(Version* current) { @@ -462,16 +463,16 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - 
const EnvOptions& storage_options, + const EnvOptions& env_options, Cache* table_cache) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, ColumnFamilyOptions(), db_options, - storage_options_, nullptr)), + env_options_, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - storage_options_(storage_options), + env_options_(env_options), table_cache_(table_cache), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list @@ -537,7 +538,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, storage_options_, this); + db_options_, env_options_, this); Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); diff --git a/db/column_family.h b/db/column_family.h index 33bceadc6..a68189d51 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -165,9 +165,11 @@ class ColumnFamilyData { void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - // thread-safe + // TODO(ljin): make this API thread-safe once we allow updating options_ const Options* options() const { return &options_; } + // thread-safe const EnvOptions* soptions() const; + const ImmutableCFOptions* ioptions() const { return &ioptions_; } InternalStats* internal_stats() { return internal_stats_.get(); } @@ -251,7 +253,7 @@ class ColumnFamilyData { Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, const DBOptions* db_options, - const EnvOptions& storage_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -272,7 +274,8 @@ class ColumnFamilyData { const InternalKeyComparator internal_comparator_; - Options const options_; + const Options options_; + const ImmutableCFOptions ioptions_; std::unique_ptr table_cache_; @@ -367,7 +370,7 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, Cache* table_cache); + const EnvOptions& env_options, Cache* table_cache); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -420,7 +423,7 @@ class ColumnFamilySet { const std::string db_name_; const DBOptions* const db_options_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Cache* table_cache_; std::atomic_flag spin_lock_; }; diff --git a/db/db_impl.cc b/db/db_impl.cc index f18bb2141..049d40c7b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -356,7 +356,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), delayed_writes_(0), - storage_options_(options), + env_options_(options), bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { @@ -372,7 +372,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) options_.table_cache_remove_scan_count_limit); versions_.reset( - new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); + new VersionSet(dbname_, &options_, env_options_, table_cache_.get())); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -453,7 +453,7 @@ Status DBImpl::NewDB() { const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s 
= env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); if (!s.ok()) { return s; } @@ -1075,7 +1075,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, }; unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); + Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { return status; @@ -1275,7 +1275,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // Open the log file std::string fname = LogFileName(options_.wal_dir, log_number); unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); + Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { MaybeIgnoreError(&status); return status; @@ -1425,10 +1425,11 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, Status s; { mutex_.Unlock(); - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + s = BuildTable(dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); + GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); LogFlush(options_.info_log); mutex_.Lock(); } @@ -1495,10 +1496,11 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", cfd->GetName().c_str(), meta.fd.GetNumber()); - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, + s = BuildTable(dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), iter, &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); + GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); LogFlush(options_.info_log); delete iter; Log(options_.info_log, @@ -2447,7 +2449,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { // Make the output file std::string fname = TableFileName(options_.db_paths, file_number, compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); + Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); if (s.ok()) { compact->outfile->SetIOPriority(Env::IO_LOW); @@ -2456,8 +2458,9 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->builder.reset(NewTableBuilder( - *cfd->options(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType())); + *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), + compact->compaction->OutputCompressionType(), + cfd->options()->compression_opts)); } LogFlush(options_.info_log); return s; @@ -2506,7 +2509,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, ColumnFamilyData* cfd = compact->compaction->column_family_data(); FileDescriptor fd(output_number, output_path_id, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, cfd->internal_comparator(), fd); + ReadOptions(), env_options_, cfd->internal_comparator(), fd); s = iter->status(); delete iter; if (s.ok()) { @@ -3355,7 +3358,7 @@ Iterator* 
DBImpl::NewInternalIterator(const ReadOptions& options, // Collect all needed child iterators for immutable memtables super_version->imm->AddIterators(options, &merge_iter_builder); // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, + super_version->current->AddIterators(options, env_options_, &merge_iter_builder); internal_iter = merge_iter_builder.Finish(); } else { @@ -3366,7 +3369,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, // Collect all needed child iterators for immutable memtables super_version->imm->AddIterators(options, &iterator_list); // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, + super_version->current->AddIterators(options, env_options_, &iterator_list); internal_iter = NewMergingIterator(&cfd->internal_comparator(), &iterator_list[0], iterator_list.size()); @@ -4377,7 +4380,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, if (creating_new_log) { s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), &lfile, - env_->OptimizeForLogWrite(storage_options_)); + env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. @@ -4615,7 +4618,7 @@ Status DBImpl::GetUpdatesSince( return s; } iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, - read_options, storage_options_, + read_options, env_options_, seq, std::move(wal_files), this)); return (*iter)->status(); } diff --git a/db/db_impl.h b/db/db_impl.h index 086ac9fd4..caacd012a 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -607,7 +607,7 @@ class DBImpl : public DB { int delayed_writes_; // The options to access storage files - const EnvOptions storage_options_; + const EnvOptions env_options_; // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bb0f96f15..1750d265c 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& storage_options, + TestPlainTableReader(const EnvOptions& env_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, unique_ptr&& file, - const Options& options, bool* expect_bloom_not_match, + const ImmutableCFOptions& ioptions, + bool* expect_bloom_not_match, bool store_index_in_file) - : PlainTableReader(options, std::move(file), storage_options, icomparator, + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataFile(); @@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader { PlainTablePropertyNames::kBloomVersion); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_EQ(bloom_version_ptr->second, std::string("1")); - if (options.bloom_locality > 0) { + if (ioptions.bloom_locality > 0) { auto num_blocks_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kNumBloomBlocks); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); @@ -253,25 
+254,26 @@ class TestPlainTableFactory : public PlainTableFactory { store_index_in_file_(options.store_index_in_file), expect_bloom_not_match_(expect_bloom_not_match) {} - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); ASSERT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - options.env, BloomBlockBuilder::kBloomBlock, + ioptions.env, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); ASSERT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock( - file.get(), file_size, kPlainTableMagicNumber, options.env, + file.get(), file_size, kPlainTableMagicNumber, ioptions.env, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); ASSERT_TRUE(s.ok()); } @@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory { DecodeFixed32(encoding_type_prop->second.c_str())); std::unique_ptr new_reader(new TestPlainTableReader( - soptions, internal_comparator, encoding_type, file_size, + env_options, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), options, expect_bloom_not_match_, + std::move(file), ioptions, expect_bloom_not_match_, store_index_in_file_)); *table = std::move(new_reader); diff --git a/db/repair.cc b/db/repair.cc index 820cc1924..3c64449d1 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -46,6 +46,8 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -58,6 +60,7 @@ class Repairer { env_(options.env), icmp_(options.comparator), options_(SanitizeOptions(dbname, &icmp_, options)), + ioptions_(options_), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. 
@@ -65,7 +68,7 @@ class Repairer { options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { table_cache_ = - new TableCache(&options_, storage_options_, raw_table_cache_.get()); + new TableCache(ioptions_, env_options_, raw_table_cache_.get()); edit_ = new VersionEdit(); } @@ -107,8 +110,9 @@ class Repairer { std::string const dbname_; Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; + const InternalKeyComparator icmp_; + const Options options_; + const ImmutableCFOptions ioptions_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -118,7 +122,7 @@ class Repairer { std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Status FindFiles() { std::vector filenames; @@ -190,7 +194,7 @@ class Repairer { // Open the log file std::string logname = LogFileName(dbname_, log); unique_ptr lfile; - Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + Status status = env_->NewSequentialFile(logname, &lfile, env_options_); if (!status.ok()) { return status; } @@ -239,8 +243,9 @@ class Repairer { ReadOptions ro; ro.total_order_seek = true; Iterator* iter = mem->NewIterator(ro); - status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, - iter, &meta, icmp_, 0, 0, kNoCompression); + status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, + iter, &meta, icmp_, 0, 0, kNoCompression, + CompressionOptions()); delete iter; delete mem->Unref(); delete cf_mems_default; @@ -286,7 +291,7 @@ class Repairer { file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, icmp_, t->meta.fd); + ReadOptions(), env_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -326,7 +331,7 @@ class Repairer { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); + tmp, &file, env_->OptimizeForManifestWrite(env_options_)); if (!status.ok()) { return status; } diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index e88485070..0a0ecf064 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -79,7 +79,8 @@ public: // for the duration of the returned table's lifetime. // // *file must remain live while this Table is in use. 
- static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& options, + const EnvOptions& env_options, unique_ptr && file, uint64_t file_size, unique_ptr* table_reader); @@ -160,14 +161,14 @@ private: struct SimpleTableReader::Rep { ~Rep() { } - Rep(const EnvOptions& storage_options, uint64_t index_start_offset, - int num_entries) : - soptions(storage_options), index_start_offset(index_start_offset), - num_entries(num_entries) { + Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + uint64_t index_start_offset, int num_entries) : + ioptions(ioptions), env_options(env_options), + index_start_offset(index_start_offset), num_entries(num_entries) { } - Options options; - const EnvOptions& soptions; + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; Status status; unique_ptr file; uint64_t index_start_offset; @@ -187,8 +188,8 @@ SimpleTableReader::~SimpleTableReader() { delete rep_; } -Status SimpleTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, unique_ptr && file, uint64_t size, unique_ptr* table_reader) { @@ -201,12 +202,10 @@ Status SimpleTableReader::Open(const Options& options, int num_entries = (size - Rep::offset_length - index_start_offset) / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, - index_start_offset, - num_entries); + SimpleTableReader::Rep* rep = new SimpleTableReader::Rep( + ioptions, env_options, index_start_offset, num_entries); rep->file = std::move(file); - rep->options = options; table_reader->reset(new SimpleTableReader(rep)); } return s; @@ -248,7 +247,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { return s; } - InternalKeyComparator ikc(rep_->options.comparator); + InternalKeyComparator ikc(rep_->ioptions.comparator); int compare_result = ikc.Compare(tmp_slice, target); if (compare_result < 0) { @@ -382,7 +381,7 @@ void SimpleTableIterator::Prev() { } Slice SimpleTableIterator::key() const { - Log(table_->rep_->options.info_log, "key!!!!"); + Log(table_->rep_->ioptions.info_log, "key!!!!"); return key_; } @@ -401,7 +400,7 @@ public: // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const Options& options, WritableFile* file, + SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type); // REQUIRES: Either Finish() or Abandon() has been called. 
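The hunks in this patch keep repeating one conversion: a component that used to receive the whole Options now receives an ImmutableCFOptions built from it. A minimal sketch of that boundary follows, assuming only the explicit constructor declared in the new include/rocksdb/immutable_options.h further below; the field accesses are illustrative.

    #include "rocksdb/immutable_options.h"
    #include "rocksdb/options.h"

    // Sketch: snapshot the immutable subset once and hand only that subset to
    // components below the column family (table factories, table caches, ...).
    void HandOffImmutableSubset() {
      rocksdb::Options options;
      options.allow_mmap_reads = true;                // one of the copied fields
      rocksdb::ImmutableCFOptions ioptions(options);  // explicit copy of the subset
      // ioptions.comparator, ioptions.table_factory, ioptions.statistics, ...
      // now mirror `options`; mutable options stay on Options and can later be
      // adjusted without touching ioptions.
      (void)ioptions;
    }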
@@ -444,7 +443,7 @@ private: }; struct SimpleTableBuilder::Rep { - Options options; + const ImmutableCFOptions& ioptions; WritableFile* file; uint64_t offset = 0; Status status; @@ -463,17 +462,17 @@ struct SimpleTableBuilder::Rep { std::string index; - Rep(const Options& opt, WritableFile* f) : - options(opt), file(f) { + Rep(const ImmutableCFOptions& iopt, WritableFile* f) : + ioptions(iopt), file(f) { } ~Rep() { } }; -SimpleTableBuilder::SimpleTableBuilder(const Options& options, +SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(options, file)) { + rep_(new SimpleTableBuilder::Rep(ioptions, file)) { } SimpleTableBuilder::~SimpleTableBuilder() { @@ -546,15 +545,18 @@ public: const char* Name() const override { return "SimpleTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& internal_key, - WritableFile* file, - CompressionType compression_type) const; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const; virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { return Status::OK(); @@ -566,19 +568,22 @@ public: }; Status SimpleTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_key, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return SimpleTableReader::Open(options, soptions, std::move(file), file_size, - table_reader); + return SimpleTableReader::Open(ioptions, env_options, std::move(file), + file_size, table_reader); } TableBuilder* SimpleTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_key, - WritableFile* file, CompressionType compression_type) const { - return new SimpleTableBuilder(options, file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return new SimpleTableBuilder(ioptions, file, compression_type); } class SimpleTableDBTest { diff --git a/db/table_cache.cc b/db/table_cache.cc index c362499a6..5cb96f8bf 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -36,12 +36,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const Options* options, - const EnvOptions& storage_options, Cache* const cache) - : env_(options->env), - db_paths_(options->db_paths), - options_(options), - storage_options_(storage_options), +TableCache::TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, Cache* const cache) + : ioptions_(ioptions), + env_options_(env_options), cache_(cache) {} TableCache::~TableCache() { @@ -55,7 +53,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { cache_->Release(handle); } -Status TableCache::FindTable(const EnvOptions& toptions, +Status 
TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io) { @@ -68,24 +66,24 @@ Status TableCache::FindTable(const EnvOptions& toptions, return Status::Incomplete("Table not found in table_cache, no_io is set"); } std::string fname = - TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); + TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; unique_ptr table_reader; - s = env_->NewRandomAccessFile(fname, &file, toptions); - RecordTick(options_->statistics.get(), NO_FILE_OPENS); + s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); + RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (options_->advise_random_on_open) { + if (ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->NewTableReader( - *options_, toptions, internal_comparator, std::move(file), + StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + s = ioptions_.table_factory->NewTableReader( + ioptions_, env_options, internal_comparator, std::move(file), fd.GetFileSize(), &table_reader); } if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { @@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, } Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, @@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, fd, &handle, + s = FindTable(env_options, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -142,7 +140,7 @@ Status TableCache::Get(const ReadOptions& options, Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, fd, &handle, + s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -160,8 +158,9 @@ Status TableCache::Get(const ReadOptions& options, } return s; } + Status TableCache::GetTableProperties( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; @@ -174,7 +173,7 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); + s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } @@ -186,7 +185,7 @@ Status TableCache::GetTableProperties( } size_t TableCache::GetMemoryUsageByTableReader( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd) { Status s; @@ -197,7 +196,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s 
= FindTable(toptions, internal_comparator, fd, &table_handle, true); + s = FindTable(env_options, internal_comparator, fd, &table_handle, true); if (!s.ok()) { return 0; } diff --git a/db/table_cache.h b/db/table_cache.h index 79090e064..2f6740d9f 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -19,6 +19,7 @@ #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -29,8 +30,8 @@ struct FileDescriptor; class TableCache { public: - TableCache(const Options* options, const EnvOptions& storage_options, - Cache* cache); + TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& storage_options, Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -91,10 +92,8 @@ class TableCache { void ReleaseHandle(Cache::Handle* handle); private: - Env* const env_; - const std::vector db_paths_; - const Options* options_; - const EnvOptions& storage_options_; + const ImmutableCFOptions& ioptions_; + const EnvOptions& env_options_; Cache* const cache_; }; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 638b259f2..8168ca5d6 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -90,7 +90,8 @@ void MakeBuilder(const Options& options, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); builder->reset(options.table_factory->NewTableBuilder( - options, internal_comparator, writable->get(), options.compression)); + ImmutableCFOptions(options), internal_comparator, writable->get(), + options.compression, options.compression_opts)); } } // namespace diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h new file mode 100644 index 000000000..22084f6f0 --- /dev/null +++ b/include/rocksdb/immutable_options.h @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a +// subset of Options that should not be changed during the entire lifetime +// of DB. You shouldn't need to access this data structure unless you are +// implementing a new TableFactory. +struct ImmutableCFOptions { + explicit ImmutableCFOptions(const Options& options); + + const SliceTransform* prefix_extractor; + + const Comparator* comparator; + + MergeOperator* merge_operator; + + Logger* info_log; + + Statistics* statistics; + + InfoLogLevel info_log_level; + + Env* env; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + std::vector db_paths; + + TableFactory* table_factory; + + Options::TablePropertiesCollectorFactories + table_properties_collector_factories; + + bool advise_random_on_open; + + // This options is required by PlainTableReader. 
May need to move it
+  // to PlainTalbeOptions just like bloom_bits_per_key
+  uint32_t bloom_locality;
+
+  bool purge_redundant_kvs_while_flush;
+
+  uint32_t min_partial_merge_operands;
+
+  bool disable_data_sync;
+
+  bool use_fsync;
+};
+
+}  // namespace rocksdb
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 0f8b41074..2fb4f50dd 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -23,6 +23,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
 #include "rocksdb/status.h"
 
 namespace rocksdb {
@@ -293,14 +294,15 @@ class TableFactory {
   // and cache the table object returned.
   // (1) SstFileReader (for SST Dump) opens the table and dump the table
   //     contents using the interator of the table.
-  // options and soptions are options. options is the general options.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // EnvOptions is a subset of Options that will be used by Env.
   // Multiple configured can be accessed from there, including and not
   // limited to block cache and key comparators.
   // file is a file handler to handle the file for the table
   // file_size is the physical file size of the file
   // table_reader is the output table reader
   virtual Status NewTableReader(
-      const Options& options, const EnvOptions& soptions,
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
       const InternalKeyComparator& internal_comparator,
       unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
       unique_ptr<TableReader>* table_reader) const = 0;
@@ -318,14 +320,17 @@ class TableFactory {
   // (4) When running Repairer, it creates a table builder to convert logs to
   //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
   //
-  // options is the general options. Multiple configured can be acceseed from
-  // there, including and not limited to compression options.
-  // file is a handle of a writable file. It is the caller's responsibility to
-  // keep the file open and close the file after closing the table builder.
-  // compression_type is the compression type to use in this table.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // Multiple configured can be acceseed from there, including and not limited
+  // to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
   virtual TableBuilder* NewTableBuilder(
-      const Options& options, const InternalKeyComparator& internal_comparator,
-      WritableFile* file, CompressionType compression_type) const = 0;
+      const ImmutableCFOptions& ioptions,
+      const InternalKeyComparator& internal_comparator,
+      WritableFile* file, const CompressionType compression_type,
+      const CompressionOptions& compression_opts) const = 0;
 
   // Sanitizes the specified DB Options.
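
The hunk above is the core interface change of this patch. As a minimal, hypothetical sketch (not part of the patch) of what a builder-side caller now looks like, mirroring the table_reader_bench.cc and table_test.cc hunks later in this series; the helper name BuildEmptyTable is invented, and the internal headers db/dbformat.h and table/table_builder.h are assumed to be visible, as they are for in-tree callers:

    // Sketch only: BuildEmptyTable is a hypothetical helper, not RocksDB code.
    #include <memory>
    #include "db/dbformat.h"
    #include "rocksdb/env.h"
    #include "rocksdb/immutable_options.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"
    #include "table/table_builder.h"

    rocksdb::Status BuildEmptyTable(const rocksdb::Options& options,
                                    rocksdb::WritableFile* file) {
      // ImmutableCFOptions copies raw pointers (info_log, statistics, table
      // factory, prefix extractor) out of `options`, so `options` must outlive
      // the snapshot taken here.
      const rocksdb::ImmutableCFOptions ioptions(options);
      rocksdb::InternalKeyComparator ikc(options.comparator);
      std::unique_ptr<rocksdb::TableBuilder> builder(
          options.table_factory->NewTableBuilder(
              ioptions, ikc, file, rocksdb::kNoCompression,
              rocksdb::CompressionOptions()));
      // The builder never owns `file`; the caller keeps it open and closes it
      // after Finish() or Abandon(), as the table.h comment above states.
      return builder->Finish();
    }
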
// diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index a259e79d8..c693064af 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { Footer footer; @@ -50,24 +50,26 @@ Status AdaptiveTableFactory::NewTableReader( if (footer.table_magic_number() == kPlainTableMagicNumber || footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return plain_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else { return Status::NotSupported("Unidentified table format"); } } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return table_factory_to_write_->NewTableBuilder(options, internal_comparator, - file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return table_factory_to_write_->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } std::string AdaptiveTableFactory::GetPrintableTableOptions() const { diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index f119d97b1..f0920db97 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -12,7 +12,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -31,16 +30,21 @@ class AdaptiveTableFactory : public TableFactory { std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory); + const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const 
InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index ddfbe74a6..fde363760 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -25,7 +25,6 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" @@ -385,7 +384,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const Options options; + const ImmutableCFOptions ioptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFile* file; @@ -397,7 +396,8 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr index_builder; std::string last_key; - CompressionType compression_type; + const CompressionType compression_type; + const CompressionOptions compression_opts; TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. @@ -413,27 +413,31 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const Options& opt, const BlockBasedTableOptions& table_opt, + Rep(const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_opt, const InternalKeyComparator& icomparator, - WritableFile* f, CompressionType compression_type) - : options(opt), + WritableFile* f, const CompressionType compression_type, + const CompressionOptions& compression_opts) + : ioptions(ioptions), table_options(table_opt), internal_comparator(icomparator), file(f), data_block(table_options.block_restart_interval), - internal_prefix_transform(options.prefix_extractor.get()), + internal_prefix_transform(ioptions.prefix_extractor), index_builder(CreateIndexBuilder( table_options.index_type, &internal_comparator, &this->internal_prefix_transform)), compression_type(compression_type), + compression_opts(compression_opts), filter_block(table_options.filter_policy == nullptr ? 
nullptr : - new FilterBlockBuilder(opt, table_options, &internal_comparator)), + new FilterBlockBuilder(ioptions.prefix_extractor, + table_options, &internal_comparator)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)) { for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -443,11 +447,13 @@ struct BlockBasedTableBuilder::Rep { }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, const BlockBasedTableOptions& table_options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) - : rep_(new Rep(options, table_options, internal_comparator, - file, compression_type)) { + const CompressionType compression_type, + const CompressionOptions& compression_opts) + : rep_(new Rep(ioptions, table_options, internal_comparator, + file, compression_type, compression_opts)) { if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -502,7 +508,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, - r->options.info_log.get()); + r->ioptions.info_log); } void BlockBasedTableBuilder::Flush() { @@ -540,10 +546,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Slice block_contents; if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = - CompressBlock(raw_block_contents, r->options.compression_opts, &type, + CompressBlock(raw_block_contents, r->compression_opts, &type, &r->compressed_output); } else { - RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } @@ -555,8 +561,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->options.env, r->options.statistics.get(), - WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -717,7 +722,7 @@ Status BlockBasedTableBuilder::Finish() { // Add use collected properties NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->options.info_log.get(), + r->ioptions.info_log, &property_block_builder); BlockHandle properties_block_handle; @@ -776,14 +781,12 @@ Status BlockBasedTableBuilder::Finish() { } } - Log( - r->options.info_log, + Log(r->ioptions.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", r->props.ToString().c_str(), - user_collected.c_str() - ); + user_collected.c_str()); } return r->status; diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 72a2f207a..6fde32919 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -28,10 +28,12 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. 
It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder(const Options& options, + BlockBasedTableBuilder(const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index de30fb383..b4e2e7d1f 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -41,21 +41,24 @@ BlockBasedTableFactory::BlockBasedTableFactory( } Status BlockBasedTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(options, soptions, table_options_, + return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { auto table_builder = new BlockBasedTableBuilder( - options, table_options_, internal_comparator, file, compression_type); + ioptions, table_options_, internal_comparator, file, + compression_type, compression_opts); return table_builder; } diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index d7045346a..2dcfda6d4 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,13 +14,11 @@ #include #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "db/dbformat.h" namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -35,14 +33,17 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const override; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. 
Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 0be38a1dc..cf915e105 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -336,15 +336,16 @@ class HashIndexReader : public IndexReader { struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options, + Rep(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_opt, const InternalKeyComparator& internal_comparator) - : soptions(storage_options), table_options(table_opt), + : ioptions(ioptions), env_options(env_options), table_options(table_opt), filter_policy(table_opt.filter_policy.get()), internal_comparator(internal_comparator) {} - Options options; - const EnvOptions& soptions; + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; const BlockBasedTableOptions& table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; @@ -446,7 +447,8 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, +Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, @@ -461,8 +463,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // We've successfully read the footer and the index block: we're // ready to serve requests. Rep* rep = new BlockBasedTable::Rep( - soptions, table_options, internal_comparator); - rep->options = options; + ioptions, env_options, table_options, internal_comparator); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -484,7 +485,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->options.env, rep->options.info_log.get(), + rep->ioptions.env, rep->ioptions.info_log, &table_properties); } @@ -492,12 +493,12 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, auto err_msg = "[Warning] Encountered error while reading data from properties " "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->options.info_log, + Log(WARN_LEVEL, rep->ioptions.info_log, "Cannot find Properties block from file."); } @@ -546,7 +547,8 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, } void BlockBasedTable::SetupForCompaction() { - switch (rep_->options.access_hint_on_compaction_start) { + /* + switch (.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -562,6 +564,7 @@ void BlockBasedTable::SetupForCompaction() { assert(false); } compaction_optimized_ = true; + */ } std::shared_ptr BlockBasedTable::GetTableProperties() @@ -596,13 +599,13 @@ Status BlockBasedTable::ReadMetaBlock( ReadOptions(), rep->footer.metaindex_handle(), &meta, - rep->options.env); + rep->ioptions.env); if (!s.ok()) { auto err_msg = "[Warning] Encountered error while reading data from properties" "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + 
Log(rep->ioptions.info_log, "%s", err_msg.c_str()); } if (!s.ok()) { delete meta; @@ -746,7 +749,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, ReadOptions opt; BlockContents block; if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->options.env, false).ok()) { + &block, rep->ioptions.env, false).ok()) { return nullptr; } @@ -755,7 +758,8 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, } return new FilterBlockReader( - rep->options, rep->table_options, block.data, block.heap_allocated); + rep->ioptions.prefix_extractor, rep->table_options, + block.data, block.heap_allocated); } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( @@ -780,7 +784,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( cache_key ); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics); @@ -830,7 +834,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.index_handle(), cache_key); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, statistics); @@ -906,7 +910,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // If either block cache is enabled, we'll try to read from it. if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->options.statistics.get(); + Statistics* statistics = rep->ioptions.statistics; char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key, /* key to the block cache */ @@ -930,9 +934,9 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; { - StopWatch sw(rep->options.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->options.env, + &raw_block, rep->ioptions.env, block_cache_compressed == nullptr); } @@ -955,7 +959,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->options.env); + &block.value, rep->ioptions.env); } Iterator* iter; @@ -982,7 +986,8 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, const ReadOptions& read_options) - : TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + : TwoLevelIteratorState( + table->rep_->ioptions.prefix_extractor != nullptr), table_(table), read_options_(read_options) {} @@ -1020,8 +1025,8 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { return true; } - assert(rep_->options.prefix_extractor != nullptr); - auto prefix = rep_->options.prefix_extractor->Transform( + assert(rep_->ioptions.prefix_extractor != nullptr); + auto prefix = rep_->ioptions.prefix_extractor->Transform( ExtractUserKey(internal_key)); InternalKey 
internal_key_prefix(prefix, 0, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1072,7 +1077,7 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { filter_entry.Release(rep_->table_options.block_cache.get()); } - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -1111,7 +1116,7 @@ Status BlockBasedTable::Get( // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. - RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); break; } else { BlockIter biter; @@ -1205,13 +1210,13 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = rep_->options.env; + auto env = rep_->ioptions.env; auto comparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->options.prefix_extractor == nullptr) { - Log(rep_->options.info_log, + rep_->ioptions.prefix_extractor == nullptr) { + Log(rep_->ioptions.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1232,7 +1237,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->options.info_log, + Log(rep_->ioptions.info_log, "Unable to read the metaindex block." " Fall back to binary seach index."); return BinarySearchIndexReader::Create( @@ -1244,7 +1249,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); return HashIndexReader::Create( rep_->internal_prefix_transform.get(), footer, file, env, comparator, footer.index_handle(), meta_index_iter, index_reader, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3ff97dda6..b5686d265 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,6 +14,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -36,7 +37,6 @@ class TableReader; class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; -struct Options; struct ReadOptions; using std::unique_ptr; @@ -58,7 +58,8 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. 
-  static Status Open(const Options& db_options, const EnvOptions& env_options,
+  static Status Open(const ImmutableCFOptions& ioptions,
+                     const EnvOptions& env_options,
                      const BlockBasedTableOptions& table_options,
                      const InternalKeyComparator& internal_key_comparator,
                      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc
index e2cc6fd89..5727a91c0 100644
--- a/table/cuckoo_table_factory.cc
+++ b/table/cuckoo_table_factory.cc
@@ -11,11 +11,12 @@
 #include "table/cuckoo_table_reader.h"
 
 namespace rocksdb {
-Status CuckooTableFactory::NewTableReader(const Options& options,
-    const EnvOptions& soptions, const InternalKeyComparator& icomp,
+
+Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
+    const EnvOptions& env_options, const InternalKeyComparator& icomp,
     std::unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
     std::unique_ptr<TableReader>* table) const {
-  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(options,
+  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(ioptions,
       std::move(file), file_size, icomp.user_comparator(), nullptr));
   Status s = new_reader->status();
   if (s.ok()) {
@@ -25,10 +26,13 @@ Status CuckooTableFactory::NewTableReader(const Options& options,
 }
 
 TableBuilder* CuckooTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type) const {
-  return new CuckooTableBuilder(file, hash_table_ratio_, 64, max_search_depth_,
-      internal_comparator.user_comparator(), cuckoo_block_size_, nullptr);
+    const ImmutableCFOptions& ioptions,
+    const InternalKeyComparator& internal_comparator,
+    WritableFile* file, const CompressionType,
+    const CompressionOptions&) const {
+  return new CuckooTableBuilder(file, hash_table_ratio_, 64,
+      max_search_depth_, internal_comparator.user_comparator(),
+      cuckoo_block_size_, nullptr);
 }
 
 std::string CuckooTableFactory::GetPrintableTableOptions() const {
diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h
index 5799a7f23..2b575dc45 100644
--- a/table/cuckoo_table_factory.h
+++ b/table/cuckoo_table_factory.h
@@ -9,6 +9,7 @@
 #include <string>
 #include "rocksdb/table.h"
 #include "util/murmurhash.h"
+#include "rocksdb/options.h"
 
 namespace rocksdb {
 
@@ -45,14 +46,14 @@ class CuckooTableFactory : public TableFactory {
   const char* Name() const override { return "CuckooTable"; }
 
   Status NewTableReader(
-      const Options& options, const EnvOptions& soptions,
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
       const InternalKeyComparator& internal_comparator,
       unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
       unique_ptr<TableReader>* table) const override;
 
-  TableBuilder* NewTableBuilder(const Options& options,
+  TableBuilder* NewTableBuilder(const ImmutableCFOptions& options,
       const InternalKeyComparator& icomparator,
       WritableFile* file,
-      CompressionType compression_type) const override;
+      const CompressionType, const CompressionOptions&) const override;
 
   // Sanitizes the specified DB Options.
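
The read path changes the same way: TableFactory::NewTableReader now takes the ImmutableCFOptions/EnvOptions pair. A minimal sketch under the same assumptions as above (the helper name OpenTable and its error handling are illustrative, not part of the patch), mirroring the table_reader_bench.cc hunk later in this series:

    // Sketch only: OpenTable is a hypothetical helper, not RocksDB code.
    #include <memory>
    #include <string>
    #include "db/dbformat.h"
    #include "rocksdb/env.h"
    #include "rocksdb/immutable_options.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"
    #include "table/table_reader.h"

    rocksdb::Status OpenTable(const rocksdb::Options& options,
                              const rocksdb::EnvOptions& env_options,
                              const std::string& fname,
                              std::unique_ptr<rocksdb::TableReader>* reader) {
      const rocksdb::ImmutableCFOptions ioptions(options);
      rocksdb::InternalKeyComparator ikc(options.comparator);

      std::unique_ptr<rocksdb::RandomAccessFile> file;
      rocksdb::Status s =
          options.env->NewRandomAccessFile(fname, &file, env_options);
      if (!s.ok()) {
        return s;
      }
      uint64_t file_size = 0;
      options.env->GetFileSize(fname, &file_size);  // error check elided

      // New order of arguments: ImmutableCFOptions first, then EnvOptions.
      return options.table_factory->NewTableReader(
          ioptions, env_options, ikc, std::move(file), file_size, reader);
    }
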
Status SanitizeDBOptions(const DBOptions* db_opts) const override { diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f1dcbc3bb..1fdbc4475 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -29,7 +29,7 @@ namespace { extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, @@ -37,12 +37,12 @@ CuckooTableReader::CuckooTableReader( : file_(std::move(file)), ucomp_(comparator), get_slice_hash_(get_slice_hash) { - if (!options.allow_mmap_reads) { + if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!status_.ok()) { return; } diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 05d5c3397..61e048eb6 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -26,7 +27,7 @@ class TableReader; class CuckooTableReader: public TableReader { public: CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, @@ -40,7 +41,7 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, + const ReadOptions& read_options, const Slice& key, void* handle_context, bool (*result_handler)(void* arg, const ParsedInternalKey& k, const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 63fe0ae5b..53946e71b 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -121,8 +121,9 @@ class CuckooReaderTest { // Check reader now. 
std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, @@ -147,8 +148,9 @@ class CuckooReaderTest { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, @@ -325,8 +327,9 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, BytewiseComparator(), @@ -433,8 +436,9 @@ void WriteFile(const std::vector& keys, std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; @@ -460,8 +464,9 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, test::Uint64Comparator(), + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = diff --git a/table/filter_block.cc b/table/filter_block.cc index 6b4ff1c10..30284017b 100644 --- a/table/filter_block.cc +++ b/table/filter_block.cc @@ -21,11 +21,11 @@ namespace rocksdb { static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const Options& opt, +FilterBlockBuilder::FilterBlockBuilder(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Comparator* internal_comparator) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), + prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), comparator_(internal_comparator) {} @@ -126,10 +126,11 @@ void FilterBlockBuilder::GenerateFilter() { } FilterBlockReader::FilterBlockReader( - const Options& opt, const BlockBasedTableOptions& table_opt, + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, const Slice& contents, bool delete_contents_after_use) : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), + prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), data_(nullptr), offset_(nullptr), diff --git a/table/filter_block.h b/table/filter_block.h index 5041393f6..efee5ac71 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -18,7 +18,6 @@ #include #include #include -#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" @@ -36,7 +35,7 @@ class FilterPolicy; // (StartBlock AddKey*)* Finish class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt, + explicit FilterBlockBuilder(const SliceTransform* 
prefix_extractor, const BlockBasedTableOptions& table_opt, const Comparator* internal_comparator); @@ -71,7 +70,7 @@ class FilterBlockReader { public: // REQUIRES: "contents" and *policy must stay live while *this is live. FilterBlockReader( - const Options& opt, + const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Slice& contents, bool delete_contents_after_use = false); diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc index 95496a82c..903247e80 100644 --- a/table/filter_block_test.cc +++ b/table/filter_block_test.cc @@ -45,26 +45,26 @@ class TestHashFilter : public FilterPolicy { class FilterBlockTest { public: - Options options_; + const Comparator* comparator_; BlockBasedTableOptions table_options_; - FilterBlockTest() { - options_ = Options(); + FilterBlockTest() + : comparator_(BytewiseComparator()) { table_options_.filter_policy.reset(new TestHashFilter()); } }; TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); Slice block = builder.Finish(); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); } TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); builder.StartBlock(100); builder.AddKey("foo"); builder.AddKey("bar"); @@ -74,7 +74,7 @@ TEST(FilterBlockTest, SingleChunk) { builder.StartBlock(300); builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); ASSERT_TRUE(reader.KeyMayMatch(100, "box")); @@ -85,7 +85,7 @@ TEST(FilterBlockTest, SingleChunk) { } TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); + FilterBlockBuilder builder(nullptr, table_options_, comparator_); // First filter builder.StartBlock(0); @@ -105,7 +105,7 @@ TEST(FilterBlockTest, MultiChunk) { builder.AddKey("hello"); Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); + FilterBlockReader reader(nullptr, table_options_, block); // Check first filter ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 4f3b62ad4..49489ed64 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -58,24 +58,24 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const Options& options, WritableFile* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableCFOptions& ioptions, WritableFile* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) - : options_(options), + : ioptions_(ioptions), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), 
huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), + encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(options.prefix_extractor.get()) { + prefix_extractor_(ioptions.prefix_extractor) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); index_builder_.reset( - new PlainTableIndexBuilder(&arena_, options, index_sparseness, + new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, hash_table_ratio, huge_page_tlb_size_)); assert(bloom_bits_per_key_ > 0); properties_.user_collected_properties @@ -93,10 +93,10 @@ PlainTableBuilder::PlainTableBuilder( // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 0 : 1; - if (options_.prefix_extractor) { + if (ioptions_.prefix_extractor) { properties_.user_collected_properties [PlainTablePropertyNames::kPrefixExtractorName] = - options_.prefix_extractor->Name(); + ioptions_.prefix_extractor->Name(); } std::string val; @@ -105,7 +105,7 @@ PlainTableBuilder::PlainTableBuilder( [PlainTablePropertyNames::kEncodingType] = val; for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors_.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -124,11 +124,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (options_.prefix_extractor.get() == nullptr) { + if (ioptions_.prefix_extractor == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - options_.prefix_extractor->Transform(internal_key.user_key); + ioptions_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } @@ -160,7 +160,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_, - options_.info_log.get()); + ioptions_.info_log); } Status PlainTableBuilder::status() const { return status_; } @@ -183,7 +183,8 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { bloom_block_.SetTotalBits( &arena_, properties_.num_entries * bloom_bits_per_key_, - options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); + ioptions_.bloom_locality, huge_page_tlb_size_, + ioptions_.info_log); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -224,7 +225,7 @@ Status PlainTableBuilder::Finish() { // -- Add user collected properties NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - options_.info_log.get(), + ioptions_.info_log, &property_block_builder); // -- Write property block diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index 2871d887e..c3af08072 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -30,7 +30,7 @@ class PlainTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
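
Looking back at the filter_block.h hunks above, FilterBlockBuilder and FilterBlockReader now receive the prefix extractor directly rather than a whole Options object. A minimal sketch, assuming a Bloom filter policy and a fixed-prefix extractor (the function name, prefix length, and bits-per-key are illustrative; table/filter_block.h is an internal header):

    // Sketch only: FilterBlockRoundTrip is a hypothetical helper.
    #include <memory>
    #include "rocksdb/comparator.h"
    #include "rocksdb/filter_policy.h"
    #include "rocksdb/slice_transform.h"
    #include "rocksdb/table.h"
    #include "table/filter_block.h"

    void FilterBlockRoundTrip() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
      std::unique_ptr<const rocksdb::SliceTransform> prefix_extractor(
          rocksdb::NewFixedPrefixTransform(3));

      // The builder and reader now take the SliceTransform* directly; passing
      // nullptr, as filter_block_test.cc does above, disables prefix filtering.
      rocksdb::FilterBlockBuilder builder(prefix_extractor.get(), table_options,
                                          rocksdb::BytewiseComparator());
      builder.StartBlock(0);
      builder.AddKey("foobar");
      rocksdb::Slice contents = builder.Finish();

      rocksdb::FilterBlockReader reader(prefix_extractor.get(), table_options,
                                        contents);
      bool may_match = reader.KeyMayMatch(0, "foobar");
      (void)may_match;  // expected true; Bloom filters allow false positives
    }
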
- PlainTableBuilder(const Options& options, WritableFile* file, + PlainTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, @@ -71,7 +71,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - Options options_; + const ImmutableCFOptions& ioptions_; std::vector> table_properties_collectors_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 145179bae..de23cc902 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,22 +14,24 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, +Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(options, soptions, icomp, std::move(file), + return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, huge_page_tlb_size_, full_scan_mode_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + return new PlainTableBuilder(ioptions, file, user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index d1cf0cae6..54c628c15 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -14,7 +14,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -154,15 +153,17 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + Status NewTableReader( + const ImmutableCFOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType, + const CompressionOptions&) const override; std::string GetPrintableTableOptions() const override; diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index efba9b71d..61f9e335b 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -93,7 +93,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); 
keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + Log(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -147,11 +147,11 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(ioptions_.info_log, "Reserving %zu bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -191,7 +191,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + Log(ioptions_.info_log, "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index f63bbd0d5..0b26ecd0d 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -108,11 +108,11 @@ class PlainTableIndex { // #wiki-in-memory-index-format class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const Options& options, + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, uint32_t index_sparseness, double hash_table_ratio, double huge_page_tlb_size) : arena_(arena), - options_(options), + ioptions_(ioptions), record_list_(kRecordsPerGroup), is_first_record_(true), due_index_(false), @@ -120,7 +120,7 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} @@ -196,7 +196,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - Options options_; + const ImmutableCFOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index b5eccd310..3a6d48be8 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -87,7 +87,7 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const Options& options, +PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, @@ -99,10 +99,10 @@ PlainTableReader::PlainTableReader(const Options& options, full_scan_mode_(false), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), enable_bloom_(false), bloom_(6, nullptr), - options_(options), + ioptions_(ioptions), file_(std::move(file)), file_size_(file_size), table_properties_(nullptr) {} @@ -110,8 +110,8 @@ PlainTableReader::PlainTableReader(const Options& options, 
PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const Options& options, - const EnvOptions& soptions, +Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, @@ -119,14 +119,14 @@ Status PlainTableReader::Open(const Options& options, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { - assert(options.allow_mmap_reads); + assert(ioptions.allow_mmap_reads); if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!s.ok()) { return s; } @@ -137,12 +137,12 @@ Status PlainTableReader::Open(const Options& options, user_props.find(PlainTablePropertyNames::kPrefixExtractorName); if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { - if (!options.prefix_extractor) { + if (!ioptions.prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); } else if (prefix_extractor_in_file->second.compare( - options.prefix_extractor->Name()) != 0) { + ioptions.prefix_extractor->Name()) != 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -158,8 +158,8 @@ Status PlainTableReader::Open(const Options& options, } std::unique_ptr new_reader(new PlainTableReader( - options, std::move(file), soptions, internal_comparator, encoding_type, - file_size, props)); + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props)); s = new_reader->MmapDataFile(); if (!s.ok()) { @@ -207,7 +207,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -252,8 +252,8 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); FillBloom(prefix_hashes); } } @@ -281,14 +281,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, BloomBlockBuilder::kBloomBlock, + ioptions_.env, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); index_in_file &= s.ok(); @@ -310,8 +310,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if 
((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { - // options.prefix_extractor is requried for a hash-based look-up. + if ((ioptions_.prefix_extractor == nullptr) && + (hash_table_ratio != 0)) { + // ioptions.prefix_extractor is requried for a hash-based look-up. return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -328,8 +329,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, table_properties_->num_entries * bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } } else { @@ -351,7 +352,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_block->size() * 8, num_blocks); } - PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, + PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, hash_table_ratio, huge_page_tlb_size); std::vector prefix_hashes; @@ -422,7 +423,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + file_offset, file_data_.data() + data_end_offset_, &mid_key, nullptr, &tmp); @@ -451,7 +452,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, size_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + low_key_offset, file_data_.data() + data_end_offset_, &low_key, nullptr, &tmp); @@ -565,7 +566,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } Slice found_value; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (offset < data_end_offset_) { Status s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 4a626979a..fcc94a53e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -52,7 +52,8 @@ extern const uint32_t kPlainTableVariableLength; // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: - static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, @@ -82,8 +83,9 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const Options& options, unique_ptr&& file, - const EnvOptions& storage_options, + PlainTableReader(const ImmutableCFOptions& ioptions, + unique_ptr&& file, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, const TableProperties* table_properties); @@ -132,7 +134,7 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; Arena arena_; - const Options& options_; + const 
ImmutableCFOptions& ioptions_; unique_ptr file_; uint32_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index ed2c7c52d..584937587 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -88,10 +88,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, TableBuilder* tb = nullptr; DB* db = nullptr; Status s; + const ImmutableCFOptions ioptions(opts); if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), - CompressionType::kNoCompression); + tb = opts.table_factory->NewTableBuilder(ioptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -122,7 +124,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( - opts, env_options, ikc, std::move(raf), file_size, &table_reader); + ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); diff --git a/table/table_test.cc b/table/table_test.cc index 500abf48f..df4997588 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -194,6 +194,7 @@ class Constructor { // been added so far. Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, KVMap* kvmap) { @@ -206,12 +207,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, table_options, internal_comparator, *kvmap); + Status s = FinishImpl(options, ioptions, table_options, + internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) = 0; @@ -239,6 +242,7 @@ class BlockConstructor: public Constructor { delete block_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -322,14 +326,16 @@ class TableConstructor: public Constructor { ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - builder.reset(options.table_factory->NewTableBuilder( - options, internal_comparator, sink_.get(), options.compression)); + builder.reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, sink_.get(), options.compression, + CompressionOptions())); for (KVMap::const_iterator it = data.begin(); it != data.end(); @@ -352,9 +358,9 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, internal_comparator, std::move(source_), + ioptions.allow_mmap_reads)); 
+ return ioptions.table_factory->NewTableReader( + ioptions, soptions, internal_comparator, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -372,12 +378,12 @@ class TableConstructor: public Constructor { return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const Options& options) { + virtual Status Reopen(const ImmutableCFOptions& ioptions) { source_.reset( new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, *last_internal_key_, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, *last_internal_key_, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -421,6 +427,7 @@ class MemTableConstructor: public Constructor { delete memtable_->Unref(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -460,6 +467,7 @@ class DBConstructor: public Constructor { delete db_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { @@ -670,7 +678,7 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : constructor_(nullptr) { } + Harness() : ioptions_(options_), constructor_(nullptr) {} void Init(const TestArgs& args) { delete constructor_; @@ -756,6 +764,7 @@ class Harness { constructor_ = new DBConstructor(options_.comparator); break; } + ioptions_ = ImmutableCFOptions(options_); } ~Harness() { @@ -769,8 +778,8 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, table_options_, *internal_comparator_, - &keys, &data); + constructor_->Finish(options_, ioptions_, table_options_, + *internal_comparator_, &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -939,6 +948,7 @@ class Harness { private: Options options_ = Options(); + ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; bool support_prev_; @@ -1038,7 +1048,8 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1071,7 +1082,8 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); @@ -1122,7 +1134,8 @@ TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, 
GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto props = c.GetTableReader()->GetTableProperties(); ASSERT_EQ(7u, props->num_data_blocks); @@ -1206,7 +1219,8 @@ TEST(TableTest, HashIndexTest) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - c.Finish(options, table_options, *comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); @@ -1314,7 +1328,8 @@ TEST(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); @@ -1340,7 +1355,8 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); @@ -1416,7 +1432,8 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. @@ -1458,7 +1475,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto reader = dynamic_cast(c.GetTableReader()); @@ -1512,7 +1530,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.block_cache.reset(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); // reset the stats - c.Reopen(options); + const ImmutableCFOptions ioptions1(options); + c.Reopen(ioptions1); table_options.no_block_cache = false; { @@ -1529,7 +1548,8 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // too small to fit even one entry. 
table_options.block_cache = NewLRUCache(1); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - c.Reopen(options); + const ImmutableCFOptions ioptions2(options); + c.Reopen(ioptions2); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss @@ -1583,7 +1603,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - c.Finish(opt, table_options, *ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(opt); + c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1594,7 +1615,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { } ASSERT_OK(iter->status()); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions1(opt); + ASSERT_OK(c.Reopen(ioptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1603,7 +1625,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions2(opt); + ASSERT_OK(c.Reopen(ioptions2)); table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1619,9 +1642,11 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; Options options; + const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + factory.NewTableBuilder(ioptions, ikc, &sink, kNoCompression, + CompressionOptions())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1664,7 +1689,9 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, internal_comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, internal_comparator, + &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1694,7 +1721,8 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 9b130c7c6..6c496e8dd 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -68,6 +68,7 @@ class SstFileReader { // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; + const ImmutableCFOptions ioptions_; InternalKeyComparator internal_comparator_; unique_ptr table_properties_; }; @@ -76,7 +77,8 @@ SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) :file_name_(file_path), read_num_(0), 
verify_checksum_(verify_checksum), - output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { + output_hex_(output_hex), ioptions_(options_), + internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); init_result_ = NewTableReader(file_name_); @@ -123,7 +125,7 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { if (s.ok()) { s = options_.table_factory->NewTableReader( - options_, soptions_, internal_comparator_, std::move(file_), file_size, + ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, &table_reader_); } return s; diff --git a/util/options.cc b/util/options.cc index b16c6f2f5..fc9285442 100644 --- a/util/options.cc +++ b/util/options.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" #define __STDC_FORMAT_MACROS #include @@ -28,6 +29,26 @@ namespace rocksdb { +ImmutableCFOptions::ImmutableCFOptions(const Options& options) + : prefix_extractor(options.prefix_extractor.get()), + comparator(options.comparator), + merge_operator(options.merge_operator.get()), + info_log(options.info_log.get()), + statistics(options.statistics.get()), + env(options.env), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + db_paths(options.db_paths), + table_factory(options.table_factory.get()), + table_properties_collector_factories( + options.table_properties_collector_factories), + advise_random_on_open(options.advise_random_on_open), + bloom_locality(options.bloom_locality), + purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), + min_partial_merge_operands(options.min_partial_merge_operands), + disable_data_sync(options.disableDataSync), + use_fsync(options.use_fsync) {} + ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), From 45a5e3ede01b64d8f3a5c2fe411b728f1d7b9242 Mon Sep 17 00:00:00 2001 From: Stanislau Hlebik Date: Thu, 4 Sep 2014 17:40:41 -0700 Subject: [PATCH 022/829] Remove path with arena==nullptr from NewInternalIterator Summary: Simply code by removing code path which does not use Arena from NewInternalIterator Test Plan: make all check make valgrind_check Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22395 --- db/db_impl.cc | 119 +++++++++++++++----------------- db/db_impl.h | 8 +-- db/db_impl_debug.cc | 5 +- db/db_test.cc | 128 +++++++++++++++++++---------------- db/forward_iterator.cc | 10 +-- db/forward_iterator.h | 2 + db/memtable.cc | 10 +-- db/memtable.h | 3 +- db/memtable_list.cc | 5 +- db/memtable_list.h | 2 +- db/repair.cc | 17 +++-- db/version_set.cc | 25 ------- db/version_set.h | 2 - db/write_batch_test.cc | 5 +- java/rocksjni/write_batch.cc | 6 +- table/table_test.cc | 53 ++++++++++++--- util/ldb_cmd.cc | 4 +- util/scoped_arena_iterator.h | 28 ++++++++ 18 files changed, 240 insertions(+), 192 deletions(-) create mode 100644 util/scoped_arena_iterator.h diff --git a/db/db_impl.cc b/db/db_impl.cc index 049d40c7b..4e3816d64 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1415,31 +1415,32 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
ReadOptions ro; ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mem->GetFirstSequenceNumber(); - Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - + Arena arena; Status s; { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, *cfd->ioptions(), env_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), - cfd->options()->compression_opts, Env::IO_HIGH); - LogFlush(options_.info_log); - mutex_.Lock(); - } + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); - Log(options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - delete iter; + { + mutex_.Unlock(); + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); + LogFlush(options_.info_log); + mutex_.Lock(); + } + Log(options_.info_log, + "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + } pending_outputs_.erase(meta.fd.GetNumber()); // Note that if file_size is zero, the file has been deleted and @@ -1485,24 +1486,27 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, std::vector memtables; ReadOptions ro; ro.total_order_seek = true; + Arena arena; for (MemTable* m : mems) { Log(options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro)); + memtables.push_back(m->NewIterator(ro, &arena)); + } + { + ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), + &memtables[0], + memtables.size(), &arena)); + Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); + + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), + cfd->options()->compression_opts, Env::IO_HIGH); + LogFlush(options_.info_log); } - Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], memtables.size()); - Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - s = BuildTable(dbname_, env_, *cfd->ioptions(), env_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), - cfd->options()->compression_opts, Env::IO_HIGH); - LogFlush(options_.info_log); - delete iter; Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), @@ -3349,31 
+3353,18 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SuperVersion* super_version, Arena* arena) { Iterator* internal_iter; - if (arena != nullptr) { - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, arena)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, env_options_, - &merge_iter_builder); - internal_iter = merge_iter_builder.Finish(); - } else { - // Need to create internal iterator using malloc. - std::vector iterator_list; - // Collect iterator for mutable mem - iterator_list.push_back(super_version->mem->NewIterator(options)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &iterator_list); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, env_options_, - &iterator_list); - internal_iter = NewMergingIterator(&cfd->internal_comparator(), - &iterator_list[0], iterator_list.size()); - } + assert(arena != nullptr); + // Need to create internal iterator from the arena. + MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(options, arena)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(options, &merge_iter_builder); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(options, env_options_, + &merge_iter_builder); + internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -3790,10 +3781,12 @@ Status DBImpl::NewIterators( ? reinterpret_cast(options.snapshot)->number_ : latest_snapshot; - auto iter = NewInternalIterator(options, cfd, super_versions[i]); - iter = NewDBIterator(env_, *cfd->options(), - cfd->user_comparator(), iter, snapshot); - iterators->push_back(iter); + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->options(), cfd->user_comparator(), snapshot); + Iterator* internal_iter = NewInternalIterator( + options, cfd, super_versions[i], db_iter->GetArena()); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); } } diff --git a/db/db_impl.h b/db/db_impl.h index caacd012a..1ccaabb6c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -30,6 +30,7 @@ #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" namespace rocksdb { @@ -173,8 +174,8 @@ class DBImpl : public DB { // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = - nullptr); + Iterator* TEST_NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family = nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. 
@@ -297,8 +298,7 @@ class DBImpl : public DB { Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena = nullptr); + SuperVersion* super_version, Arena* arena); private: friend class DB; diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 8df66f6c6..77d4e0551 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -20,7 +20,8 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { +Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, + ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); @@ -33,7 +34,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version); + return NewInternalIterator(roptions, cfd, super_version, arena); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( diff --git a/db/db_test.cc b/db/db_test.cc index 0b0365211..5bd781696 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -41,6 +41,7 @@ #include "util/rate_limiter.h" #include "util/statistics.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" @@ -755,11 +756,12 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf == 0) { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -804,7 +806,6 @@ class DBTest { } result += "]"; } - delete iter; return result; } @@ -1042,11 +1043,12 @@ class DBTest { // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf != 0) { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } else { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -1060,7 +1062,6 @@ class DBTest { ASSERT_EQ(ikey.sequence, (unsigned)seq--); iter->Next(); } - delete iter; ASSERT_EQ(0, seq); } @@ -4210,22 +4211,25 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + Arena arena; + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence 
!= 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 1); - delete iter; // overwrite all the 100K keys once again. for (int i = 0; i < 100000; i++) { @@ -4280,7 +4284,7 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4296,18 +4300,20 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - ASSERT_NE(ikey.sequence, (unsigned)0); - count++; - iter->Next(); + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - ASSERT_EQ(count, 0); - delete iter; } // Tests the edge case where compaction does not produce any output -- all @@ -4429,22 +4435,24 @@ TEST(DBTest, CompactionFilterContextManual) { // Verify total number of keys is correct after manual compaction. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); } - ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); - delete iter; } class KeepFilterV2 : public CompactionFilterV2 { @@ -4601,25 +4609,27 @@ TEST(DBTest, CompactionFilterV2) { // All the files are in the lowest level. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); // 1 snapshot only. 
Since we are using universal compacton, // the sequence no is cleared for better compression ASSERT_EQ(count, 1); - delete iter; // create a new database with the compaction // filter in such a way that it deletes all keys @@ -4643,7 +4653,7 @@ TEST(DBTest, CompactionFilterV2) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); count = 0; while (iter->Valid()) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 74e6dd249..6b78c4037 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -132,9 +132,11 @@ ForwardIterator::~ForwardIterator() { } void ForwardIterator::Cleanup() { - delete mutable_iter_; + if (mutable_iter_ != nullptr) { + mutable_iter_->~Iterator(); + } for (auto* m : imm_iters_) { - delete m; + m->~Iterator(); } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -401,8 +403,8 @@ void ForwardIterator::RebuildIterators() { Cleanup(); // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); - mutable_iter_ = sv_->mem->NewIterator(read_options_); - sv_->imm->AddIterators(read_options_, &imm_iters_); + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); const auto& l0_files = sv_->current->files_[0]; l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index bbf423a50..653a0ac0c 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -14,6 +14,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "util/arena.h" namespace rocksdb { @@ -100,6 +101,7 @@ class ForwardIterator : public Iterator { IterKey prev_key_; bool is_prev_set_; + Arena arena_; }; } // namespace rocksdb diff --git a/db/memtable.cc b/db/memtable.cc index e9e7051c7..e102575a4 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -249,13 +249,9 @@ class MemTableIterator: public Iterator { }; Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) { - if (arena == nullptr) { - return new MemTableIterator(*this, options, nullptr); - } else { - auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) - MemTableIterator(*this, options, arena); - } + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, options, arena); } port::RWMutex* MemTable::GetLock(const Slice& key) { diff --git a/db/memtable.h b/db/memtable.h index 8bc281c6c..2723f30d8 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -81,8 +81,7 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& options, - Arena* arena = nullptr); + Iterator* NewIterator(const ReadOptions& options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. 
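The hunk above makes the Arena argument of MemTable::NewIterator mandatory: the iterator is placement-constructed inside the arena, so callers must run its destructor (typically through the ScopedArenaIterator wrapper added later in this patch) rather than delete the raw pointer. A minimal sketch of the intended call pattern, assuming a live MemTable* and only the interfaces shown in this patch (the helper name DumpMemTable is illustrative, not part of the change):

#include "db/memtable.h"
#include "rocksdb/options.h"
#include "util/arena.h"
#include "util/scoped_arena_iterator.h"

// Sketch only: shows the ownership rules for arena-allocated iterators.
// "mem" is assumed to be a live MemTable*; error handling is omitted.
void DumpMemTable(rocksdb::MemTable* mem) {
  rocksdb::ReadOptions ro;
  ro.total_order_seek = true;

  rocksdb::Arena arena;  // owns the memory backing the iterator
  // NewIterator placement-constructs a MemTableIterator inside 'arena',
  // so the returned pointer must never be passed to delete.
  rocksdb::ScopedArenaIterator iter(mem->NewIterator(ro, &arena));

  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // consume iter->key() / iter->value() here
  }
  // ~ScopedArenaIterator calls iter_->~Iterator(); the storage itself is
  // reclaimed when 'arena' goes out of scope.
}

This mirrors the updated call sites in repair.cc and write_batch_test.cc below, where a stack Arena outlives the ScopedArenaIterator that wraps the iterator it backs.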
diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d3fc1356b..418aae230 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -73,9 +73,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list) { + std::vector* iterator_list, + Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options)); + iterator_list->push_back(m->NewIterator(options, arena)); } } diff --git a/db/memtable_list.h b/db/memtable_list.h index f4923e831..997834e78 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -49,7 +49,7 @@ class MemTableListVersion { MergeContext& merge_context, const Options& options); void AddIterators(const ReadOptions& options, - std::vector* iterator_list); + std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); diff --git a/db/repair.cc b/db/repair.cc index 3c64449d1..dfe79fb23 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -48,6 +48,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -240,13 +241,15 @@ class Repairer { // since ExtractMetaData() will also generate edits. FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - ReadOptions ro; - ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, - iter, &meta, icmp_, 0, 0, kNoCompression, - CompressionOptions()); - delete iter; + { + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, + iter.get(), &meta, icmp_, 0, 0, kNoCompression, + CompressionOptions()); + } delete mem->Unref(); delete cf_mems_default; mem = nullptr; diff --git a/db/version_set.cc b/db/version_set.cc index 3a1545853..eca56ba2a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -596,31 +596,6 @@ uint64_t Version::GetEstimatedActiveKeys() { return num_non_deletions_ - num_deletions_; } -void Version::AddIterators(const ReadOptions& read_options, - const EnvOptions& soptions, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; - iters->push_back(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. 
- for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { - iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( - cfd_->table_cache(), read_options, soptions, - cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), - new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]))); - } - } -} - void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { diff --git a/db/version_set.h b/db/version_set.h index 2f6d477a1..e9747f839 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -86,8 +86,6 @@ class Version { // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - std::vector* iters); void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 1d30552b3..aefb01e79 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "util/logging.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -32,7 +33,8 @@ static std::string PrintContents(WriteBatch* b) { ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Iterator* iter = mem->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -67,7 +69,6 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index e8b2456ee..9a4eb70fd 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" +#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* @@ -209,7 +210,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); + Arena arena; + ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), false /*don't enforce total order*/, &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); @@ -244,7 +247,6 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append("@"); state.append(rocksdb::NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != rocksdb::WriteBatchInternal::Count(b)) { diff --git a/table/table_test.cc b/table/table_test.cc index df4997588..a0f844014 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -42,6 +42,7 @@ #include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" +#include 
"util/scoped_arena_iterator.h" namespace rocksdb { @@ -223,8 +224,12 @@ class Constructor { virtual const KVMap& data() { return data_; } + virtual bool IsArenaMode() const { return false; } + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + virtual bool AnywayDeleteIterator() const { return false; } + protected: const InternalKeyComparator* last_internal_key_; @@ -279,8 +284,15 @@ class BlockConstructor: public Constructor { // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } + KeyConvertingIterator(Iterator* iter, bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + virtual ~KeyConvertingIterator() { + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); @@ -311,6 +323,7 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; Iterator* iter_; + bool arena_mode_; // No copying allowed KeyConvertingIterator(const KeyConvertingIterator&); @@ -391,6 +404,10 @@ class TableConstructor: public Constructor { return table_reader_.get(); } + virtual bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + private: void Reset() { uniq_id_ = 0; @@ -398,12 +415,12 @@ class TableConstructor: public Constructor { sink_.reset(); source_.reset(); } - bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; + bool convert_to_internal_key_; TableConstructor(); @@ -446,10 +463,16 @@ class MemTableConstructor: public Constructor { return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); } + virtual bool AnywayDeleteIterator() const override { return true; } + + virtual bool IsArenaMode() const override { return true; } + private: + mutable Arena arena_; InternalKeyComparator internal_comparator_; MemTable* memtable_; std::shared_ptr table_factory_; @@ -800,7 +823,11 @@ class Harness { iter->Next(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestBackwardScan(const std::vector& keys, @@ -815,7 +842,11 @@ class Harness { iter->Prev(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestRandomAccess(Random* rnd, @@ -885,7 +916,11 @@ class Harness { } } } - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { @@ -1835,7 +1870,8 @@ TEST(MemTableTest, Simple) { ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Iterator* iter = memtable->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); iter->SeekToFirst(); while (iter->Valid()) { 
fprintf(stderr, "key: '%s' -> '%s'\n", @@ -1844,7 +1880,6 @@ TEST(MemTableTest, Simple) { iter->Next(); } - delete iter; delete memtable->Unref(); } diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 1aa3856a3..aef84fa35 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -14,6 +14,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" #include "util/coding.h" +#include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" #include @@ -739,7 +740,8 @@ void InternalDumpCommand::DoCommand() { uint64_t c=0; uint64_t s1=0,s2=0; // Setup internal key iterator - auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Arena arena; + ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); Status st = iter->status(); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" diff --git a/util/scoped_arena_iterator.h b/util/scoped_arena_iterator.h new file mode 100644 index 000000000..2021d2dc2 --- /dev/null +++ b/util/scoped_arena_iterator.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/iterator.h" + +namespace rocksdb { +class ScopedArenaIterator { + public: + explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} + + Iterator* operator->() { return iter_; } + + void set(Iterator* iter) { iter_ = iter; } + + Iterator* get() { return iter_; } + + ~ScopedArenaIterator() { iter_->~Iterator(); } + + private: + Iterator* iter_; +}; +} // namespace rocksdb From 4329d74e0581fa6ade91733643803f9ea3716743 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Thu, 4 Sep 2014 20:09:45 -0700 Subject: [PATCH 023/829] Fix swapped variable names to accurately reflect usage --- db/version_edit.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/version_edit.h b/db/version_edit.h index 58edfed45..db133402c 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -163,13 +163,13 @@ class VersionEdit { // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_size, - uint64_t file_path_id, const InternalKey& smallest, + void AddFile(int level, uint64_t file, uint64_t file_path_id, + uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_size, file_path_id); + f.fd = FileDescriptor(file, file_path_id, file_size); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; From 0cd0ec4fe0d3bca14431766674a3382b83993bd9 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Thu, 4 Sep 2014 20:52:00 -0700 Subject: [PATCH 024/829] Plug memory leak during index creation --- utilities/document/document_db.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index c12a1f253..8e15a52ca 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -736,6 +736,7 @@ class DocumentDBImpl : public DocumentDB { CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_), InternalSecondaryIndexName(index.name), &cf_handle); if (!s.ok()) { + delete index_obj; return s; } From bb6ae0f80c9546e5eafec0f3aac2695aa8ef56df Mon Sep 17 00:00:00 2001 From: liuhuahang Date: Fri, 5 Sep 2014 14:14:37 +0800 Subject: [PATCH 025/829] fix more compile warnings N/A Change-Id: I5b6f9c70aea7d3f3489328834fed323d41106d9f Signed-off-by: liuhuahang --- db/compaction.cc | 3 +++ db/compaction_picker.cc | 3 +++ db/db_bench.cc | 2 ++ db/db_filesnapshot.cc | 3 +++ db/db_impl.cc | 3 +++ db/filename.cc | 3 +++ db/internal_stats.cc | 4 ++++ db/repair.cc | 3 +++ db/version_set.cc | 3 +++ include/rocksdb/utilities/backupable_db.h | 3 +++ table/cuckoo_table_reader_test.cc | 3 +++ util/db_info_dummper.cc | 3 +++ util/dynamic_bloom_test.cc | 3 +++ util/logging.cc | 3 +++ util/options.cc | 3 +++ util/options_test.cc | 3 +++ util/rate_limiter_test.cc | 3 +++ util/statistics.cc | 3 +++ utilities/backupable/backupable_db.cc | 2 ++ utilities/document/json_document.cc | 3 +++ utilities/geodb/geodb_impl.cc | 2 ++ 21 files changed, 61 insertions(+) diff --git a/db/compaction.cc b/db/compaction.cc index 0bffa0162..cf0b682aa 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -9,7 +9,10 @@ #include "db/compaction.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index e05d07776..6e9a46ed4 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -9,7 +9,10 @@ #include "db/compaction_picker.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/filename.h" diff --git a/db/db_bench.cc b/db/db_bench.cc index 2f88e81ff..bd4389b49 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -7,7 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #ifndef GFLAGS #include diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 4185a40ca..aa1408f38 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -9,7 +9,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/db/db_impl.cc b/db/db_impl.cc index 4e3816d64..f18304407 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,7 +9,10 @@ #include "db/db_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/db/filename.cc b/db/filename.cc index 42c7efb78..a8f685296 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -6,7 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include "db/filename.h" #include diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 3142d13b3..c9f9306e2 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,7 +7,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/internal_stats.h" + +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "db/column_family.h" diff --git a/db/repair.cc b/db/repair.cc index dfe79fb23..ea6cdd642 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -31,7 +31,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "db/builder.h" #include "db/db_impl.h" diff --git a/db/version_set.cc b/db/version_set.cc index eca56ba2a..82183a982 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,7 +9,10 @@ #include "db/version_set.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 78365769d..bf3f919ae 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,7 +10,10 @@ #pragma once #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 53946e71b..3138fb9ef 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/db_info_dummper.cc b/util/db_info_dummper.cc index d5dd97ad2..2e0d34481 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dummper.cc @@ -6,7 +6,10 @@ // Must not be included from any .h files to avoid polluting the namespace // with macros. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 3e55488f2..6d228e81d 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/logging.cc b/util/logging.cc index 1b5549d73..4dfb9a449 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -9,7 +9,10 @@ #include "util/logging.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/util/options.cc b/util/options.cc index fc9285442..371ecda78 100644 --- a/util/options.cc +++ b/util/options.cc @@ -10,7 +10,10 @@ #include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include diff --git a/util/options_test.cc b/util/options_test.cc index be07a83f5..afe3795f9 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 1b72e4ed0..9d6cfb7e6 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "util/testharness.h" diff --git a/util/statistics.cc b/util/statistics.cc index 24957c9b6..9d828a6fe 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -5,7 +5,10 @@ // #include "util/statistics.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "rocksdb/statistics.h" #include "port/likely.h" diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 436f4c2d6..4d1a9b76b 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -15,7 +15,9 @@ #include "util/crc32c.h" #include "rocksdb/transaction_log.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 641f4ee09..4368b759d 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -6,7 +6,10 @@ #include "rocksdb/utilities/json_document.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index f63c91c3e..6c13fd691 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -7,7 +7,9 @@ #include "utilities/geodb/geodb_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include From adcd2532ca18d642db5d5b8ea6df219aad1113b5 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 5 Sep 2014 09:53:04 -0700 Subject: [PATCH 026/829] fix asan check Summary: PlainTable takes reference instead of a copy. 
Keep a copy in the test code Test Plan: make asan_check Reviewers: sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22899 --- db/table_properties_collector_test.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 8168ca5d6..74abf8670 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" +#include "rocksdb/immutable_options.h" #include "table/block_based_table_factory.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" @@ -85,12 +86,13 @@ class DumbLogger : public Logger { // Utilities test functions namespace { void MakeBuilder(const Options& options, + const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset(options.table_factory->NewTableBuilder( - ImmutableCFOptions(options), internal_comparator, writable->get(), + builder->reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, writable->get(), options.compression, options.compression_opts)); } } // namespace @@ -154,7 +156,8 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - MakeBuilder(options, internal_comparator, &writable, &builder); + const ImmutableCFOptions ioptions(options); + MakeBuilder(options, ioptions, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -265,9 +268,10 @@ void TestInternalKeyPropertiesCollector( options.table_properties_collector_factories = { std::make_shared()}; } + const ImmutableCFOptions ioptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, pikc, &writable, &builder); + MakeBuilder(options, ioptions, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } From 0fbb3facc020a38520710b409d628384f8f29f0d Mon Sep 17 00:00:00 2001 From: Raghav Pisolkar Date: Fri, 5 Sep 2014 00:47:54 -0700 Subject: [PATCH 027/829] fixed memory leak in unit test DBIteratorBoundTest Summary: fixed memory leak in unit test DBIteratorBoundTest Test Plan: ran valgrind test on my unit test Reviewers: sdong Differential Revision: https://reviews.facebook.net/D22911 --- db/db_test.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 5bd781696..570af31a5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -7791,7 +7791,8 @@ TEST(DBTest, DBIteratorBoundTest) { { ReadOptions ro; // iterate_upper_bound points beyond the last expected entry - ro.iterate_upper_bound = new Slice("foo2"); + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; std::unique_ptr iter(db_->NewIterator(ro)); @@ -7823,7 +7824,8 @@ TEST(DBTest, DBIteratorBoundTest) { // This should be an error { ReadOptions ro; - ro.iterate_upper_bound = new Slice("g1"); + Slice prefix("g1"); + ro.iterate_upper_bound = &prefix; std::unique_ptr iter(db_->NewIterator(ro)); @@ -7868,7 +7870,8 @@ TEST(DBTest, DBIteratorBoundTest) { ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); // now testing with iterate_bound - ro.iterate_upper_bound = new Slice("c"); + Slice prefix("c"); + ro.iterate_upper_bound = &prefix; 
iter.reset(db_->NewIterator(ro)); From 5cd0576ffeacba2071771282a262d81fdb1e2232 Mon Sep 17 00:00:00 2001 From: Radheshyam Balasundaram Date: Fri, 5 Sep 2014 11:18:01 -0700 Subject: [PATCH 028/829] Fix compaction bug in Cuckoo Table Builder. Use kvs_.size() instead of num_entries in FileSize() method. Summary: Fix compaction bug in Cuckoo Table Builder. Use kvs_.size() instead of num_entries in FileSize() method. Also added tests. Test Plan: make check all Also ran db_bench to generate multiple files. Reviewers: sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22743 --- db/cuckoo_table_db_test.cc | 26 ++++++++++++++++++- table/cuckoo_table_builder.cc | 5 ++-- table/cuckoo_table_builder_test.cc | 41 ++++++++++++++++++++++++------ 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index c1e59b1b5..2652d1776 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -245,14 +245,38 @@ TEST(CuckooTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); } dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("2", FilesPerLevel()); + dbfull()->TEST_CompactRange(0, nullptr, nullptr); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 22; ++idx) { ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); } } +TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { + // Create a big L0 file and check it compacts into multiple files in L1. + Options options = CurrentOptions(); + options.write_buffer_size = 270 << 10; + // Two SST files should be created, each containing 14 keys. + // Number of buckets will be 16. Total size ~156 KB. + options.target_file_size_base = 160 << 10; + Reopen(&options); + + // Write 28 values, each 10016 B ~ 10KB + for (int idx = 0; idx < 28; ++idx) { + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1", FilesPerLevel()); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_EQ("0,2", FilesPerLevel()); + for (int idx = 0; idx < 28; ++idx) { + ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); + } +} + TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { // Insert same key twice so that they go to different SST files. Then wait for // compaction and check if the latest value is stored and old value removed. diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 6326d3787..e107071f2 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -56,7 +56,6 @@ CuckooTableBuilder::CuckooTableBuilder( ucomp_(user_comparator), get_slice_hash_(get_slice_hash), closed_(false) { - properties_.num_entries = 0; // Data is in a huge block. 
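  // Editor's note, not part of the original patch: the explicit zeroing of
  // properties_.num_entries is dropped in this hunk; the remaining hunks of
  // the same patch switch Add() and FileSize() to kvs_.size(), so the
  // in-progress entry count is no longer read from properties_ while the
  // table is still being built.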
properties_.num_data_blocks = 1; properties_.index_size = 0; @@ -64,7 +63,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (properties_.num_entries >= kMaxVectorIdx - 1) { + if (kvs_.size() >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -311,7 +310,7 @@ uint64_t CuckooTableBuilder::NumEntries() const { uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (properties_.num_entries == 0) { + } else if (kvs_.size() == 0) { return 0; } diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 69647d410..be13dc9a3 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -135,6 +135,7 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, GetSliceHash); ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); CheckFileContents({}, {}, {}, "", 0, 2, false); @@ -155,6 +156,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; @@ -167,10 +169,12 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -192,6 +196,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; @@ -204,10 +209,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -229,6 +236,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; @@ -242,10 +250,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { 
ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -272,6 +282,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; @@ -284,10 +295,12 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -311,6 +324,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; @@ -323,10 +337,12 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -344,6 +360,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; @@ -356,10 +373,12 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, 
expected_locations, @@ -377,6 +396,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; @@ -389,10 +409,12 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -412,6 +434,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; + uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; @@ -424,10 +447,12 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + uint32_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, From c9e419ccb6f29a21f86354afda0256a78579f201 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 5 Sep 2014 11:48:17 -0700 Subject: [PATCH 029/829] rename options_ to db_options_ in DBImpl to avoid confusion Summary: as title Test Plan: make release Reviewers: sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22935 --- db/db_filesnapshot.cc | 20 +-- db/db_impl.cc | 341 +++++++++++++++++++++-------------------- db/db_impl.h | 4 +- db/db_impl_readonly.cc | 2 +- 4 files changed, 188 insertions(+), 179 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index aa1408f38..9f05b8d30 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -32,9 +32,9 @@ Status DBImpl::DisableFileDeletions() { MutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(options_.info_log, "File Deletions Disabled"); + Log(db_options_.info_log, "File Deletions Disabled"); } else { - Log(options_.info_log, + Log(db_options_.info_log, "File Deletions Disabled, but already disabled. Counter: %d", disable_delete_obsolete_files_); } @@ -53,11 +53,11 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(options_.info_log, "File Deletions Enabled"); + Log(db_options_.info_log, "File Deletions Enabled"); should_purge_files = true; FindObsoleteFiles(deletion_state, true); } else { - Log(options_.info_log, + Log(db_options_.info_log, "File Deletions Enable, but not really enabled. 
Counter: %d", disable_delete_obsolete_files_); } @@ -65,7 +65,7 @@ Status DBImpl::EnableFileDeletions(bool force) { if (should_purge_files) { PurgeObsoleteFiles(deletion_state); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return Status::OK(); } @@ -98,7 +98,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(options_.info_log, "Cannot Flush data %s\n", + Log(db_options_.info_log, "Cannot Flush data %s\n", status.ToString().c_str()); return status; } @@ -136,7 +136,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { Status s; // list wal files in main db dir. VectorLogPtr logs; - s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); + s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); if (!s.ok()) { return s; } @@ -149,7 +149,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { files.clear(); // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(options_.wal_dir); + std::string archivedir = ArchivalDirectory(db_options_.wal_dir); if (env_->FileExists(archivedir)) { s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); if (!s.ok()) { @@ -160,7 +160,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(options_.info_log, "Latest Archived log: %" PRIu64, + Log(db_options_.info_log, "Latest Archived log: %" PRIu64, latest_archived_log_number); } @@ -173,7 +173,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. - Log(options_.info_log, "%s already moved to archive", + Log(db_options_.info_log, "%s already moved to archive", log->PathName().c_str()); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index f18304407..1769471cf 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -335,8 +335,8 @@ CompressionType GetCompressionFlush(const Options& options) { DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - options_(SanitizeOptions(dbname, options)), - stats_(options_.statistics.get()), + db_options_(SanitizeOptions(dbname, options)), + stats_(db_options_.statistics.get()), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), shutting_down_(nullptr), @@ -367,23 +367,23 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. - const int table_cache_size = - (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + const int table_cache_size = (db_options_.max_open_files == -1) ? + 4194304 : db_options_.max_open_files - 10; // Reserve ten files or so for other uses and give the rest to TableCache. 
table_cache_ = - NewLRUCache(table_cache_size, options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit); + NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, + db_options_.table_cache_remove_scan_count_limit); versions_.reset( - new VersionSet(dbname_, &options_, env_options_, table_cache_.get())); + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get())); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - DumpLeveldbBuildVersion(options_.info_log.get()); - DumpDBFileSummary(options_, dbname_); - options_.Dump(options_.info_log.get()); + DumpLeveldbBuildVersion(db_options_.info_log.get()); + DumpDBFileSummary(db_options_, dbname_); + db_options_.Dump(db_options_.info_log.get()); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } DBImpl::~DBImpl() { @@ -414,7 +414,7 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - if (options_.allow_thread_local) { + if (db_options_.allow_thread_local) { // Clean up obsolete files due to SuperVersion release. // (1) Need to delete to obsolete files before closing because RepairDB() // scans all existing files in the file system and builds manifest file. @@ -443,7 +443,7 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } Status DBImpl::NewDB() { @@ -452,7 +452,7 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(options_.info_log, "Creating manifest 1 \n"); + Log(db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( @@ -460,7 +460,7 @@ Status DBImpl::NewDB() { if (!s.ok()) { return s; } - file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -477,38 +477,38 @@ Status DBImpl::NewDB() { } void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { + if (s->ok() || db_options_.paranoid_checks) { // No change needed } else { - Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } const Status DBImpl::CreateArchivalDirectory() { - if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { - std::string archivalPath = ArchivalDirectory(options_.wal_dir); + if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = options_.statistics.get(); + auto dbstats = db_options_.statistics.get(); if (dbstats) { - Log(options_.info_log, + Log(db_options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } } void DBImpl::MaybeDumpStats() { - if (options_.stats_dump_period_sec == 0) return; + if (db_options_.stats_dump_period_sec == 0) return; const uint64_t now_micros = env_->NowMicros(); if (last_stats_dump_time_microsec_ + - options_.stats_dump_period_sec * 1000000 + db_options_.stats_dump_period_sec * 1000000 <= now_micros) { // Multiple threads could race in here simultaneously. 
// However, the last one will update last_stats_dump_time_microsec_ @@ -532,8 +532,8 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(options_.info_log, "------- DUMPING STATS -------"); - Log(options_.info_log, "%s", stats.c_str()); + Log(db_options_.info_log, "------- DUMPING STATS -------"); + Log(db_options_.info_log, "%s", stats.c_str()); PrintStatistics(); } @@ -543,7 +543,7 @@ void DBImpl::MaybeDumpStats() { // of all files in the filesystem in 'candidate_files'. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every -// options_.delete_obsolete_files_period_micros +// db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -560,12 +560,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // logic for figurint out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; - } else if (force || options_.delete_obsolete_files_period_micros == 0) { + } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); if (delete_obsolete_files_last_run_ + - options_.delete_obsolete_files_period_micros < now_micros) { + db_options_.delete_obsolete_files_period_micros < now_micros) { doing_the_full_scan = true; delete_obsolete_files_last_run_ = now_micros; } @@ -597,11 +597,12 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, versions_->AddLiveFiles(&deletion_state.sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { + for (uint32_t path_id = 0; + path_id < db_options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. 
std::vector files; - env_->GetChildren(options_.db_paths[path_id].path, + env_->GetChildren(db_options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { deletion_state.candidate_files.emplace_back(file, path_id); @@ -609,17 +610,18 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, } //Add log files in wal_dir - if (options_.wal_dir != dbname_) { + if (db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir - if (!options_.db_log_dir.empty() && options_.db_log_dir != dbname_) { + if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { std::vector info_log_files; - env_->GetChildren(options_.db_log_dir, &info_log_files); // Ignore errors + // Ignore errors + env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { deletion_state.candidate_files.emplace_back(log_file, 0); } @@ -690,7 +692,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { candidate_files.end()); std::vector old_info_log_files; - InfoLogPrefix info_log_prefix(!options_.db_log_dir.empty(), dbname_); + InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); for (const auto& candidate_file : candidate_files) { std::string to_delete = candidate_file.file_name; uint32_t path_id = candidate_file.path_id; @@ -746,51 +748,51 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(options_.db_paths, number, path_id); + fname = TableFileName(db_options_.db_paths, number, path_id); } else { - fname = - ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete; + fname = ((type == kLogFile) ? + db_options_.wal_dir : dbname_) + "/" + to_delete; } if (type == kLogFile && - (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); + (db_options_.WAL_ttl_seconds > 0 || + db_options_.WAL_size_limit_MB > 0)) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); Status s = env_->RenameFile(fname, archived_log_name); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(options_.info_log, + Log(db_options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); } else { Status s = env_->DeleteFile(fname); - Log(options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } } // Delete old info log files. 
size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= options_.keep_log_file_num) { + if (old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); - size_t end = old_info_log_file_count - options_.keep_log_file_num; + size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { std::string& to_delete = old_info_log_files.at(i); - std::string full_path_to_delete = - (options_.db_log_dir.empty() ? dbname_ : options_.db_log_dir) + "/" + - to_delete; - Log(options_.info_log, "Delete info log file %s\n", + std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? + dbname_ : db_options_.db_log_dir) + "/" + to_delete; + Log(db_options_.info_log, "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(db_options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } } PurgeObsoleteWALFiles(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } void DBImpl::DeleteObsoleteFiles() { @@ -812,8 +814,8 @@ void DBImpl::DeleteObsoleteFiles() { // b. get sorted non-empty archived logs // c. delete what should be deleted void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } @@ -821,13 +823,14 @@ void DBImpl::PurgeObsoleteWALFiles() { int64_t current_time; Status s = env_->GetCurrentTime(¤t_time); if (!s.ok()) { - Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); assert(false); return; } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? 
- options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; + db_options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { return; @@ -835,11 +838,12 @@ void DBImpl::PurgeObsoleteWALFiles() { purge_wal_files_last_run_ = now_seconds; - std::string archival_dir = ArchivalDirectory(options_.wal_dir); + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { - Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); + Log(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); assert(false); return; } @@ -857,14 +861,14 @@ void DBImpl::PurgeObsoleteWALFiles() { Status const s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { - Log(options_.info_log, "Can't get file mod time: %s: %s", + Log(db_options_.info_log, "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { Status const s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -879,7 +883,7 @@ void DBImpl::PurgeObsoleteWALFiles() { uint64_t file_size; Status const s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { - Log(options_.info_log, "Can't get file size: %s: %s", + Log(db_options_.info_log, "Can't get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); return; } else { @@ -889,7 +893,7 @@ void DBImpl::PurgeObsoleteWALFiles() { } else { Status s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -906,7 +910,7 @@ void DBImpl::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = options_.WAL_size_limit_MB * + size_t const files_keep_num = db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; if (log_files_num <= files_keep_num) { return; @@ -917,7 +921,7 @@ void DBImpl::PurgeObsoleteWALFiles() { GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { - Log(options_.info_log, "Trying to delete more archived log files than " + Log(db_options_.info_log, "Trying to delete more archived log files than " "exist. 
Deleting all"); files_del_num = archived_logs.size(); } @@ -926,7 +930,7 @@ void DBImpl::PurgeObsoleteWALFiles() { std::string const file_path = archived_logs[i]->PathName(); Status const s = DeleteFile(file_path); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -1034,7 +1038,7 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, } Status s; if (type == kAliveLogFile) { - std::string fname = LogFileName(options_.wal_dir, number); + std::string fname = LogFileName(db_options_.wal_dir, number); s = ReadFirstLine(fname, sequence); if (env_->FileExists(fname) && !s.ok()) { // return any error that is not caused by non-existing file @@ -1044,7 +1048,8 @@ Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, if (type == kArchivedLogFile || !s.ok()) { // check if the file got moved to archive. - std::string archived_file = ArchivedLogFileName(options_.wal_dir, number); + std::string archived_file = + ArchivedLogFileName(db_options_.wal_dir, number); s = ReadFirstLine(archived_file, sequence); } @@ -1065,7 +1070,7 @@ Status DBImpl::ReadFirstLine(const std::string& fname, const char* fname; Status* status; - bool ignore_error; // true if options_.paranoid_checks==false + bool ignore_error; // true if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->ignore_error ? "(ignoring error) " : ""), fname, @@ -1086,17 +1091,17 @@ Status DBImpl::ReadFirstLine(const std::string& fname, LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log.get(); + reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); reporter.status = &status; - reporter.ignore_error = !options_.paranoid_checks; + reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(std::move(file), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !options_.paranoid_checks)) { + (status.ok() || !db_options_.paranoid_checks)) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); @@ -1137,7 +1142,7 @@ Status DBImpl::Recover( return s; } - for (auto& db_path : options_.db_paths) { + for (auto& db_path : db_options_.db_paths) { s = env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { return s; @@ -1155,7 +1160,7 @@ Status DBImpl::Recover( } if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { + if (db_options_.create_if_missing) { s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -1166,7 +1171,7 @@ Status DBImpl::Recover( dbname_, "does not exist (create_if_missing is false)"); } } else { - if (options_.error_if_exists) { + if (db_options_.error_if_exists) { return Status::InvalidArgument( dbname_, "exists (error_if_exists is true)"); } @@ -1181,7 +1186,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); - if (options_.paranoid_checks && s.ok()) { + if (db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok()) { @@ -1202,7 +1207,7 @@ Status DBImpl::Recover( const uint64_t min_log = versions_->MinLogNumber(); const uint64_t prev_log = versions_->PrevLogNumber(); std::vector filenames; - s = env_->GetChildren(options_.wal_dir, &filenames); + s = 
env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { return s; } @@ -1255,8 +1260,8 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, Env* env; Logger* info_log; const char* fname; - Status* status; // nullptr if options_.paranoid_checks==false or - // options_.skip_log_error_on_recovery==true + Status* status; // nullptr if db_options_.paranoid_checks==false or + // db_options_.skip_log_error_on_recovery==true virtual void Corruption(size_t bytes, const Status& s) { Log(info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), @@ -1276,7 +1281,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, } // Open the log file - std::string fname = LogFileName(options_.wal_dir, log_number); + std::string fname = LogFileName(db_options_.wal_dir, log_number); unique_ptr file; Status status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { @@ -1287,17 +1292,18 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // Create the log reader. LogReporter reporter; reporter.env = env_; - reporter.info_log = options_.info_log.get(); + reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks && - !options_.skip_log_error_on_recovery ? &status : nullptr); + reporter.status = (db_options_.paranoid_checks && + !db_options_.skip_log_error_on_recovery ? &status + : nullptr); // We intentially make log::Reader do checksumming even if // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(std::move(file), &reporter, true/*checksum*/, 0/*initial_offset*/); - Log(options_.info_log, "Recovering log #%" PRIu64 "", log_number); + Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); // Read all the records and add to a memtable std::string scratch; @@ -1425,7 +1431,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mem->GetFirstSequenceNumber(); - Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", cfd->GetName().c_str(), meta.fd.GetNumber()); { @@ -1435,11 +1441,11 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), cfd->options()->compression_opts, Env::IO_HIGH); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); mutex_.Lock(); } - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); @@ -1491,7 +1497,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, ro.total_order_seek = true; Arena arena; for (MemTable* m : mems) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd->GetName().c_str(), m->GetNextLogNumber()); memtables.push_back(m->NewIterator(ro, &arena)); @@ -1500,7 +1506,8 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), 
&memtables[0], memtables.size(), &arena)); - Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", + Log(db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": started", cfd->GetName().c_str(), meta.fd.GetNumber()); s = BuildTable( @@ -1508,14 +1515,14 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), cfd->options()->compression_opts, Env::IO_HIGH); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } mutex_.Lock(); @@ -1544,7 +1551,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // insert files directly into higher levels because some other // threads could be concurrently producing compacted files for // that key range. - if (base != nullptr && options_.max_background_compactions <= 1 && + if (base != nullptr && db_options_.max_background_compactions <= 1 && cfd->options()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } @@ -1606,7 +1613,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } else { // Replace immutable memtable with the generated Table s = cfd->imm()->InstallMemtableFlushResults( - cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), + cfd, mems, versions_.get(), &mutex_, db_options_.info_log.get(), file_number, &pending_outputs_, &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); } @@ -1632,7 +1639,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } } - if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && + if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks && bg_error_.ok()) { // if a bad error happened (not ShutdownInProgress) and paranoid_checks is // true, mark DB read-only @@ -1646,7 +1653,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, uint32_t target_path_id) { - if (target_path_id >= options_.db_paths.size()) { + if (target_path_id >= db_options_.db_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } @@ -1655,7 +1662,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -1683,7 +1690,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, end); } if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } } @@ -1691,7 +1698,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, if (reduce_level) { s = ReFitLevel(cfd, max_level_with_files, target_level); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); { MutexLock l(&mutex_); @@ -1733,7 +1740,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(options_.info_log, "ReFitLevel: another thread is refitting"); + Log(db_options_.info_log, "ReFitLevel: another thread is refitting"); delete new_superversion; 
return Status::NotSupported("another thread is refitting"); } @@ -1742,7 +1749,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(options_.info_log, + Log(db_options_.info_log, "RefitLevel: waiting for background threads to stop: %d %d", bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); @@ -1758,8 +1765,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status; if (to_level < level) { - Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); + Log(db_options_.info_log, "[%s] Before refitting:\n%s", + cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); @@ -1769,18 +1776,18 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(options_.info_log, "[%s] Apply version edit:\n%s", + Log(db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; - Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(options_.info_log, "[%s] After refitting:\n%s", + Log(db_options_.info_log, "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1870,14 +1877,14 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(options_.info_log, + Log(db_options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(options_.info_log, "[%s] Manual compaction starting", + Log(db_options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { @@ -1965,10 +1972,10 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } if (is_flush_pending) { // memtable flush needed - if (bg_flush_scheduled_ < options_.max_background_flushes) { + if (bg_flush_scheduled_ < db_options_.max_background_flushes) { bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (options_.max_background_flushes > 0) { + } else if (db_options_.max_background_flushes > 0) { bg_schedule_needed_ = true; } } @@ -1987,8 +1994,8 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // bg_manual_only_ == 0 if (!bg_manual_only_ && (is_compaction_needed || - (is_flush_pending && options_.max_background_flushes == 0))) { - if (bg_compaction_scheduled_ < options_.max_background_compactions) { + (is_flush_pending && db_options_.max_background_flushes == 0))) { + if (bg_compaction_scheduled_ < db_options_.max_background_compactions) { bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } else { @@ -2038,7 +2045,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, "BackgroundCallFlush doing FlushMemTableToOutputFile with column " "family 
[%s], flush slots available %d", cfd->GetName().c_str(), - options_.max_background_flushes - bg_flush_scheduled_); + db_options_.max_background_flushes - bg_flush_scheduled_); flush_status = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); } @@ -2056,7 +2063,7 @@ void DBImpl::BackgroundCallFlush() { DeletionState deletion_state(true); assert(bg_flush_scheduled_); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); @@ -2072,12 +2079,12 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(options_.info_log, + Log(db_options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2123,7 +2130,7 @@ void DBImpl::BackgroundCallCompaction() { DeletionState deletion_state(true); MaybeDumpStats(); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); @@ -2140,11 +2147,11 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(options_.info_log, + Log(db_options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2228,7 +2235,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, log_buffer, "BackgroundCompaction doing FlushMemTableToOutputFile, " "compaction slots available %d", - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, log_buffer); @@ -2340,9 +2347,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { + if (db_options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; } } @@ -2454,7 +2461,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->outputs.push_back(out); // Make the output file - std::string fname = TableFileName(options_.db_paths, file_number, + std::string fname = TableFileName(db_options_.db_paths, file_number, compact->compaction->GetOutputPathId()); Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); @@ -2469,7 +2476,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->compaction->OutputCompressionType(), cfd->options()->compression_opts)); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } @@ -2497,8 +2504,8 @@ Status 
DBImpl::FinishCompactionOutputFile(CompactionState* compact, compact->builder.reset(); // Finish and check for file errors - if (s.ok() && !options_.disableDataSync) { - if (options_.use_fsync) { + if (s.ok() && !db_options_.disableDataSync) { + if (db_options_.use_fsync) { StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); s = compact->outfile->Fsync(); } else { @@ -2520,7 +2527,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, s = iter->status(); delete iter; if (s.ok()) { - Log(options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 + Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), output_number, current_entries, current_bytes); @@ -2539,7 +2546,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, // This ensures that a concurrent compaction did not erroneously // pick the same files to compact. if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", compact->compaction->column_family_data()->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), @@ -2588,7 +2595,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( prev = cur; // assignment assert(prev); } - Log(options_.info_log, + Log(db_options_.info_log, "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, snapshots[snapshots.size() - 1]); assert(0); @@ -2598,7 +2605,7 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, DeletionState& deletion_state, LogBuffer* log_buffer) { - if (options_.max_background_flushes > 0) { + if (db_options_.max_background_flushes > 0) { // flush thread will take care of this return 0; } @@ -2643,7 +2650,7 @@ Status DBImpl::ProcessKeyValueCompaction( ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( cfd->user_comparator(), cfd->options()->merge_operator.get(), - options_.info_log.get(), cfd->options()->min_partial_merge_operands, + db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); auto compaction_filter = cfd->options()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; @@ -2810,7 +2817,7 @@ Status DBImpl::ProcessKeyValueCompaction( // optimization in BuildTable. 
int steps = 0; merge.MergeUntil(input, prev_snapshot, bottommost_level, - options_.statistics.get(), &steps); + db_options_.statistics.get(), &steps); // Skip the Merge ops combined_idx = combined_idx - 1 + steps; @@ -3037,7 +3044,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, cfd->GetName().c_str(), compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), compact->compaction->output_level(), compact->compaction->score(), - options_.max_background_compactions - bg_compaction_scheduled_); + db_options_.max_background_compactions - bg_compaction_scheduled_); char scratch[2345]; compact->compaction->Summary(scratch, sizeof(scratch)); LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", @@ -3113,7 +3120,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (!ParseInternalKey(key, &ikey)) { // log error - Log(options_.info_log, "[%s] Failed to parse key: %s", + Log(db_options_.info_log, "[%s] Failed to parse key: %s", cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { @@ -3254,7 +3261,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } input.reset(); - if (!options_.disableDataSync) { + if (!db_options_.disableDataSync) { db_directory_->Fsync(); } @@ -3286,7 +3293,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, RecordCompactionIOStats(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); mutex_.Lock(); cfd->internal_stats()->AddCompactionStats( compact->compaction->output_level(), stats); @@ -3598,12 +3605,12 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, assert(cfd != nullptr); delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(options_.info_log, "Created column family [%s] (ID %u)", + Log(db_options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); max_total_in_memory_state_ += cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; } else { - Log(options_.info_log, "Creating column family [%s] FAILED -- %s", + Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -3635,9 +3642,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { assert(cfd->IsDropped()); max_total_in_memory_state_ -= cfd->options()->write_buffer_size * cfd->options()->max_write_buffer_number; - Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); + Log(db_options_.info_log, "Dropped column family with id %u\n", + cfd->GetID()); } else { - Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", + Log(db_options_.info_log, + "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } @@ -3967,15 +3976,15 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); uint64_t flush_column_family_if_log_file = 0; - uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) + uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 
4 * max_total_in_memory_state_ - : options_.max_total_wal_size; + : db_options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(options_.info_log, + Log(db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); @@ -4061,7 +4070,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { - if (options_.use_fsync) { + if (db_options_.use_fsync) { StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { @@ -4105,7 +4114,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } } - if (options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } @@ -4270,7 +4279,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, // We have filled up the current memtable, but the previous // ones are still being flushed, so we wait. DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for memtable flush...\n", + Log(db_options_.info_log, "[%s] wait for memtable flush...\n", cfd->GetName().c_str()); if (schedule_background_work) { MaybeScheduleFlushOrCompaction(); @@ -4290,7 +4299,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, InternalStats::MEMTABLE_COMPACTION, stall); } else if (cfd->NeedWaitForNumLevel0Files()) { DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for fewer level0 files...\n", + Log(db_options_.info_log, "[%s] wait for fewer level0 files...\n", cfd->GetName().c_str()); uint64_t stall; { @@ -4374,9 +4383,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, { DelayLoggingAndReset(); if (creating_new_log) { - s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, - env_->OptimizeForLogWrite(env_options_)); + s = env_->NewWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), + &lfile, env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
@@ -4423,7 +4432,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(options_.info_log, + Log(db_options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( @@ -4528,7 +4537,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { return cfd->GetThreadLocalSuperVersion(&mutex_); } else { MutexLock l(&mutex_); @@ -4539,7 +4548,7 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { bool unref_sv = true; - if (LIKELY(options_.allow_thread_local)) { + if (LIKELY(db_options_.allow_thread_local)) { unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); } @@ -4586,7 +4595,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, inline void DBImpl::DelayLoggingAndReset() { if (delayed_writes_ > 0) { - Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); + Log(db_options_.info_log, "delayed %d write...\n", delayed_writes_); delayed_writes_ = 0; } } @@ -4613,7 +4622,7 @@ Status DBImpl::GetUpdatesSince( if (!s.ok()) { return s; } - iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, + iter->reset(new TransactionLogIteratorImpl(db_options_.wal_dir, &db_options_, read_options, env_options_, seq, std::move(wal_files), this)); return (*iter)->status(); @@ -4625,7 +4634,7 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -4633,13 +4642,13 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(db_options_.info_log, "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -4654,7 +4663,7 @@ Status DBImpl::DeleteFile(std::string name) { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed. File not found\n", + Log(db_options_.info_log, "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } @@ -4662,7 +4671,7 @@ Status DBImpl::DeleteFile(std::string name) { // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); return Status::OK(); } @@ -4672,7 +4681,7 @@ Status DBImpl::DeleteFile(std::string name) { // lost. Check that the level passed is the last level. for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (cfd->current()->NumLevelFiles(i) != 0) { - Log(options_.info_log, + Log(db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } @@ -4684,7 +4693,7 @@ Status DBImpl::DeleteFile(std::string name) { } FindObsoleteFiles(deletion_state, false); } // lock released here - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); // remove files outside the db-lock if (deletion_state.HaveSomethingToDelete()) { PurgeObsoleteFiles(deletion_state); @@ -4846,9 +4855,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname); - s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->options_.db_paths) { + for (auto db_path : impl->db_options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { break; @@ -4873,9 +4882,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); - s = impl->options_.env->NewWritableFile( - LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - impl->options_.env->OptimizeForLogWrite(soptions)); + s = impl->db_options_.env->NewWritableFile( + LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, + impl->db_options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; diff --git a/db/db_impl.h b/db/db_impl.h index 1ccaabb6c..e49b954cc 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -276,7 +276,7 @@ class DBImpl : public DB { // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than - // options_.delete_obsolete_files_period_micros microseconds ago, + // db_options_.delete_obsolete_files_period_micros microseconds ago, // it will not fill up the deletion_state void FindObsoleteFiles(DeletionState& deletion_state, bool force, @@ -294,7 +294,7 @@ class DBImpl : public DB { Env* const env_; const std::string dbname_; unique_ptr versions_; - const DBOptions options_; + const DBOptions db_options_; Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 6c864aefd..db0718bd1 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -44,7 +44,7 @@ namespace rocksdb { DBImplReadOnly::DBImplReadOnly(const DBOptions& options, const std::string& dbname) : DBImpl(options, dbname) { - Log(options_.info_log, "Opening the db in read only mode"); + Log(db_options_.info_log, "Opening the db in read only mode"); } DBImplReadOnly::~DBImplReadOnly() { From 8de151bb9965fde93482d144fef27f27cc6dd862 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 5 Sep 2014 14:20:18 -0700 Subject: [PATCH 030/829] Add db_bench with lots of column families to regression tests Summary: That way we can see when this graph goes up and be happy. Couple of changes: 1. title 2. 
fix db_bench to delete column families before deleting the DB. this was asserting when compiled in debug mode 3. don't sync manifest when disableDataSync. We discussed this offline. I can move it to separate diff if you'd like Test Plan: ran it Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22815 --- HISTORY.md | 2 +- build_tools/regression_build_test.sh | 34 ++++++++++++++++++++++++++++ db/db_bench.cc | 4 ++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 922d3e2c9..5b144ff3a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,6 @@ # Rocksdb Change Log -### Unreleased +## Unreleased ----- Past Releases ----- diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 5e335afde..ee2d334f0 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --threads=32 \ --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram +# measure fillseq with bunch of column families +./db_bench \ + --benchmarks=fillseq \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$NUM \ + --writes=$NUM \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq_lots_column_families + +# measure overwrite performance with bunch of column families +./db_bench \ + --benchmarks=overwrite \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite_lots_column_families # send data to ods function send_to_ods { @@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram +send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families +send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families diff --git a/db/db_bench.cc b/db/db_bench.cc index bd4389b49..ced93f227 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1110,6 +1110,8 @@ class Benchmark { } ~Benchmark() { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; delete prefix_extractor_; } @@ -1334,6 +1336,8 @@ class Benchmark { method = nullptr; } else { if (db_.db != nullptr) { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; db_.db = nullptr; db_.cfh.clear(); From 9f1c80b55668e3009a276803c66ac0fa0c5887e3 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 5 Sep 2014 15:20:05 -0700 Subject: [PATCH 031/829] Drop column family from write thread Summary: If we drop column family only from (single) write thread, we can be sure that nobody will drop the column family while we're writing (and our mutex is released). This greatly simplifies my patch that's getting rid of MakeRoomForWrite(). 
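Expressed as code, the pattern is roughly the sketch below; the Writer fields and the BeginWrite/EndWrite helpers are the ones used in the db/db_impl.cc hunk that follows, with error handling and surrounding details elided:

    // Sketch: serialize DropColumnFamily through the single write queue.
    Writer w(&mutex_);
    w.batch = nullptr;          // not a real write, just a slot in the queue
    w.sync = false;
    w.disableWAL = false;
    w.in_batch_group = false;
    w.done = false;
    w.timeout_hint_us = kNoTimeOut;

    {
      MutexLock l(&mutex_);
      Status s = BeginWrite(&w, 0);   // wait until we are the sole writer
      assert(s.ok() && !w.done);      // no timeout and nobody does our job
      // With the write queue owned, no write can race with the drop.
      s = versions_->LogAndApply(cfd, &edit, &mutex_);
      EndWrite(&w, &w, s);            // hand the queue over to the next writer
    }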
Test Plan: make check, but also running stress test Reviewers: ljin, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22965 --- db/db_impl.cc | 44 +++++++++++++++++++++++--------------------- db/db_impl.h | 27 ++++++++++++++++++++++++++- db/db_impl_debug.cc | 27 +++++++++++++++++++++++++++ db/db_test.cc | 21 +++++++++++++++++++++ 4 files changed, 97 insertions(+), 22 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 1769471cf..b83d60f5e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -77,20 +77,6 @@ const std::string kDefaultColumnFamilyName("default"); void DumpLeveldbBuildVersion(Logger * log); -// Information kept for every waiting writer -struct DBImpl::Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) : cv(mu) { } -}; - struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; @@ -3627,6 +3613,14 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); + Writer w(&mutex_); + w.batch = nullptr; + w.sync = false; + w.disableWAL = false; + w.in_batch_group = false; + w.done = false; + w.timeout_hint_us = kNoTimeOut; + Status s; { MutexLock l(&mutex_); @@ -3634,7 +3628,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { s = Status::InvalidArgument("Column family already dropped!\n"); } if (s.ok()) { + // we drop column family from a single write thread + s = BeginWrite(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job s = versions_->LogAndApply(cfd, &edit, &mutex_); + EndWrite(&w, &w, s); } } @@ -4173,15 +4171,19 @@ void DBImpl::BuildBatchGroup(Writer** last_writer, break; } - if (w->batch != nullptr) { - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; - } + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. They want to be alone + break; + } - write_batch_group->push_back(w->batch); + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; } + + write_batch_group->push_back(w->batch); w->in_batch_group = true; *last_writer = w; } diff --git a/db/db_impl.h b/db/db_impl.h index e49b954cc..69fe2eaac 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -203,6 +203,17 @@ class DBImpl : public DB { SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); #endif // NDEBUG // Structure to store information for candidate files to delete. 
@@ -309,7 +320,7 @@ class DBImpl : public DB { #endif friend struct SuperVersion; struct CompactionState; - struct Writer; + struct WriteContext; Status NewDB(); @@ -349,6 +360,20 @@ class DBImpl : public DB { uint64_t SlowdownAmount(int n, double bottom, double top); + // Information kept for every waiting writer + struct Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) : cv(mu) {} + }; + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) // thread should grab the mutex_ and be the first on writers queue. // BeginWrite is used for it. diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 77d4e0551..5f7a4818d 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -130,5 +130,32 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence) { return ReadFirstLine(fname, sequence); } + +void DBImpl::TEST_LockMutex() { + mutex_.Lock(); +} + +void DBImpl::TEST_UnlockMutex() { + mutex_.Unlock(); +} + +void* DBImpl::TEST_BeginWrite() { + auto w = new Writer(&mutex_); + w->batch = nullptr; + w->sync = false; + w->disableWAL = false; + w->in_batch_group = false; + w->done = false; + w->timeout_hint_us = kNoTimeOut; + Status s = BeginWrite(w, 0); + assert(s.ok() && !w->done); // No timeout and nobody should do our job + return reinterpret_cast(w); +} + +void DBImpl::TEST_EndWrite(void* w) { + auto writer = reinterpret_cast(w); + EndWrite(writer, writer, Status::OK()); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/db_test.cc b/db/db_test.cc index 570af31a5..5b913f43c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -7894,6 +7895,26 @@ TEST(DBTest, DBIteratorBoundTest) { } } +TEST(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); + + for (auto& t : threads) { + t.join(); + } +} + + + } // namespace rocksdb int main(int argc, char** argv) { From 40ddc3d6c43efeba0d909245862d2c98f65b6fe0 Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Fri, 5 Sep 2014 15:55:43 -0700 Subject: [PATCH 032/829] add cache bench Summary: 1. A benchmark for cache Test Plan: ./cache_bench Reviewers: yhchiang, dhruba, sdong, igor, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22809 --- Makefile | 5 +- util/cache_bench.cc | 257 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 util/cache_bench.cc diff --git a/Makefile b/Makefile index c75274cd0..b9c00158a 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. 
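util/cache_bench.cc (added later in this patch) drives one shared LRU cache from FLAGS_threads threads; each operation draws prob_op uniformly from [0, 100) and maps it onto the insert/lookup/erase percentages given by the flags. A standalone sketch of that mapping as the flag descriptions define it (the enum and function here are illustrative, not the benchmark's exact code):

    // Illustrative: map a uniform draw in [0, 100) onto the operation mix.
    enum class CacheOp { kInsert, kLookup, kErase };

    CacheOp PickOp(int32_t prob_op) {
      if (prob_op < FLAGS_insert_percent) {
        return CacheOp::kInsert;
      } else if (prob_op < FLAGS_insert_percent + FLAGS_lookup_percent) {
        return CacheOp::kLookup;
      }
      return CacheOp::kErase;  // the remaining FLAGS_erase_percent of draws
    }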
@@ -264,6 +264,9 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/util/cache_bench.cc b/util/cache_bench.cc new file mode 100644 index 000000000..ccaf5ce5b --- /dev/null +++ b/util/cache_bench.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/random.h" + +using GFLAGS::ParseCommandLineFlags; + +static const uint32_t KB = 1024; + +DEFINE_int32(threads, 10, "Number of concurrent threads to run."); +DEFINE_int64(cache_size, 2 * KB * KB * KB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_int32(num_shard_bits, 4, "shard_bits."); + +DEFINE_int64(max_key, 1 * KB* KB, "Max number of key to place in cache"); +DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); + +DEFINE_int32(insert_percent, 40, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_int32(lookup_percent, 50, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_int32(erase_percent, 10, + "Ratio of erase to total workload (expressed as a percentage)"); + +namespace rocksdb { + +class CacheBench; +namespace { +void deleter(const Slice& key, void* value) { + delete reinterpret_cast(value); +} + +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_threads_(FLAGS_threads), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) { + } + + ~SharedState() {} + + port::Mutex* GetMutex() { + return &mu_; + } + + port::CondVar* GetCondVar() { + return &cv_; + } + + CacheBench* GetCacheBench() const { + return cache_bench_; + } + + void IncInitialized() { + num_initialized_++; + } + + void IncDone() { + num_done_++; + } + + bool AllInitialized() const { + return num_initialized_ >= num_threads_; + } + + bool AllDone() const { + return num_done_ >= num_threads_; + } + + void SetStart() { + start_ = true; + } + + bool Started() const { + return start_; + } + + private: + port::Mutex mu_; + port::CondVar cv_; + + const uint64_t num_threads_; + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. 
+struct ThreadState { + uint32_t tid; + Random rnd; + SharedState* shared; + + ThreadState(uint32_t index, SharedState *shared) + : tid(index), + rnd(1000 + index), + shared(shared) {} +}; +} // namespace + +class CacheBench { + public: + CacheBench() : + cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)), + num_threads_(FLAGS_threads) {} + + ~CacheBench() {} + + bool Run() { + rocksdb::Env* env = rocksdb::Env::Default(); + + PrintEnv(); + SharedState shared(this); + std::vector threads(num_threads_); + for (uint32_t i = 0; i < num_threads_; i++) { + threads[i] = new ThreadState(i, &shared); + env->StartThread(ThreadBody, threads[i]); + } + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + uint64_t start_time = env->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + + // Record end time + uint64_t end_time = env->NowMicros(); + fprintf(stdout, "Complete in %" PRIu64 "ms\n", end_time - start_time); + } + return true; + } + + private: + std::shared_ptr cache_; + uint32_t num_threads_; + + static void ThreadBody(void* v) { + ThreadState* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; + // Cast uint64* to be char*, data would be copied to cache + Slice key(reinterpret_cast(&rand_key), 8); + int32_t prob_op = thread->rnd.Uniform(100); + if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { + // do insert + auto handle = cache_->Insert(key, new char[10], 1, &deleter); + cache_->Release(handle); + } else if (prob_op -= FLAGS_insert_percent && + prob_op < FLAGS_lookup_percent) { + // do lookup + auto handle = cache_->Lookup(key); + if (handle) { + cache_->Release(handle); + } + } else if (prob_op -= FLAGS_lookup_percent && + prob_op < FLAGS_erase_percent) { + // do erase + cache_->Erase(key); + } + } + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %d\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); + printf("Num shard bits : %d\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", FLAGS_max_key); + printf("Insert percentage : %d%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %d%%\n", FLAGS_erase_percent); + printf("----------------------------\n"); + } +}; +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + rocksdb::CacheBench bench; + if (bench.Run()) { + return 0; + } else { + return 1; + } +} + +#endif // GFLAGS From 2e97c38980a445bd824f5dd67b1850966405539c Mon Sep 17 00:00:00 2001 
From: Nik Bougalis Date: Thu, 4 Sep 2014 23:11:28 -0700 Subject: [PATCH 033/829] Avoid off-by-one error when using readlink --- port/stack_trace.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 76866e63c..296b1f620 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -33,7 +33,7 @@ const char* GetExecutableName() { char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); - auto read = readlink(link, name, sizeof(name)); + auto read = readlink(link, name, sizeof(name) - 1); if (-1 == read) { return nullptr; } else { From d40c1f742ff9cbbb7c6dc1a582c2937b142862d2 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Fri, 5 Sep 2014 14:14:30 -0700 Subject: [PATCH 034/829] Add missing break statement --- utilities/spatialdb/spatial_db.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 21a111d3e..f0aed8faa 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -221,6 +221,7 @@ std::string FeatureSet::DebugString() const { switch (iter.second.type()) { case Variant::kNull: out.append("null"); + break; case Variant::kBool: if (iter.second.get_bool()) { out.append("true"); From bfee319fb08ab057ec07b250336afa703b63bc7f Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Fri, 5 Sep 2014 16:07:14 -0700 Subject: [PATCH 035/829] sizeof(int*) where sizeof(int) was intended --- util/hash_cuckoo_rep.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index a9a79a274..2ee05faac 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -70,7 +70,7 @@ class HashCuckooRep : public MemTableRep { } cuckoo_path_ = reinterpret_cast( - arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1))); + arena_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); is_nearly_full_ = false; } From d1cfb71ec7f91ef317b56a2423305b90003bcb66 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Fri, 5 Sep 2014 20:47:10 -0700 Subject: [PATCH 036/829] Remove unused member(s) --- include/rocksdb/cache.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 65d44b6cb..a8a6f9b73 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -127,9 +127,6 @@ class Cache { void LRU_Append(Handle* e); void Unref(Handle* e); - struct Rep; - Rep* rep_; - // No copying allowed Cache(const Cache&); void operator=(const Cache&); From 9f8aa0939529f1c6f7ba752968e638429074425c Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Fri, 5 Sep 2014 20:47:57 -0700 Subject: [PATCH 037/829] Don't leak data returned by opendir --- util/ldb_cmd.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index aef84fa35..53e15e0ba 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -541,6 +541,7 @@ void ManifestDumpCommand::DoCommand() { } else { exec_state_ = LDBCommandExecuteResult::FAILED( "Multiple MANIFEST files found; use --path to select one"); + closedir(d); return; } } From a5d2863074d3529e1254cdcfeecd4bf1808de6b0 Mon Sep 17 00:00:00 2001 From: wankai Date: Sat, 6 Sep 2014 23:21:26 +0800 Subject: [PATCH 038/829] typo improvement --- table/plain_table_key_coding.cc | 10 +++++----- table/plain_table_key_coding.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index eedf58aea..dbe53c0c6 100644 --- 
a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -97,9 +97,9 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, Slice prefix = prefix_extractor_->Transform(Slice(key.data(), user_key_size)); - if (key_count_for_prefix == 0 || prefix != pre_prefix_.GetKey() || - key_count_for_prefix % index_sparseness_ == 0) { - key_count_for_prefix = 1; + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetKey() || + key_count_for_prefix_ % index_sparseness_ == 0) { + key_count_for_prefix_ = 1; pre_prefix_.SetKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -108,8 +108,8 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, } *offset += size_bytes_pos; } else { - key_count_for_prefix++; - if (key_count_for_prefix == 2) { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { // For second key within a prefix, need to encode prefix length size_bytes_pos += EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index ba66c2645..9047087ae 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -26,7 +26,7 @@ class PlainTableKeyEncoder { fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), - key_count_for_prefix(0) {} + key_count_for_prefix_(0) {} // key: the key to write out, in the format of internal key. // file: the output file to write out // offset: offset in the file. Needs to be updated after appending bytes @@ -45,7 +45,7 @@ class PlainTableKeyEncoder { uint32_t fixed_user_key_len_; const SliceTransform* prefix_extractor_; const size_t index_sparseness_; - size_t key_count_for_prefix; + size_t key_count_for_prefix_; IterKey pre_prefix_; }; From 823773837ba275d23af107a3e1499915e625e823 Mon Sep 17 00:00:00 2001 From: wankai Date: Mon, 8 Sep 2014 11:10:17 +0800 Subject: [PATCH 039/829] replace hard-coded number with named variable --- table/plain_table_key_coding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index dbe53c0c6..c553752e1 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -30,7 +30,7 @@ const unsigned char kSizeInlineLimit = 0x3F; size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { out_buffer[0] = type << 6; - if (key_size < 0x3F) { + if (key_size < static_cast(kSizeInlineLimit)) { // size inlined out_buffer[0] |= static_cast(key_size); return 1; From 88a2f44f99192a156ad70724339ab7e962b1851f Mon Sep 17 00:00:00 2001 From: wankai Date: Mon, 8 Sep 2014 16:34:04 +0800 Subject: [PATCH 040/829] fix comments --- table/plain_table_factory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 54c628c15..e79475221 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -127,7 +127,7 @@ class TableBuilder; class PlainTableFactory : public TableFactory { public: ~PlainTableFactory() {} - // user_key_size is the length of the user key. If it is set to be + // user_key_len is the length of the user key. If it is set to be // kPlainTableVariableLength, then it means variable length. Otherwise, all // the keys need to have the fix length of this value. 
bloom_bits_per_key is // number of bits used for bloom filer per key. hash_table_ratio is From 9360cc690e735fbd8a13ebbc448b4da8463fbe46 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 08:01:25 -0700 Subject: [PATCH 041/829] Fix valgrind issue --- db/db_impl_debug.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 5f7a4818d..3446571eb 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -155,6 +155,7 @@ void* DBImpl::TEST_BeginWrite() { void DBImpl::TEST_EndWrite(void* w) { auto writer = reinterpret_cast(w); EndWrite(writer, writer, Status::OK()); + delete writer; } } // namespace rocksdb From 0af157f9bfef20b01e32f18db4a9ac03df0ba0d0 Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Mon, 8 Sep 2014 10:37:05 -0700 Subject: [PATCH 042/829] Implement full filter for block based table. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: 1. Make filter_block.h a base class. Derive block_based_filter_block and full_filter_block. The previous one is the traditional filter block. The full_filter_block is newly added. It would generate a filter block that contain all the keys in SST file. 2. When querying a key, table would first check if full_filter is available. If not, it would go to the exact data block and check using block_based filter. 3. User could choose to use full_filter or tradional(block_based_filter). They would be stored in SST file with different meta index name. "filter.filter_policy" or "full_filter.filter_policy". Then, Table reader is able to know the fllter block type. 4. Some optimizations have been done for full_filter_block, thus it requires a different interface compared to the original one in filter_policy.h. 5. Actual implementation of filter bits coding/decoding is placed in util/bloom_impl.cc Benchmark: base commit 1d23b5c470844c1208301311f0889eca750431c0 Command: db_bench --db=/dev/shm/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --write_buffer_size=134217728 --max_write_buffer_number=2 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --verify_checksum=false --max_background_compactions=4 --use_plain_table=0 --memtablerep=prefix_hash --open_files=-1 --mmap_read=1 --mmap_write=0 --bloom_bits=10 --bloom_locality=1 --memtable_bloom_bits=500000 --compression_type=lz4 --num=393216000 --use_hash_search=1 --block_size=1024 --block_restart_interval=16 --use_existing_db=1 --threads=1 --benchmarks=readrandom —disable_auto_compactions=1 Read QPS increase for about 30% from 2230002 to 2991411. 
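As a concrete illustration of the new plug-in points, a toy full-filter policy built on the FilterBitsBuilder/FilterBitsReader interfaces (added in the include/rocksdb/filter_policy.h hunk further down) could store one 32-bit hash per key, much like the TestHashFilter used by the new unit tests. The sketch below assumes Finish hands ownership of the filter bytes back through a std::unique_ptr<const char[]>, as the ownership comment on that interface describes; the class names and hash seed are illustrative only:

    #include <cstdint>
    #include <memory>
    #include <vector>

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/slice.h"
    #include "util/coding.h"
    #include "util/hash.h"

    namespace rocksdb {

    // Toy builder: record one 32-bit hash per added key as the "filter".
    class ToyBitsBuilder : public FilterBitsBuilder {
     public:
      void AddKey(const Slice& key) override {
        hashes_.push_back(Hash(key.data(), key.size(), 1));
      }
      Slice Finish(std::unique_ptr<const char[]>* buf) override {
        size_t size = hashes_.size() * sizeof(uint32_t);
        char* mem = new char[size];
        for (size_t i = 0; i < hashes_.size(); i++) {
          EncodeFixed32(mem + i * 4, hashes_[i]);
        }
        buf->reset(mem);        // buf owns the bytes; the Slice borrows them
        return Slice(mem, size);
      }

     private:
      std::vector<uint32_t> hashes_;
    };

    // Toy reader: a key may match if its hash appears in the stored array.
    class ToyBitsReader : public FilterBitsReader {
     public:
      explicit ToyBitsReader(const Slice& contents) : contents_(contents) {}
      bool MayMatch(const Slice& entry) override {
        uint32_t h = Hash(entry.data(), entry.size(), 1);
        for (size_t i = 0; i + 4 <= contents_.size(); i += 4) {
          if (h == DecodeFixed32(contents_.data() + i)) {
            return true;
          }
        }
        return false;
      }

     private:
      Slice contents_;
    };

    }  // namespace rocksdb

A FilterPolicy would hand these out from GetFilterBitsBuilder() and GetFilterBitsReader(); returning nullptr from either keeps the table on the block-based filter path, exactly as the Set 1 / Set 2 note in filter_policy.h explains.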
Test Plan: make all check valgrind db_test db_stress --use_block_based_filter = 0 ./auto_sanity_test.sh Reviewers: igor, yhchiang, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D20979 --- Makefile | 10 +- db/c.cc | 2 +- db/db_bench.cc | 12 +- db/db_test.cc | 224 +++++++++---- include/rocksdb/filter_policy.h | 64 +++- include/rocksdb/statistics.h | 2 +- ...r_block.cc => block_based_filter_block.cc} | 124 +++---- table/block_based_filter_block.h | 102 ++++++ table/block_based_filter_block_test.cc | 242 ++++++++++++++ table/block_based_table_builder.cc | 39 ++- table/block_based_table_reader.cc | 274 +++++++++------- table/block_based_table_reader.h | 7 +- table/filter_block.h | 82 ++--- table/filter_block_test.cc | 139 -------- table/full_filter_block.cc | 99 ++++++ table/full_filter_block.h | 107 ++++++ table/full_filter_block_test.cc | 181 +++++++++++ table/plain_table_builder.cc | 1 - table/plain_table_reader.cc | 1 - tools/db_sanity_test.cc | 20 +- tools/db_stress.cc | 9 +- util/bloom.cc | 305 ++++++++++++++++-- util/bloom_test.cc | 147 ++++++++- 23 files changed, 1709 insertions(+), 484 deletions(-) rename table/{filter_block.cc => block_based_filter_block.cc} (60%) create mode 100644 table/block_based_filter_block.h create mode 100644 table/block_based_filter_block_test.cc delete mode 100644 table/filter_block_test.cc create mode 100644 table/full_filter_block.cc create mode 100644 table/full_filter_block.h create mode 100644 table/full_filter_block_test.cc diff --git a/Makefile b/Makefile index b9c00158a..c05d82af7 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,8 @@ TESTS = \ blob_store_test \ filelock_test \ filename_test \ - filter_block_test \ + block_based_filter_block_test \ + full_filter_block_test \ histogram_test \ log_test \ manual_compaction_test \ @@ -393,8 +394,11 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/c.cc b/db/c.cc index 9ea549646..d9dee46fb 100644 --- a/db/c.cc +++ b/db/c.cc @@ -118,7 +118,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { const Slice& existing_value, std::string* new_value, bool* value_changed) const { - char* c_new_value = NULL; + char* c_new_value = nullptr; size_t new_value_length = 0; unsigned char c_value_changed = 0; unsigned char result = (*filter_)( diff --git a/db/db_bench.cc b/db/db_bench.cc index ced93f227..eada95b6b 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -39,8 +39,8 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include 
"rocksdb/slice.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/statistics.h" #include "rocksdb/perf_context.h" #include "port/port.h" #include "port/stack_trace.h" @@ -553,7 +553,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); DEFINE_bool(use_hash_search, false, "if use kHashSearch " "instead of kBinarySearch. " "This is valid if only we use BlockTable"); - +DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter " + "instead of kFullFilter for filter block. " + "This is valid if only we use BlockTable"); DEFINE_string(merge_operator, "", "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" @@ -1076,9 +1078,9 @@ class Benchmark { (FLAGS_cache_numshardbits >= 1 ? NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), - filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 ? + NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter) + : nullptr), prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), num_(FLAGS_num), value_size_(FLAGS_value_size), diff --git a/db/db_test.cc b/db/db_test.cc index 5b913f43c..b30bfd70d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -324,21 +324,22 @@ class DBTest { kHashCuckoo = 7, kMergePut = 8, kFilter = 9, - kUncompressed = 10, - kNumLevel_3 = 11, - kDBLogDir = 12, - kWalDir = 13, - kManifestFileSize = 14, - kCompactOnFlush = 15, - kPerfOptions = 16, - kDeletesFilterFirst = 17, - kHashSkipList = 18, - kUniversalCompaction = 19, - kCompressedBlockCache = 20, - kInfiniteMaxOpenFiles = 21, - kxxHashChecksum = 22, - kFIFOCompaction = 23, - kEnd = 24 + kFullFilter = 10, + kUncompressed = 11, + kNumLevel_3 = 12, + kDBLogDir = 13, + kWalDir = 14, + kManifestFileSize = 15, + kCompactOnFlush = 16, + kPerfOptions = 17, + kDeletesFilterFirst = 18, + kHashSkipList = 19, + kUniversalCompaction = 20, + kCompressedBlockCache = 21, + kInfiniteMaxOpenFiles = 22, + kxxHashChecksum = 23, + kFIFOCompaction = 24, + kEnd = 25 }; int option_config_; @@ -448,6 +449,30 @@ class DBTest { } } + // Switch between different filter policy + // Jump from kDefault to kFilter to kFullFilter + bool ChangeFilterOptions(Options* prev_options = nullptr) { + if (option_config_ == kDefault) { + option_config_ = kFilter; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else if (option_config_ == kFilter) { + option_config_ = kFullFilter; + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(prev_options); + TryReopen(); + return true; + } else { + return false; + } + } + // Return the current option configuration. 
Options CurrentOptions( const anon::OptionsOverride& options_override = anon::OptionsOverride()) { @@ -486,7 +511,10 @@ class DBTest { options.merge_operator = MergeOperators::CreatePutOperator(); break; case kFilter: - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case kFullFilter: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); break; case kUncompressed: options.compression = kNoCompression; @@ -5744,6 +5772,92 @@ TEST(DBTest, BloomFilter) { } while (ChangeCompactOptions()); } +TEST(DBTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + } +} + +TEST(DBTest, BloomFilterCompatibility) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with block based filter + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +TEST(DBTest, BloomFilterReverseCompatibility) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with full filter + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with block_based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, &options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); @@ -7194,47 +7308,49 @@ void PrefixScanInit(DBTest *dbtest) { } // 
namespace TEST(DBTest, PrefixScan) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - // db configs - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - table_options.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - // 11 RAND I/Os - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; + // 11 RAND I/Os + DestroyAndReopen(&options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! iter->key().starts_with(prefix)) { + break; + } + count++; } - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while } TEST(DBTest, TailingIteratorSingle) { diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index fa44db45f..90aefb388 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,11 +21,52 @@ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #include +#include namespace rocksdb { class Slice; +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add Key to filter, you could use any way to store the key. + // Such as: storing hashes or original keys + // Keys are in sorted order and duplicated keys are possible. 
+ virtual void AddKey(const Slice& key) = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr* buf) = 0; +}; + +// A class that checks if a key can be in filter +// It should be initialized by Slice generated by BitsBuilder +class FilterBitsReader { + public: + virtual ~FilterBitsReader() {} + + // Check if the entry match the bits in filter + virtual bool MayMatch(const Slice& entry) = 0; +}; + +// We add a new format of filter block called full filter block +// This new interface gives you more space of customization +// +// For the full filter block, you can plug in your version by implement +// the FilterBitsBuilder and FilterBitsReader +// +// There are two sets of interface in FilterPolicy +// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter +// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for +// full filter. +// Set 1 MUST be implemented correctly, Set 2 is optional +// RocksDB would first try using functions in Set 2. if they return nullptr, +// it would use Set 1 instead. +// You can choose filter type in NewBloomFilterPolicy class FilterPolicy { public: virtual ~FilterPolicy(); @@ -51,11 +92,28 @@ class FilterPolicy { // This method may return true or false if the key was not on the // list, but it should aim to return false with a high probability. virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; + + // Get the FilterBitsBuilder, which is ONLY used for full filter block + // It contains interface to take individual key, then generate filter + virtual FilterBitsBuilder* GetFilterBitsBuilder() const { + return nullptr; + } + + // Get the FilterBitsReader, which is ONLY used for full filter block + // It contains interface to tell if key can be in filter + // The input slice should NOT be deleted by FilterPolicy + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { + return nullptr; + } }; // Return a new filter policy that uses a bloom filter with approximately -// the specified number of bits per key. A good value for bits_per_key +// the specified number of bits per key. +// +// bits_per_key: bits per key in bloom filter. A good value for bits_per_key // is 10, which yields a filter with ~ 1% false positive rate. +// use_block_based_builder: use block based filter rather than full fiter. +// If you want to builder full filter, it needs to be set to false. // // Callers must delete the result after any database that is using the // result has been closed. @@ -67,8 +125,8 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. -extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); - +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder = true); } #endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 6785833b4..a7f2c1408 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -115,7 +115,7 @@ enum Tickers : uint32_t { // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. 
WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction COMPACT_WRITE_BYTES, // Bytes written during compaction diff --git a/table/filter_block.cc b/table/block_based_filter_block.cc similarity index 60% rename from table/filter_block.cc rename to table/block_based_filter_block.cc index 30284017b..c2c34c628 100644 --- a/table/filter_block.cc +++ b/table/block_based_filter_block.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/filter_block.h" +#include "table/block_based_filter_block.h" #include "db/dbformat.h" #include "rocksdb/filter_policy.h" @@ -15,21 +15,39 @@ namespace rocksdb { +namespace { +bool SamePrefix(const SliceTransform* prefix_extractor, + const Slice& key1, const Slice& key2) { + if (!prefix_extractor->InDomain(key1) && + !prefix_extractor->InDomain(key2)) { + return true; + } else if (!prefix_extractor->InDomain(key1) || + !prefix_extractor->InDomain(key2)) { + return false; + } else { + return (prefix_extractor->Transform(key1) == + prefix_extractor->Transform(key2)); + } +} +} // namespace + + // See doc/table_format.txt for an explanation of the filter block format. // Generate new filter every 2KB of data static const size_t kFilterBaseLg = 11; static const size_t kFilterBase = 1 << kFilterBaseLg; -FilterBlockBuilder::FilterBlockBuilder(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Comparator* internal_comparator) +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering), - comparator_(internal_comparator) {} + whole_key_filtering_(table_opt.whole_key_filtering) { + assert(policy_); +} -void FilterBlockBuilder::StartBlock(uint64_t block_offset) { +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); assert(filter_index >= filter_offsets_.size()); while (filter_index > filter_offsets_.size()) { @@ -37,53 +55,45 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -bool FilterBlockBuilder::SamePrefix(const Slice &key1, - const Slice &key2) const { - if (!prefix_extractor_->InDomain(key1) && - !prefix_extractor_->InDomain(key2)) { - return true; - } else if (!prefix_extractor_->InDomain(key1) || - !prefix_extractor_->InDomain(key2)) { - return false; - } else { - return (prefix_extractor_->Transform(key1) == - prefix_extractor_->Transform(key2)); +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + added_to_start_ = 0; + if (whole_key_filtering_) { + AddKey(key); + added_to_start_ = 1; + } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); } } -void FilterBlockBuilder::AddKey(const Slice& key) { +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); +} + +// Add prefix to filter if needed +inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { // get slice for most recently added entry Slice prev; - size_t added_to_start = 0; - - // add key to filter if needed - if (whole_key_filtering_) { - start_.push_back(entries_.size()); - ++added_to_start; - 
entries_.append(key.data(), key.size()); - } - - if (start_.size() > added_to_start) { - size_t prev_start = start_[start_.size() - 1 - added_to_start]; + if (start_.size() > added_to_start_) { + size_t prev_start = start_[start_.size() - 1 - added_to_start_]; const char* base = entries_.data() + prev_start; size_t length = entries_.size() - prev_start; prev = Slice(base, length); } - // add prefix to filter if needed - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key or prefix, and we use - // prefix(last entry) to get the prefix of the last key. - if (prev.size() == 0 || !SamePrefix(key, prev)) { - Slice prefix = prefix_extractor_->Transform(key); - start_.push_back(entries_.size()); - entries_.append(prefix.data(), prefix.size()); - } + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. + if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + start_.push_back(entries_.size()); + entries_.append(prefix.data(), prefix.size()); } } -Slice FilterBlockBuilder::Finish() { +Slice BlockBasedFilterBlockBuilder::Finish() { if (!start_.empty()) { GenerateFilter(); } @@ -99,7 +109,7 @@ Slice FilterBlockBuilder::Finish() { return Slice(result_); } -void FilterBlockBuilder::GenerateFilter() { +void BlockBasedFilterBlockBuilder::GenerateFilter() { const size_t num_entries = start_.size(); if (num_entries == 0) { // Fast path if there are no keys for this filter @@ -112,7 +122,7 @@ void FilterBlockBuilder::GenerateFilter() { tmp_entries_.resize(num_entries); for (size_t i = 0; i < num_entries; i++) { const char* base = entries_.data() + start_[i]; - size_t length = start_[i+1] - start_[i]; + size_t length = start_[i + 1] - start_[i]; tmp_entries_[i] = Slice(base, length); } @@ -125,7 +135,7 @@ void FilterBlockBuilder::GenerateFilter() { start_.clear(); } -FilterBlockReader::FilterBlockReader( +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Slice& contents, bool delete_contents_after_use) @@ -136,9 +146,10 @@ FilterBlockReader::FilterBlockReader( offset_(nullptr), num_(0), base_lg_(0) { + assert(policy_); size_t n = contents.size(); if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents[n-1]; + base_lg_ = contents[n - 1]; uint32_t last_word = DecodeFixed32(contents.data() + n - 5); if (last_word > n - 5) return; data_ = contents.data(); @@ -149,27 +160,30 @@ FilterBlockReader::FilterBlockReader( } } -bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, - const Slice& key) { +bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; } - return MayMatch(block_offset, key); + return MayMatch(key, block_offset); } -bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, - const Slice& prefix) { +bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset != kNotValid); if (!prefix_extractor_) { return true; } - return MayMatch(block_offset, prefix); + return MayMatch(prefix, block_offset); } -bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { 
+bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, + uint64_t block_offset) { uint64_t index = block_offset >> base_lg_; if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index*4); - uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { Slice filter = Slice(data_ + start, limit - start); return policy_->KeyMayMatch(entry, filter); @@ -181,7 +195,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { return true; // Errors are treated as potential matches } -size_t FilterBlockReader::ApproximateMemoryUsage() const { +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { return num_ * 4 + 5 + (offset_ - data_); } } diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h new file mode 100644 index 000000000..9bbc93531 --- /dev/null +++ b/table/block_based_filter_block.h @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. 
+ const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + uint32_t added_to_start_; // To indicate if key is added + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument + std::vector filter_offsets_; + + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); + void operator=(const BlockBasedFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. + BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + bool delete_contents_after_use = false); + virtual bool IsBlockBased() override { return true; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + std::unique_ptr filter_data; + + bool MayMatch(const Slice& entry, uint64_t block_offset); + + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); + void operator=(const BlockBasedFilterBlockReader&); +}; +} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc new file mode 100644 index 000000000..4fd8c1cf5 --- /dev/null +++ b/table/block_based_filter_block_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
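// ---------------------------------------------------------------------------
// [Editor's illustrative sketch -- not part of the patch.] The block-based
// builder emits one filter for every 2^base_lg bytes of data-block offsets:
// StartBlock(offset) keeps generating filters (empty ones included) until the
// filter covering `offset` is current. The MultiChunk tests in this file rely
// on that: offsets around 4100 fall into a range that received no keys, so its
// filter matches nothing. kSketchFilterBaseLg = 11 (2KB granularity, the
// classic LevelDB default) is an assumption of this sketch, as is the trivial
// stand-in for the filter policy.
#include <cstdint>
#include <string>
#include <vector>

namespace filter_block_sketch {

const uint32_t kSketchFilterBaseLg = 11;

struct FilterEmitter {
  std::vector<std::string> filters;  // one entry per 2^kSketchFilterBaseLg range
  std::vector<std::string> pending;  // keys added since the last emission

  void StartBlock(uint64_t block_offset) {
    const uint64_t target = block_offset >> kSketchFilterBaseLg;
    while (filters.size() < target) {
      EmitFilter();  // may emit an empty filter for a key-less range
    }
  }

  void Add(const std::string& key) { pending.push_back(key); }

  void EmitFilter() {
    std::string filter;
    for (const std::string& k : pending) {
      filter.append(k);       // stand-in for policy->CreateFilter()
      filter.push_back('\0');
    }
    filters.push_back(filter);
    pending.clear();
  }
};

}  // namespace filter_block_sketch
// ---------------------------------------------------------------------------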
+ +#include "table/block_based_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + BlockBasedTableOptions table_options_; + + FilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + Slice block = builder.Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); +} + +TEST(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + Slice block = builder.Finish(); + BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader.KeyMayMatch("box", 100)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("other", 100)); +} + +TEST(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + Slice block = builder.Finish(); + BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 9000)); + 
ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + BlockBasedFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); + } + + ~BlockBasedFilterBlockTest() {} +}; + +TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + Slice block = builder->Finish(); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, block); + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + Slice block = builder->Finish(); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, block); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader->KeyMayMatch("box", 100)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("other", 100)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + Slice block = builder->Finish(); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, block); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 
fde363760..7fb662d88 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -31,6 +31,8 @@ #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/table_builder.h" @@ -274,6 +276,21 @@ IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, return nullptr; } +// Create a index builder based on its type. +FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt, + const BlockBasedTableOptions& table_opt) { + if (table_opt.filter_policy == nullptr) return nullptr; + + FilterBitsBuilder* filter_bits_builder = + table_opt.filter_policy->GetFilterBitsBuilder(); + if (filter_bits_builder == nullptr) { + return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt); + } else { + return new FullFilterBlockBuilder(opt.prefix_extractor, table_opt, + filter_bits_builder); + } +} + bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); @@ -365,7 +382,6 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector std::string val; PutFixed32(&val, static_cast(index_type_)); properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); - return Status::OK(); } @@ -428,11 +444,7 @@ struct BlockBasedTableBuilder::Rep { table_options.index_type, &internal_comparator, &this->internal_prefix_transform)), compression_type(compression_type), - compression_opts(compression_opts), - filter_block(table_options.filter_policy == nullptr ? - nullptr : - new FilterBlockBuilder(ioptions.prefix_extractor, - table_options, &internal_comparator)), + filter_block(CreateFilterBlockBuilder(ioptions, table_options)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)) { @@ -497,7 +509,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { } if (r->filter_block != nullptr) { - r->filter_block->AddKey(ExtractUserKey(key)); + r->filter_block->Add(ExtractUserKey(key)); } r->last_key.assign(key.data(), key.size()); @@ -661,10 +673,7 @@ Status BlockBasedTableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, - metaindex_block_handle, - index_block_handle; - + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; // Write filter block if (ok() && r->filter_block != nullptr) { auto filter_contents = r->filter_block->Finish(); @@ -703,7 +712,12 @@ Status BlockBasedTableBuilder::Finish() { if (r->filter_block != nullptr) { // Add mapping from ".Name" to location // of filter data. 
- std::string key = BlockBasedTable::kFilterBlockPrefix; + std::string key; + if (r->filter_block->IsBlockBased()) { + key = BlockBasedTable::kFilterBlockPrefix; + } else { + key = BlockBasedTable::kFullFilterBlockPrefix; + } key.append(r->table_options.filter_policy->Name()); meta_index_builder.Add(key, filter_block_handle); } @@ -807,5 +821,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const { } const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; } // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index cf915e105..b38f88588 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -26,6 +26,8 @@ #include "table/block.h" #include "table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "table/block_hash_index.h" #include "table/block_prefix_index.h" #include "table/format.h" @@ -46,7 +48,6 @@ using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; namespace { - // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. @@ -527,11 +528,18 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { - std::string key = kFilterBlockPrefix; - key.append(rep->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep)); + // First try reading full_filter, then reading block_based_filter + for (auto filter_block_prefix : { kFullFilterBlockPrefix, + kFilterBlockPrefix }) { + std::string key = filter_block_prefix; + key.append(rep->filter_policy->Name()); + + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { + rep->filter.reset(ReadFilter(handle, rep, + filter_block_prefix, nullptr)); + break; + } } } } else { @@ -741,9 +749,9 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, - BlockBasedTable::Rep* rep, - size_t* filter_size) { +FilterBlockReader* BlockBasedTable::ReadFilter( + const BlockHandle& filter_handle, BlockBasedTable::Rep* rep, + const std::string& filter_block_prefix, size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. 
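// ---------------------------------------------------------------------------
// [Editor's illustrative sketch -- not part of the patch.] With two filter
// formats in play, the table builder records the filter block in the metaindex
// under either "filter.<PolicyName>" or "fullfilter.<PolicyName>", and the
// reader probes the full-filter name first, exactly as in the Open() loop
// above. A toy metaindex (std::map standing in for the real metaindex block)
// showing that lookup order; FindFilterHandle and ToyBlockHandle are names
// invented for this sketch.
#include <cstdint>
#include <map>
#include <string>

namespace filter_block_sketch {

struct ToyBlockHandle { uint64_t offset; uint64_t size; };

// Returns true and fills *out when a filter entry exists for `policy_name`,
// preferring the full-filter entry over the block-based one.
inline bool FindFilterHandle(const std::map<std::string, ToyBlockHandle>& meta,
                             const std::string& policy_name,
                             ToyBlockHandle* out) {
  const char* prefixes[] = {"fullfilter.", "filter."};
  for (const char* prefix : prefixes) {
    auto it = meta.find(std::string(prefix) + policy_name);
    if (it != meta.end()) {
      *out = it->second;
      return true;
    }
  }
  return false;
}

}  // namespace filter_block_sketch
// ---------------------------------------------------------------------------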
ReadOptions opt; @@ -757,13 +765,25 @@ FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, *filter_size = block.data.size(); } - return new FilterBlockReader( - rep->ioptions.prefix_extractor, rep->table_options, - block.data, block.heap_allocated); + assert(rep->filter_policy); + if (kFilterBlockPrefix == filter_block_prefix) { + return new BlockBasedFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, block.data, block.heap_allocated); + } else if (kFullFilterBlockPrefix == filter_block_prefix) { + auto filter_bits_reader = rep->filter_policy-> + GetFilterBitsReader(block.data); + + if (filter_bits_reader != nullptr) { + return new FullFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, block.data, filter_bits_reader, + block.heap_allocated); + } + } + return nullptr; } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - bool no_io) const { + bool no_io) const { // filter pre-populated if (rep_->filter != nullptr) { return {rep_->filter.get(), nullptr /* cache handle */}; @@ -777,11 +797,9 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // Fetching from the cache char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey( - rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->footer.metaindex_handle(), - cache_key + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->footer.metaindex_handle(), + cache_key ); Statistics* statistics = rep_->ioptions.statistics; @@ -791,8 +809,8 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - filter = reinterpret_cast( - block_cache->Value(cache_handle)); + filter = reinterpret_cast( + block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. 
return CachableEntry(); @@ -803,17 +821,22 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(rep_->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, &filter_size); - assert(filter); - assert(filter_size > 0); - - cache_handle = block_cache->Insert( - key, filter, filter_size, &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); + // First try reading full_filter, then reading block_based_filter + for (auto filter_block_prefix : {kFullFilterBlockPrefix, + kFilterBlockPrefix}) { + std::string filter_block_key = filter_block_prefix; + filter_block_key.append(rep_->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { + filter = ReadFilter(handle, rep_, filter_block_prefix, &filter_size); + + if (filter == nullptr) break; // err happen in ReadFilter + assert(filter_size > 0); + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); + break; + } } } } @@ -918,8 +941,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, - rep->cache_key_prefix_size, handle, cache_key); + key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + handle, cache_key); } if (block_cache_compressed != nullptr) { @@ -1039,42 +1062,50 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // loaded to memory. ReadOptions no_io_read_options; no_io_read_options.read_tier = kBlockCacheTier; - unique_ptr iiter(NewIndexIterator(no_io_read_options)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. Thus, the corresponding data block - // is the only one which could potentially contain the prefix. 
- Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - auto filter_entry = GetFilter(true /* no io */); - may_match = filter_entry.value == nullptr || - filter_entry.value->PrefixMayMatch(handle.offset(), prefix); - filter_entry.Release(rep_->table_options.block_cache.get()); + + // First, try check with full filter + auto filter_entry = GetFilter(true /* no io */); + FilterBlockReader* filter = filter_entry.value; + if (filter != nullptr && !filter->IsBlockBased()) { + may_match = filter->PrefixMayMatch(prefix); + } + + // Then, try find it within each block + if (may_match) { + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter != nullptr && filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only on could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(prefix, handle.offset()); + } } Statistics* statistics = rep_->ioptions.statistics; @@ -1083,6 +1114,7 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } + filter_entry.Release(rep_->table_options.block_cache.get()); return may_match; } @@ -1098,64 +1130,72 @@ Status BlockBasedTable::Get( const Slice& v), void (*mark_key_may_exist_handler)(void* handle_context)) { Status s; - BlockIter iiter; - NewIndexIterator(read_options, &iiter); - auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; - bool done = false; - for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { - Slice handle_value = iiter.value(); - BlockHandle handle; - bool may_not_exist_in_filter = - filter != nullptr && handle.DecodeFrom(&handle_value).ok() && - !filter->KeyMayMatch(handle.offset(), ExtractUserKey(key)); - - if (may_not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. 
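// ---------------------------------------------------------------------------
// [Editor's illustrative sketch -- not part of the patch.] The rewritten read
// path consults the two filter kinds differently: a full filter is checked once
// per lookup against the user key alone (no block offset), while a block-based
// filter can only be checked per data block, using that block's offset, while
// walking the index. A compressed model of that control flow with toy types;
// every name below is invented, and the linear scan over blocks stands in for
// the real index seek, which visits only the candidate block.
#include <cstdint>
#include <string>
#include <vector>

namespace filter_block_sketch {

struct ToyFilter {
  bool block_based;
  bool KeyMayMatch(const std::string& /*key*/, uint64_t /*block_offset*/) const {
    return true;  // a real filter would consult its bits here
  }
};

struct ToyBlock { uint64_t offset; std::vector<std::string> keys; };

inline bool Lookup(const ToyFilter* filter, const std::vector<ToyBlock>& blocks,
                   const std::string& user_key) {
  const uint64_t kNoOffset = ~static_cast<uint64_t>(0);
  if (filter != nullptr && !filter->block_based &&
      !filter->KeyMayMatch(user_key, kNoOffset)) {
    return false;  // full filter says "definitely absent": skip all block reads
  }
  for (size_t i = 0; i < blocks.size(); ++i) {
    if (filter != nullptr && filter->block_based &&
        !filter->KeyMayMatch(user_key, blocks[i].offset)) {
      continue;  // block-based filter rules out just this block
    }
    for (size_t j = 0; j < blocks[i].keys.size(); ++j) {
      if (blocks[i].keys[j] == user_key) return true;
    }
  }
  return false;
}

}  // namespace filter_block_sketch
// ---------------------------------------------------------------------------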
- RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - break; - } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - - if (read_options.read_tier && biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for whether - // we can guarantee the key is not there when "no_io" is set - (*mark_key_may_exist_handler)(handle_context); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + // First check the full filter + // If full filter not useful, Then go into each block + if (filter != nullptr && !filter->IsBlockBased() + && !filter->KeyMayMatch(ExtractUserKey(key))) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + } else { + BlockIter iiter; + NewIndexIterator(read_options, &iiter); - // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + bool done = false; + for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { + Slice handle_value = iiter.value(); - if (!(*result_handler)(handle_context, parsed_key, - biter.value())) { - done = true; + BlockHandle handle; + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(ExtractUserKey(key), handle.offset()); + + if (not_exist_in_filter) { + // Not found + // TODO: think about interaction with Merge. If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + break; + } else { + BlockIter biter; + NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); + + if (read_options.read_tier && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + (*mark_key_may_exist_handler)(handle_context); break; } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + // Call the *saver function on each entry/block until it returns false + for (biter.Seek(key); biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!(*result_handler)(handle_context, parsed_key, + biter.value())) { + done = true; + break; + } + } + s = biter.status(); } - s = biter.status(); + } + if (s.ok()) { + s = iiter.status(); } } filter_entry.Release(rep_->table_options.block_cache.get()); - if (s.ok()) { - s = iiter.status(); - } - return s; } @@ -1175,8 +1215,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key_storage); Slice ckey; s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index b5686d265..503a91bb3 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -28,6 +28,8 @@ class BlockIter; class BlockHandle; class Cache; class FilterBlockReader; +class 
BlockBasedFilterBlockReader; +class FullFilterBlockReader; class Footer; class InternalKeyComparator; class Iterator; @@ -47,6 +49,7 @@ using std::unique_ptr; class BlockBasedTable : public TableReader { public: static const std::string kFilterBlockPrefix; + static const std::string kFullFilterBlockPrefix; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -184,7 +187,9 @@ class BlockBasedTable : public TableReader { // Create the filter from the filter block. static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, size_t* filter_size = nullptr); + Rep* rep, + const std::string& filter_block_prefix, + size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); diff --git a/table/filter_block.h b/table/filter_block.h index efee5ac71..adbb7c496 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -10,86 +10,70 @@ // A filter block is stored near the end of a Table file. It contains // filters (e.g., bloom filters) for all data blocks in the table combined // into a single filter block. +// +// It is a base class for BlockBasedFilter and FullFilter. +// These two are both used in BlockBasedTable. The first one contain filter +// For a part of keys in sst file, the second contain filter for all keys +// in sst file. #pragma once -#include #include #include #include #include +#include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "util/hash.h" namespace rocksdb { -class FilterPolicy; +const uint64_t kNotValid = ULLONG_MAX; // A FilterBlockBuilder is used to construct all of the filters for a // particular Table. It generates a single string which is stored as // a special block in the Table. // // The sequence of calls to FilterBlockBuilder must match the regexp: -// (StartBlock AddKey*)* Finish +// (StartBlock Add*)* Finish +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Comparator* internal_comparator); + explicit FilterBlockBuilder() {} + virtual ~FilterBlockBuilder() {} - void StartBlock(uint64_t block_offset); - void AddKey(const Slice& key); - Slice Finish(); + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter + virtual void Add(const Slice& key) = 0; // Add a key to current filter + virtual Slice Finish() = 0; // Generate Filter private: - bool SamePrefix(const Slice &key1, const Slice &key2) const; - void GenerateFilter(); - - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const Comparator* comparator_; - - std::string entries_; // Flattened entry contents - std::vector start_; // Starting index in entries_ of each entry - std::string result_; // Filter data computed so far - std::vector tmp_entries_; // policy_->CreateFilter() argument - std::vector filter_offsets_; - // No copying allowed FilterBlockBuilder(const FilterBlockBuilder&); void operator=(const FilterBlockBuilder&); }; +// A FilterBlockReader is used to parse filter from SST table. 
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. - FilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Slice& contents, - bool delete_contents_after_use = false); - bool KeyMayMatch(uint64_t block_offset, const Slice& key); - bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); - size_t ApproximateMemoryUsage() const; - - private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - std::unique_ptr filter_data; + explicit FilterBlockReader() {} + virtual ~FilterBlockReader() {} + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) = 0; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) = 0; + virtual size_t ApproximateMemoryUsage() const = 0; - bool MayMatch(uint64_t block_offset, const Slice& entry); + private: + // No copying allowed + FilterBlockReader(const FilterBlockReader&); + void operator=(const FilterBlockReader&); }; -} +} // namespace rocksdb diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc deleted file mode 100644 index 903247e80..000000000 --- a/table/filter_block_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
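// ---------------------------------------------------------------------------
// [Editor's note, illustrative only.] The unified FilterBlockReader interface
// above takes the block offset as a defaulted argument: block-based readers
// require a real offset (they assert block_offset != kNotValid), whereas
// full-filter readers ignore blocks entirely and assert block_offset ==
// kNotValid. A caller-side summary of that contract, with invented names:
#include <cstdint>

namespace filter_block_sketch {

const uint64_t kSketchNotValid = ~static_cast<uint64_t>(0);  // mirrors kNotValid (ULLONG_MAX)

// is_block_based: what FilterBlockReader::IsBlockBased() reports.
// Returns the offset argument a caller should pass to KeyMayMatch/PrefixMayMatch.
inline uint64_t OffsetArgumentFor(bool is_block_based, uint64_t data_block_offset) {
  return is_block_based ? data_block_offset : kSketchNotValid;
}

}  // namespace filter_block_sketch
// ---------------------------------------------------------------------------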
- -#include "table/filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - virtual const char* Name() const { - return "TestHashFilter"; - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest { - public: - const Comparator* comparator_; - BlockBasedTableOptions table_options_; - - FilterBlockTest() - : comparator_(BytewiseComparator()) { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(nullptr, table_options_, comparator_); - Slice block = builder.Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(nullptr, table_options_, block); - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); -} - -TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(nullptr, table_options_, comparator_); - builder.StartBlock(100); - builder.AddKey("foo"); - builder.AddKey("bar"); - builder.AddKey("box"); - builder.StartBlock(200); - builder.AddKey("box"); - builder.StartBlock(300); - builder.AddKey("hello"); - Slice block = builder.Finish(); - FilterBlockReader reader(nullptr, table_options_, block); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); - ASSERT_TRUE(reader.KeyMayMatch(100, "box")); - ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); -} - -TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(nullptr, table_options_, comparator_); - - // First filter - builder.StartBlock(0); - builder.AddKey("foo"); - builder.StartBlock(2000); - builder.AddKey("bar"); - - // Second filter - builder.StartBlock(3100); - builder.AddKey("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.AddKey("box"); - builder.AddKey("hello"); - - Slice block = builder.Finish(); - FilterBlockReader reader(nullptr, table_options_, block); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); - - // Check third filter (empty) - ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); - ASSERT_TRUE(! 
reader.KeyMayMatch(4100, "hello")); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); - ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); - ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar")); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc new file mode 100644 index 000000000..8a481b7d0 --- /dev/null +++ b/table/full_filter_block.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "port/port.h" +#include "util/coding.h" + +namespace rocksdb { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + if (whole_key_filtering_) { + AddKey(key); + } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + filter_bits_builder_->AddKey(prefix); + num_added_++; +} + +Slice FullFilterBlockBuilder::Finish() { + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + FilterBitsReader* filter_bits_reader, bool delete_contents_after_use) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + contents_(contents) { + assert(filter_bits_reader != nullptr); + filter_bits_reader_.reset(filter_bits_reader); + + if (delete_contents_after_use) { + filter_data.reset(contents.data()); + } +} + +bool FullFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key); +} + +bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!prefix_extractor_) { + return true; + } + return MayMatch(prefix); +} + +bool FullFilterBlockReader::MayMatch(const Slice& entry) { + if (contents_.size() != 0) { + return filter_bits_reader_->MayMatch(entry); + } + return true; // remain the same with block_based filter +} + +size_t FullFilterBlockReader::ApproximateMemoryUsage() const { + return contents_.size(); +} +} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h new file mode 100644 index 000000000..24d20e032 --- /dev/null +++ b/table/full_filter_block.h @@ -0,0 +1,107 @@ +// 
Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "db/dbformat.h" +#include "util/hash.h" +#include "table/filter_block.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder); + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t block_offset) override {} + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + uint32_t num_added_; + std::unique_ptr filter_bits_builder_; + std::unique_ptr filter_data; + + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&); + void operator=(const FullFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and filter_bits_reader must stay live + // while *this is live. + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + FilterBitsReader* filter_bits_reader, + bool delete_contents_after_use = false); + + // bits_reader is created in filter_policy, it should be passed in here + // directly. 
and be deleted here + ~FullFilterBlockReader() {} + + virtual bool IsBlockBased() override { return false; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::unique_ptr filter_bits_reader_; + Slice contents_; + std::unique_ptr filter_data; + + bool MayMatch(const Slice& entry); + + // No copying allowed + FullFilterBlockReader(const FullFilterBlockReader&); + void operator=(const FullFilterBlockReader&); +}; + +} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc new file mode 100644 index 000000000..12e783b4a --- /dev/null +++ b/table/full_filter_block_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + virtual void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t len = hash_entries_.size() * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + buf->reset(data); + return Slice(data, len); + } + + private: + std::vector hash_entries_; +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(contents.size()) {} + + virtual bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } + + virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + PluginFullFilterBlockTest() { + 
table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +class FullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + FullFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); + } + + ~FullFilterBlockTest() {} +}; + +TEST(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 49489ed64..b5914554b 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -20,7 +20,6 @@ #include "table/block_builder.h" #include "table/bloom_block.h" #include "table/plain_table_index.h" -#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 3a6d48be8..23e53bcf7 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -20,7 +20,6 @@ #include "table/block.h" #include "table/bloom_block.h" -#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include 
"table/two_level_iterator.h" diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 4ae120c21..7cf7c1cca 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -16,6 +16,7 @@ #include "include/rocksdb/comparator.h" #include "include/rocksdb/table.h" #include "include/rocksdb/slice_transform.h" +#include "include/rocksdb/filter_policy.h" namespace rocksdb { @@ -146,13 +147,30 @@ class SanityTestPlainTableFactory : public SanityTest { Options options_; }; +class SanityTestBloomFilter : public SanityTest { + public: + explicit SanityTestBloomFilter(const std::string& path) + : SanityTest(path) { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options_)); + } + ~SanityTestBloomFilter() {} + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "BloomFilter"; } + + private: + Options options_; + BlockBasedTableOptions table_options_; +}; + namespace { bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), - new SanityTestPlainTableFactory(path)}; + new SanityTestPlainTableFactory(path), + new SanityTestBloomFilter(path)}; if (command == "create") { fprintf(stderr, "Creating...\n"); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index e9955953d..b5c79bf3b 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -209,6 +209,9 @@ static const bool FLAGS_reopen_dummy __attribute__((unused)) = DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. " "Negative means use default settings."); +DEFINE_bool(use_block_based_filter, false, "use block based filter" + "instead of full filter for block based table"); + DEFINE_string(db, "", "Use the db with the following name."); DEFINE_bool(verify_checksum, false, @@ -757,8 +760,10 @@ class StressTest { ? NewLRUCache(FLAGS_compressed_cache_size) : nullptr), filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + ? FLAGS_use_block_based_filter + ? NewBloomFilterPolicy(FLAGS_bloom_bits, true) + : NewBloomFilterPolicy(FLAGS_bloom_bits, false) + : nullptr), db_(nullptr), new_column_family_name_(1), num_times_reopened_(0) { diff --git a/util/bloom.cc b/util/bloom.cc index 723adf843..f19e2a670 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2014, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,42 +10,266 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "util/hash.h" +#include "util/coding.h" namespace rocksdb { +class BlockBasedFilterBlockBuilder; +class FullFilterBlockBuilder; + namespace { +class FullFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), + num_probes_(num_probes) { + assert(bits_per_key_); + } + + ~FullFilterBitsBuilder() {} + + virtual void AddKey(const Slice& key) override { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calulated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + // | ... | + // | | + // +----------------------------------------------------------------+ + // | ... | num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + virtual Slice Finish(std::unique_ptr* buf) override { + char* data = nullptr; + uint32_t total_bits, num_lines; + data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + data[total_bits/8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); + + buf->reset(data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); + } -class BloomFilterPolicy : public FilterPolicy { private: size_t bits_per_key_; - size_t k_; - uint32_t (*hash_func_)(const Slice& key); + size_t num_probes_; + std::vector hash_entries_; - void initialize() { - // We intentionally round down to reduce probing cost a little bit - k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) - if (k_ < 1) k_ = 1; - if (k_ > 30) k_ = 30; + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, + uint32_t total_bits); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&); + void operator=(const FullFilterBitsBuilder&); +}; + +uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, uint32_t* num_lines) { + assert(bits_per_key_); + char* data = nullptr; + if (num_entry != 0) { + uint32_t total_bits_tmp = num_entry * bits_per_key_; + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; } + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + + data = new char[sz]; + memset(data, 0, sz); + return data; +} + +inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, uint32_t total_bits) { + assert(num_lines > 0 && total_bits > 0); + + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + + for (uint32_t i = 0; i < num_probes_; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple operation by compiler. + const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data[bitpos / 8] |= (1 << (bitpos % 8)); + + h += delta; + } +} + +class FullFilterBitsReader : public FilterBitsReader { public: - explicit BloomFilterPolicy(int bits_per_key, - uint32_t (*hash_func)(const Slice& key)) - : bits_per_key_(bits_per_key), hash_func_(hash_func) { - initialize(); + explicit FullFilterBitsReader(const Slice& contents) + : data_(const_cast(contents.data())), + data_len_(contents.size()), + num_probes_(0), num_lines_(0) { + assert(data_); + GetFilterMeta(contents, &num_probes_, &num_lines_); + // Sanitize broken parameter + if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { + num_lines_ = 0; + num_probes_ = 0; + } + } + + ~FullFilterBitsReader() {} + + virtual bool MayMatch(const Slice& entry) override { + if (data_len_ <= 5) { // remain same with original filter + return false; + } + // Other Error params, including a broken filter, regarded as match + if (num_probes_ == 0 || num_lines_ == 0) return true; + uint32_t hash = BloomHash(entry); + return HashMayMatch(hash, Slice(data_, data_len_), + num_probes_, num_lines_); } - explicit BloomFilterPolicy(int bits_per_key) - : bits_per_key_(bits_per_key) { - hash_func_ = BloomHash; + + private: + // Filter meta data + char* data_; + uint32_t data_len_; + size_t num_probes_; + uint32_t num_lines_; + + // Get num_probes, and num_lines from filter + // If filter format broken, set both to 0. + void GetFilterMeta(const Slice& filter, size_t* num_probes, + uint32_t* num_lines); + + // "filter" contains the data appended by a preceding call to + // CreateFilterFromHash() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. + // + // hash: target to be checked + // filter: the whole filter, including meta data bytes + // num_probes: number of probes, read before hand + // num_lines: filter metadata, read before hand + // Before calling this function, need to ensure the input meta data + // is valid. 
+ bool HashMayMatch(const uint32_t& hash, const Slice& filter, + const size_t& num_probes, const uint32_t& num_lines); + + // No Copy allowed + FullFilterBitsReader(const FullFilterBitsReader&); + void operator=(const FullFilterBitsReader&); +}; + +void FullFilterBitsReader::GetFilterMeta(const Slice& filter, + size_t* num_probes, uint32_t* num_lines) { + uint32_t len = filter.size(); + if (len <= 5) { + // filter is empty or broken + *num_probes = 0; + *num_lines = 0; + return; + } + + *num_probes = filter.data()[len - 5]; + *num_lines = DecodeFixed32(filter.data() + len - 4); +} + +bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, + const Slice& filter, const size_t& num_probes, + const uint32_t& num_lines) { + uint32_t len = filter.size(); + if (len <= 5) return false; // remain the same with original filter + + // It is ensured the params are valid before calling it + assert(num_probes != 0); + assert(num_lines != 0 && (len - 5) % num_lines == 0); + uint32_t cache_line_size = (len - 5) / num_lines; + const char* data = filter.data(); + + uint32_t h = hash; + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (cache_line_size * 8); + + for (uint32_t i = 0; i < num_probes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. + const uint32_t bitpos = b + (h % (cache_line_size * 8)); + if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + + h += delta; + } + + return true; +} + +// An implementation of filter policy +class BloomFilterPolicy : public FilterPolicy { + public: + explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) + : bits_per_key_(bits_per_key), hash_func_(BloomHash), + use_block_based_builder_(use_block_based_builder) { initialize(); } - virtual const char* Name() const { + ~BloomFilterPolicy() { + } + + virtual const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + virtual void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { // Compute bloom filter size (in both bits and bytes) size_t bits = n * bits_per_key_; @@ -58,14 +282,14 @@ class BloomFilterPolicy : public FilterPolicy { const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); - dst->push_back(static_cast(k_)); // Remember # of probes in filter + dst->push_back(static_cast(num_probes_)); // Remember # of probes char* array = &(*dst)[init_size]; for (size_t i = 0; i < (size_t)n; i++) { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. 
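      // (Editor's note) Probe j effectively sets bit (h + j * delta) % bits, so
      // each key needs only one hash computation regardless of the probe count.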
uint32_t h = hash_func_(keys[i]); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (size_t j = 0; j < k_; j++) { + for (size_t j = 0; j < num_probes_; j++) { const uint32_t bitpos = h % bits; array[bitpos/8] |= (1 << (bitpos % 8)); h += delta; @@ -73,7 +297,8 @@ class BloomFilterPolicy : public FilterPolicy { } } - virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + virtual bool KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const override { const size_t len = bloom_filter.size(); if (len < 2) return false; @@ -98,11 +323,43 @@ class BloomFilterPolicy : public FilterPolicy { } return true; } + + virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + if (use_block_based_builder_) { + return nullptr; + } + + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new FullFilterBitsReader(contents); + } + + // If choose to use block based builder + bool UseBlockBasedBuilder() { return use_block_based_builder_; } + + private: + size_t bits_per_key_; + size_t num_probes_; + uint32_t (*hash_func_)(const Slice& key); + + const bool use_block_based_builder_; + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; + } }; -} -const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { - return new BloomFilterPolicy(bits_per_key); +} // namespace + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, use_block_based_builder); } } // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 881e3b0f5..275592b70 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -16,12 +16,13 @@ int main() { #else #include +#include #include "rocksdb/filter_policy.h" - #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/arena.h" using GFLAGS::ParseCommandLineFlags; @@ -36,6 +37,19 @@ static Slice Key(int i, char* buffer) { return Slice(buffer, sizeof(i)); } +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + class BloomTest { private: const FilterPolicy* policy_; @@ -43,7 +57,8 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } + BloomTest() : policy_( + NewBloomFilterPolicy(FLAGS_bits_per_key)) {} ~BloomTest() { delete policy_; @@ -117,19 +132,6 @@ TEST(BloomTest, Small) { ASSERT_TRUE(! 
Matches("foo")); } -static int NextLength(int length) { - if (length < 10) { - length += 1; - } else if (length < 100) { - length += 10; - } else if (length < 1000) { - length += 100; - } else { - length += 1000; - } - return length; -} - TEST(BloomTest, VaryingLengths) { char buffer[sizeof(int)]; @@ -171,6 +173,121 @@ TEST(BloomTest, VaryingLengths) { // Different bits-per-byte +class FullBloomTest { + private: + const FilterPolicy* policy_; + std::unique_ptr bits_builder_; + std::unique_ptr bits_reader_; + std::unique_ptr buf_; + size_t filter_size_; + + public: + FullBloomTest() : + policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), + filter_size_(0) { + Reset(); + } + + ~FullBloomTest() { + delete policy_; + } + + void Reset() { + bits_builder_.reset(policy_->GetFilterBitsBuilder()); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void Add(const Slice& s) { + bits_builder_->AddKey(s); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t FilterSize() const { + return filter_size_; + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + } // namespace rocksdb int main(int argc, char** argv) { From a2bb7c3c332f226a435d81845fe53a2a7c63d38f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 11:20:25 -0700 Subject: [PATCH 043/829] Push- instead of pull-model for managing Write stalls Summary: Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). 
Each epoch can either: * proceed with all writes without delay * delay all writes by fixed time * stop all writes The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case). When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal. This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write. Test Plan: make check for now. I'll add some unit tests later. Also, perf test. Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22791 --- HISTORY.md | 5 +- Makefile | 6 +- db/column_family.cc | 130 ++++++++++++++++++------- db/column_family.h | 54 ++--------- db/db_impl.cc | 180 ++++++----------------------------- db/db_impl.h | 17 ++-- db/log_and_apply_bench.cc | 4 +- db/version_set.cc | 8 +- db/version_set.h | 8 +- db/write_controller.cc | 37 +++++++ db/write_controller.h | 78 +++++++++++++++ db/write_controller_test.cc | 40 ++++++++ include/rocksdb/options.h | 4 +- include/rocksdb/statistics.h | 2 +- util/ldb_cmd.cc | 6 +- 15 files changed, 321 insertions(+), 258 deletions(-) create mode 100644 db/write_controller.cc create mode 100644 db/write_controller.h create mode 100644 db/write_controller_test.cc diff --git a/HISTORY.md b/HISTORY.md index 5b144ff3a..ca117b273 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,9 @@ # Rocksdb Change Log -## Unreleased +## Unreleased (will be released with 3.6) + +### Behavior changes +* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. 
----- Past Releases ----- diff --git a/Makefile b/Makefile index c05d82af7..da85ae2fc 100644 --- a/Makefile +++ b/Makefile @@ -112,7 +112,8 @@ TESTS = \ version_edit_test \ version_set_test \ file_indexer_test \ - write_batch_test\ + write_batch_test \ + write_controller_test\ deletefile_test \ table_test \ thread_local_test \ @@ -427,6 +428,9 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/column_family.cc b/db/column_family.cc index 7e06c9bd7..eb2f21e9f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,6 +9,11 @@ #include "db/column_family.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include #include @@ -19,11 +24,42 @@ #include "db/internal_stats.h" #include "db/compaction_picker.h" #include "db/table_properties_collector.h" +#include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" namespace rocksdb { +namespace { +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } else if (n < bottom) { + delay = 0; + } else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. 
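+      // (Editor's worked example, hypothetical trigger values.) With bottom = 8,
+      // top = 12 and n = 10 level-0 files, r = (10 - 8) / (12 - 8) = 0.5, so the
+      // delay is max(0.5 * 0.5 * 1000, 100) = 250 microseconds.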
+ double how_much = static_cast(n - bottom) / (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} +} // namespace + ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex) : cfd_(cfd), db_(db), mutex_(mutex) { @@ -197,7 +233,6 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, next_(nullptr), prev_(nullptr), log_number_(0), - need_slowdown_for_num_level0_files_(false), column_family_set_(column_family_set) { Ref(); @@ -278,31 +313,62 @@ ColumnFamilyData::~ColumnFamilyData() { } void ColumnFamilyData::RecalculateWriteStallConditions() { - need_wait_for_num_memtables_ = - (imm()->size() == options()->max_write_buffer_number - 1); - if (current_ != nullptr) { - need_wait_for_num_level0_files_ = - (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger); - } else { - need_wait_for_num_level0_files_ = false; - } - - RecalculateWriteStallRateLimitsConditions(); -} - -void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { - if (current_ != nullptr) { - exceeds_hard_rate_limit_ = - (options()->hard_rate_limit > 1.0 && - current_->MaxCompactionScore() > options()->hard_rate_limit); - - exceeds_soft_rate_limit_ = - (options()->soft_rate_limit > 0.0 && - current_->MaxCompactionScore() > options()->soft_rate_limit); - } else { - exceeds_hard_rate_limit_ = false; - exceeds_soft_rate_limit_ = false; + const double score = current_->MaxCompactionScore(); + const int max_level = current_->MaxCompactionScoreLevel(); + + auto write_controller = column_family_set_->write_controller_; + + if (imm()->size() == options_.max_write_buffer_number) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); + Log(options_.info_log, + "[%s] Stopping writes because we have %d immutable memtables " + "(waiting for flush)", + name_.c_str(), imm()->size()); + } else if (options_.level0_slowdown_writes_trigger >= 0 && + current_->NumLevelFiles(0) >= + options_.level0_slowdown_writes_trigger) { + uint64_t slowdown = SlowdownAmount( + current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); + Log(options_.info_log, + "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 + "us)", + name_.c_str(), current_->NumLevelFiles(0), slowdown); + } else if (current_->NumLevelFiles(0) >= + options_.level0_stop_writes_trigger) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); + Log(options_.info_log, + "[%s] Stopping writes because we have %d level-0 files", + name_.c_str(), current_->NumLevelFiles(0)); + } else if (options_.hard_rate_limit > 1.0 && + score > options_.hard_rate_limit) { + uint64_t kHardLimitSlowdown = 1000; + write_controller_token_ = + write_controller->GetDelayToken(kHardLimitSlowdown); + internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, + false); + Log(options_.info_log, + "[%s] Stalling writes because we hit hard limit on level %d. 
" + "(%" PRIu64 "us)", + name_.c_str(), max_level, kHardLimitSlowdown); + } else if (options_.soft_rate_limit > 0.0 && + score > options_.soft_rate_limit) { + uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit, + options_.hard_rate_limit); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); + Log(options_.info_log, + "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 + "us)", + name_.c_str(), max_level, slowdown); + } else { + write_controller_token_.reset(); + } } } @@ -310,12 +376,7 @@ const EnvOptions* ColumnFamilyData::soptions() const { return &(column_family_set_->env_options_); } -void ColumnFamilyData::SetCurrent(Version* current) { - current_ = current; - need_slowdown_for_num_level0_files_ = - (options_.level0_slowdown_writes_trigger >= 0 && - current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger); -} +void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } void ColumnFamilyData::CreateNewMemtable() { assert(current_ != nullptr); @@ -328,7 +389,6 @@ void ColumnFamilyData::CreateNewMemtable() { Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { auto result = compaction_picker_->PickCompaction(current_, log_buffer); - RecalculateWriteStallRateLimitsConditions(); return result; } @@ -464,16 +524,18 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, - Cache* table_cache) + Cache* table_cache, + WriteController* write_controller) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, ColumnFamilyOptions(), db_options, - env_options_, nullptr)), + env_options, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), env_options_(env_options), table_cache_(table_cache), + write_controller_(write_controller), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; diff --git a/db/column_family.h b/db/column_family.h index a68189d51..b5363fe30 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -19,6 +19,7 @@ #include "rocksdb/env.h" #include "db/memtable_list.h" #include "db/write_batch_internal.h" +#include "db/write_controller.h" #include "db/table_cache.h" #include "util/thread_local.h" @@ -156,6 +157,7 @@ class ColumnFamilyData { // can't drop default CF assert(id_ != 0); dropped_ = true; + write_controller_token_.reset(); } bool IsDropped() const { return dropped_; } @@ -225,35 +227,12 @@ class ColumnFamilyData { void ResetThreadLocalSuperVersions(); - // A Flag indicating whether write needs to slowdown because of there are - // too many number of level0 files. 
- bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files_; - } - - bool NeedWaitForNumLevel0Files() const { - return need_wait_for_num_level0_files_; - } - - bool NeedWaitForNumMemtables() const { - return need_wait_for_num_memtables_; - } - - bool ExceedsSoftRateLimit() const { - return exceeds_soft_rate_limit_; - } - - bool ExceedsHardRateLimit() const { - return exceeds_hard_rate_limit_; - } - private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, const ColumnFamilyOptions& options, - const DBOptions* db_options, - const EnvOptions& env_options, + const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -262,7 +241,6 @@ class ColumnFamilyData { // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall void RecalculateWriteStallConditions(); - void RecalculateWriteStallRateLimitsConditions(); uint32_t id_; const std::string name_; @@ -304,31 +282,13 @@ class ColumnFamilyData { // recovered from uint64_t log_number_; - // A flag indicating whether we should delay writes because - // we have too many level 0 files - bool need_slowdown_for_num_level0_files_; - - // These 4 variables are updated only after compaction, - // adding new memtable, flushing memtables to files - // and/or add recalculation of compaction score. - // That's why theirs values are cached in ColumnFamilyData. - // Recalculation is made by RecalculateWriteStallConditions and - // RecalculateWriteStallRateLimitsConditions function. They are used - // in DBImpl::MakeRoomForWrite function to decide, if it need - // to sleep during write operation - bool need_wait_for_num_memtables_; - - bool need_wait_for_num_level0_files_; - - bool exceeds_hard_rate_limit_; - - bool exceeds_soft_rate_limit_; - // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr compaction_picker_; ColumnFamilySet* column_family_set_; + + std::unique_ptr write_controller_token_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -370,7 +330,8 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& env_options, Cache* table_cache); + const EnvOptions& env_options, Cache* table_cache, + WriteController* write_controller); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -425,6 +386,7 @@ class ColumnFamilySet { const DBOptions* const db_options_; const EnvOptions env_options_; Cache* table_cache_; + WriteController* write_controller_; std::atomic_flag spin_lock_; }; diff --git a/db/db_impl.cc b/db/db_impl.cc index b83d60f5e..cff2d5a20 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -344,7 +344,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) last_stats_dump_time_microsec_(0), default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), - delayed_writes_(0), env_options_(options), bg_work_gate_closed_(false), refitting_level_(false), @@ -360,8 +359,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, db_options_.table_cache_remove_scan_count_limit); - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get())); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), 
&write_controller_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -3988,6 +3987,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { flush_column_family_if_log_file, total_log_size_, max_total_wal_size); } + if (write_controller_.IsStopped() || write_controller_.GetDelay() > 0) { + DelayWrite(expiration_time); + } + if (LIKELY(single_column_family_mode_)) { // fast path status = MakeRoomForWrite(default_cf_handle_->cfd(), @@ -4189,36 +4192,28 @@ void DBImpl::BuildBatchGroup(Writer** last_writer, } } -// This function computes the amount of time in microseconds by which a write -// should be delayed based on the number of level-0 files according to the -// following formula: -// if n < bottom, return 0; -// if n >= top, return 1000; -// otherwise, let r = (n - bottom) / -// (top - bottom) -// and return r^2 * 1000. -// The goal of this formula is to gradually increase the rate at which writes -// are slowed. We also tried linear delay (r * 1000), but it seemed to do -// slightly worse. There is no other particular reason for choosing quadratic. -uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { - uint64_t delay; - if (n >= top) { - delay = 1000; - } - else if (n < bottom) { - delay = 0; - } - else { - // If we are here, we know that: - // level0_start_slowdown <= n < level0_slowdown - // since the previous two conditions are false. - double how_much = - (double) (n - bottom) / - (top - bottom); - delay = std::max(how_much * how_much * 1000, 100.0); - } - assert(delay <= 1000); - return delay; +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::DelayWrite(uint64_t expiration_time) { + StopWatch sw(env_, stats_, WRITE_STALL); + bool has_timeout = (expiration_time > 0); + auto delay = write_controller_.GetDelay(); + if (write_controller_.IsStopped() == false && delay > 0) { + mutex_.Unlock(); + env_->SleepForMicroseconds(delay); + mutex_.Lock(); + } + + while (write_controller_.IsStopped()) { + if (has_timeout) { + bg_cv_.TimedWait(expiration_time); + if (env_->NowMicros() > expiration_time) { + break; + } + } else { + bg_cv_.Wait(); + } + } } // REQUIRES: mutex_ is held @@ -4228,16 +4223,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, uint64_t expiration_time) { mutex_.AssertHeld(); assert(!writers_.empty()); - bool allow_delay = true; - bool allow_hard_rate_limit_delay = true; - bool allow_soft_rate_limit_delay = true; - uint64_t rate_limit_delay_millis = 0; Status s; - double score; - // Once we schedule background work, we shouldn't schedule it again, since it - // might generate a tight feedback loop, constantly scheduling more background - // work, even if additional background work is not needed - bool schedule_background_work = true; bool has_timeout = (expiration_time > 0); while (true) { @@ -4248,111 +4234,9 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, } else if (has_timeout && env_->NowMicros() > expiration_time) { s = Status::TimedOut(); break; - } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { - // We are getting close to hitting a hard limit on the number of - // L0 files. Rather than delaying a single write by several - // seconds when we hit the hard limit, start delaying each - // individual write by 0-1ms to reduce latency variance. Also, - // this delay hands over some CPU to the compaction thread in - // case it is sharing the same core as the writer. 
- uint64_t slowdown = - SlowdownAmount(cfd->current()->NumLevelFiles(0), - cfd->options()->level0_slowdown_writes_trigger, - cfd->options()->level0_stop_writes_trigger); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, STALL_L0_SLOWDOWN_COUNT, &delayed); - env_->SleepForMicroseconds(slowdown); - } - RecordTick(stats_, STALL_L0_SLOWDOWN_MICROS, delayed); - allow_delay = false; // Do not delay a single write more than once - mutex_.Lock(); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_SLOWDOWN, delayed); - delayed_writes_++; } else if (!cfd->mem()->ShouldFlush()) { // There is room in current memtable - if (allow_delay) { - DelayLoggingAndReset(); - } break; - } else if (cfd->NeedWaitForNumMemtables()) { - // We have filled up the current memtable, but the previous - // ones are still being flushed, so we wait. - DelayLoggingAndReset(); - Log(db_options_.info_log, "[%s] wait for memtable flush...\n", - cfd->GetName().c_str()); - if (schedule_background_work) { - MaybeScheduleFlushOrCompaction(); - schedule_background_work = false; - } - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_MEMTABLE_COMPACTION_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_MEMTABLE_COMPACTION_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::MEMTABLE_COMPACTION, stall); - } else if (cfd->NeedWaitForNumLevel0Files()) { - DelayLoggingAndReset(); - Log(db_options_.info_log, "[%s] wait for fewer level0 files...\n", - cfd->GetName().c_str()); - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_L0_NUM_FILES_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_L0_NUM_FILES_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_NUM_FILES, stall); - } else if (allow_hard_rate_limit_delay && cfd->ExceedsHardRateLimit()) { - // Delay a write when the compaction score for any level is too large. - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, HARD_RATE_LIMIT_DELAY_COUNT, &delayed); - env_->SleepForMicroseconds(1000); - } - // Make sure the following value doesn't round to zero. - uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); - rate_limit_delay_millis += rate_limit; - RecordTick(stats_, RATE_LIMIT_DELAY_MILLIS, rate_limit); - if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && - rate_limit_delay_millis >= - (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { - allow_hard_rate_limit_delay = false; - } - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed, false); - } else if (allow_soft_rate_limit_delay && cfd->ExceedsSoftRateLimit()) { - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - // Delay a write when the compaction score for any level is too large. 
- // TODO: add statistics - uint64_t slowdown = SlowdownAmount(score, cfd->options()->soft_rate_limit, - cfd->options()->hard_rate_limit); - uint64_t elapsed = 0; - mutex_.Unlock(); - { - StopWatch sw(env_, stats_, SOFT_RATE_LIMIT_DELAY_COUNT, &elapsed); - env_->SleepForMicroseconds(slowdown); - rate_limit_delay_millis += slowdown; - } - allow_soft_rate_limit_delay = false; - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, elapsed, true); } else { s = SetNewMemtableAndNewLogFile(cfd, context); if (!s.ok()) { @@ -4383,7 +4267,6 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, mutex_.Unlock(); Status s; { - DelayLoggingAndReset(); if (creating_new_log) { s = env_->NewWritableFile( LogFileName(db_options_.wal_dir, new_log_number), @@ -4595,13 +4478,6 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } } -inline void DBImpl::DelayLoggingAndReset() { - if (delayed_writes_ > 0) { - Log(db_options_.info_log, "delayed %d write...\n", delayed_writes_); - delayed_writes_ = 0; - } -} - #ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( SequenceNumber seq, unique_ptr* iter, diff --git a/db/db_impl.h b/db/db_impl.h index 69fe2eaac..c2bb48597 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -32,6 +32,7 @@ #include "util/thread_local.h" #include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" +#include "db/write_controller.h" namespace rocksdb { @@ -357,9 +358,6 @@ class DBImpl : public DB { Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); - - uint64_t SlowdownAmount(int n, double bottom, double top); - // Information kept for every waiting writer struct Writer { Status status; @@ -399,8 +397,9 @@ class DBImpl : public DB { // See also: BeginWrite void EndWrite(Writer* w, Writer* last_writer, Status status); - Status MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, + void DelayWrite(uint64_t expiration_time); + + Status MakeRoomForWrite(ColumnFamilyData* cfd, WriteContext* context, uint64_t expiration_time); Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, @@ -557,6 +556,8 @@ class DBImpl : public DB { std::deque writers_; WriteBatch tmp_batch_; + WriteController write_controller_; + SnapshotList snapshots_; // cache for ReadFirstRecord() calls @@ -628,9 +629,6 @@ class DBImpl : public DB { static const uint64_t kNoTimeOut = std::numeric_limits::max(); std::string db_absolute_path_; - // count of the number of contiguous delaying writes - int delayed_writes_; - // The options to access storage files const EnvOptions env_options_; @@ -647,9 +645,6 @@ class DBImpl : public DB { DBImpl(const DBImpl&); void operator=(const DBImpl&); - // dump the delayed_writes_ to the log file and reset counter. - void DelayLoggingAndReset(); - // Return the earliest snapshot where seqno is visible. 
// Store the snapshot right before that, if any, in prev_snapshot inline SequenceNumber findEarliestVisibleSnapshot( diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index a5aa95017..60baeb5ec 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -21,6 +21,7 @@ std::string MakeKey(unsigned int num) { void BM_LogAndApply(int iters, int num_base_files) { VersionSet* vset; + WriteController wc; ColumnFamilyData* default_cfd; uint64_t fnum = 1; port::Mutex mu; @@ -47,7 +48,7 @@ void BM_LogAndApply(int iters, int num_base_files) { options.db_paths.emplace_back(dbname, 0); // The parameter of table cache is passed in as null, so any file I/O // operation is likely to fail. - vset = new VersionSet(dbname, &options, sopt, nullptr); + vset = new VersionSet(dbname, &options, sopt, nullptr, &wc); std::vector dummy; dummy.push_back(ColumnFamilyDescriptor()); ASSERT_OK(vset->Recover(dummy)); @@ -69,6 +70,7 @@ void BM_LogAndApply(int iters, int num_base_files) { vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); vset->LogAndApply(default_cfd, &vedit, &mu); } + delete vset; } BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1) diff --git a/db/version_set.cc b/db/version_set.cc index 82183a982..7e9393e3c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1706,9 +1706,10 @@ class VersionSet::Builder { }; VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache) + const EnvOptions& storage_options, Cache* table_cache, + WriteController* write_controller) : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, - table_cache)), + table_cache, write_controller)), env_(options->env), dbname_(dbname), options_(options), @@ -2411,7 +2412,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, std::shared_ptr tc(NewLRUCache( options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); - VersionSet versions(dbname, options, storage_options, tc.get()); + WriteController wc; + VersionSet versions(dbname, options, storage_options, tc.get(), &wc); Status status; std::vector dummy; diff --git a/db/version_set.h b/db/version_set.h index e9747f839..bfb567036 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -34,6 +34,7 @@ #include "db/column_family.h" #include "db/log_reader.h" #include "db/file_indexer.h" +#include "db/write_controller.h" namespace rocksdb { @@ -321,8 +322,8 @@ class Version { // These are used to pick the best compaction level std::vector compaction_score_; std::vector compaction_level_; - double max_compaction_score_; // max score in l1 to ln-1 - int max_compaction_score_level_; // level on which max score occurs + double max_compaction_score_ = 0.0; // max score in l1 to ln-1 + int max_compaction_score_level_ = 0; // level on which max score occurs // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. 
@@ -357,7 +358,8 @@ class Version { class VersionSet { public: VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache); + const EnvOptions& storage_options, Cache* table_cache, + WriteController* write_controller); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that diff --git a/db/write_controller.cc b/db/write_controller.cc new file mode 100644 index 000000000..bb6f8ecf7 --- /dev/null +++ b/db/write_controller.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/write_controller.h" + +#include + +namespace rocksdb { + +std::unique_ptr WriteController::GetStopToken() { + ++total_stopped_; + return std::unique_ptr(new StopWriteToken(this)); +} + +std::unique_ptr WriteController::GetDelayToken( + uint64_t delay_us) { + total_delay_us_ += delay_us; + return std::unique_ptr( + new DelayWriteToken(this, delay_us)); +} + +bool WriteController::IsStopped() const { return total_stopped_ > 0; } +uint64_t WriteController::GetDelay() const { return total_delay_us_; } + +StopWriteToken::~StopWriteToken() { + assert(controller_->total_stopped_ >= 1); + --controller_->total_stopped_; +} + +DelayWriteToken::~DelayWriteToken() { + assert(controller_->total_delay_us_ >= delay_us_); + controller_->total_delay_us_ -= delay_us_; +} + +} // namespace rocksdb diff --git a/db/write_controller.h b/db/write_controller.h new file mode 100644 index 000000000..4ed221df1 --- /dev/null +++ b/db/write_controller.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +#include + +namespace rocksdb { + +class WriteControllerToken; + +// WriteController is controlling write stalls in our write code-path. Write +// stalls happen when compaction can't keep up with write rate. +// All of the methods here (including WriteControllerToken's destructors) need +// to be called while holding DB mutex +class WriteController { + public: + WriteController() : total_stopped_(0), total_delay_us_(0) {} + ~WriteController() = default; + + // When an actor (column family) requests a stop token, all writes will be + // stopped until the stop token is released (deleted) + std::unique_ptr GetStopToken(); + // When an actor (column family) requests a delay token, total delay for all + // writes will be increased by delay_us. 
The delay will last until delay token + // is released + std::unique_ptr GetDelayToken(uint64_t delay_us); + + // these two metods are querying the state of the WriteController + bool IsStopped() const; + uint64_t GetDelay() const; + + private: + friend class WriteControllerToken; + friend class StopWriteToken; + friend class DelayWriteToken; + + int total_stopped_; + uint64_t total_delay_us_; +}; + +class WriteControllerToken { + public: + explicit WriteControllerToken(WriteController* controller) + : controller_(controller) {} + virtual ~WriteControllerToken() = default; + + protected: + WriteController* controller_; + + private: + // no copying allowed + WriteControllerToken(const WriteControllerToken&) = delete; + void operator=(const WriteControllerToken&) = delete; +}; + +class StopWriteToken : public WriteControllerToken { + public: + explicit StopWriteToken(WriteController* controller) + : WriteControllerToken(controller) {} + ~StopWriteToken(); +}; + +class DelayWriteToken : public WriteControllerToken { + public: + DelayWriteToken(WriteController* controller, uint64_t delay_us) + : WriteControllerToken(controller), delay_us_(delay_us) {} + ~DelayWriteToken(); + + private: + uint64_t delay_us_; +}; + +} // namespace rocksdb diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc new file mode 100644 index 000000000..1cec9658d --- /dev/null +++ b/db/write_controller_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "db/write_controller.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class WriteControllerTest {}; + +TEST(WriteControllerTest, SanityTest) { + WriteController controller; + auto stop_token_1 = controller.GetStopToken(); + auto stop_token_2 = controller.GetStopToken(); + + ASSERT_EQ(true, controller.IsStopped()); + stop_token_1.reset(); + ASSERT_EQ(true, controller.IsStopped()); + stop_token_2.reset(); + ASSERT_EQ(false, controller.IsStopped()); + + auto delay_token_1 = controller.GetDelayToken(5); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + auto delay_token_2 = controller.GetDelayToken(8); + ASSERT_EQ(static_cast(13), controller.GetDelay()); + + delay_token_2.reset(); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(static_cast(0), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(false, controller.IsStopped()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index fbb3b6ddb..fc5e039a7 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -347,9 +347,7 @@ struct ColumnFamilyOptions { // Default: 0 (disabled) double hard_rate_limit; - // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then - // there is no limit. - // Default: 1000 + // DEPRECATED -- this options is no longer used unsigned int rate_limit_delay_max_milliseconds; // size of one block in arena memory allocation. 
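
[Editor's note] A minimal sketch of the token semantics this patch introduces, using only the WriteController interface added above (GetStopToken, GetDelayToken, IsStopped, GetDelay). The standalone main() and the literal delay values are hypothetical; the real callers are the column families and DBImpl, and they must hold the DB mutex as the header comment requires.

    #include <cassert>
    #include <memory>
    #include "db/write_controller.h"

    int main() {
      rocksdb::WriteController controller;
      {
        // A column family that hit its stop condition would hold such a token.
        std::unique_ptr<rocksdb::WriteControllerToken> stop =
            controller.GetStopToken();
        assert(controller.IsStopped());
      }  // Token released, e.g. when a flush ends the epoch; writes resume.
      assert(!controller.IsStopped());

      // Delays requested by several column families accumulate.
      auto d1 = controller.GetDelayToken(250);   // e.g. an L0 slowdown
      auto d2 = controller.GetDelayToken(1000);  // e.g. a hard-limit slowdown
      assert(controller.GetDelay() == 1250);
      d2.reset();
      assert(controller.GetDelay() == 250);
      return 0;
    }
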
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index a7f2c1408..87ac321c9 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -212,7 +212,6 @@ enum Histograms : uint32_t { READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_GET_MICROS, WRITE_RAW_BLOCK_MICROS, - STALL_L0_SLOWDOWN_COUNT, STALL_MEMTABLE_COMPACTION_COUNT, STALL_L0_NUM_FILES_COUNT, @@ -220,6 +219,7 @@ enum Histograms : uint32_t { SOFT_RATE_LIMIT_DELAY_COUNT, NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, + WRITE_STALL, HISTOGRAM_ENUM_MAX, }; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 53e15e0ba..9f00757b8 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -564,7 +564,8 @@ void ManifestDumpCommand::DoCommand() { // if VersionSet::DumpManifest() depends on any option done by // SanitizeOptions(), we need to initialize it manually. options.db_paths.emplace_back("dummy", 0); - VersionSet versions(dbname, &options, sopt, tc.get()); + WriteController wc; + VersionSet versions(dbname, &options, sopt, tc.get(), &wc); Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); if (!s.ok()) { printf("Error in processing file %s %s\n", manifestfile.c_str(), @@ -1089,7 +1090,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); - VersionSet versions(db_path_, &opt, soptions, tc.get()); + WriteController wc; + VersionSet versions(db_path_, &opt, soptions, tc.get(), &wc); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); From 011241bb993c0cb8e24660b7f3cc16e815e54670 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 5 Sep 2014 12:01:01 -0700 Subject: [PATCH 044/829] DB::Flush() Do not wait for background threads when there is nothing in mem table Summary: When we have multiple column families, users can issue Flush() on every column families to make sure everything is flushes, even if some of them might be empty. By skipping the waiting for empty cases, it can be greatly speed up. Still wait for people's comments before writing unit tests for it. Test Plan: Will write a unit test to make sure it is correct. 
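
[Editor's note] For illustration only: the caller-side pattern the summary describes, assuming a hypothetical vector of handles and an open DB*. DB::Flush(FlushOptions, ColumnFamilyHandle*) is the existing public API; after this patch, flushing an empty column family returns immediately instead of queueing work for the background threads.

    for (rocksdb::ColumnFamilyHandle* handle : handles) {
      rocksdb::Status s = db->Flush(rocksdb::FlushOptions(), handle);
      assert(s.ok());
    }
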
Reviewers: ljin, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D22953 --- db/db_impl.cc | 8 +++++++- db/db_test.cc | 43 +++++++++++++++++++++++++++++++++++++++++++ db/memtable.cc | 2 +- db/memtable.h | 3 +++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index cff2d5a20..1d72a1ea4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -375,7 +375,7 @@ DBImpl::~DBImpl() { mutex_.Lock(); if (flush_on_destroy_) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->GetFirstSequenceNumber() != 0) { + if (!cfd->mem()->IsEmpty()) { cfd->Ref(); mutex_.Unlock(); FlushMemTable(cfd, FlushOptions()); @@ -1905,6 +1905,12 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, { WriteContext context; MutexLock guard_lock(&mutex_); + + if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) { + // Nothing to flush + return Status::OK(); + } + s = BeginWrite(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job diff --git a/db/db_test.cc b/db/db_test.cc index b30bfd70d..140e87078 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2535,6 +2535,49 @@ class SleepingBackgroundTask { bool done_with_sleep_; }; +TEST(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, + Env::Priority::HIGH); + + Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + CreateAndReopenWithCF({"pikachu"}, &options); + + // Compaction can still go through even if no thread can flush the + // mem table. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + TEST(DBTest, GetProperty) { // Set sizes to both background thread pool to be 1 and block them. env_->SetBackgroundThreads(1, Env::HIGH); diff --git a/db/memtable.cc b/db/memtable.cc index e102575a4..1ed0e2cea 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -414,7 +414,7 @@ static bool SaveValue(void* arg, const char* entry) { bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { // The sequence number is updated synchronously in version_set.h - if (first_seqno_ == 0) { + if (IsEmpty()) { // Avoiding recording stats for speed. 
return false; } diff --git a/db/memtable.h b/db/memtable.h index 2723f30d8..80dcdd42e 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -137,6 +137,9 @@ class MemTable { // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } + // Returns if there is no entry inserted to the mem table. + bool IsEmpty() const { return first_seqno_ == 0; } + // Returns the sequence number of the first element that was inserted // into the memtable SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } From 048560a6421af7de370022e217c098311f7971a6 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 8 Sep 2014 15:04:34 -0700 Subject: [PATCH 045/829] reduce references to cfd->options() in DBImpl Summary: I found it is almost impossible to get rid of this function in a single batch. I will take a step by step approach Test Plan: make release Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22995 --- db/compaction.cc | 10 +- db/db_impl.cc | 103 +++++++++-------- db/db_impl_readonly.cc | 30 ++--- db/db_iter.cc | 32 +++--- db/db_iter.h | 9 +- db/db_iter_test.cc | 168 +++++++++++++++++++++------- db/version_set.cc | 24 ++-- include/rocksdb/immutable_options.h | 10 ++ util/options.cc | 9 +- 9 files changed, 257 insertions(+), 138 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index cf0b682aa..28a3174b0 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -113,8 +113,8 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); + if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; } // Maybe use binary search to find right entry instead of linear search? @@ -177,8 +177,8 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { // Is this compaction producing files at the bottommost level? void Compaction::SetupBottomMostLevel(bool is_manual) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); + if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -270,7 +270,7 @@ void Compaction::Summary(char* output, int len) { uint64_t Compaction::OutputFilePreallocationSize() { uint64_t preallocation_size = 0; - if (cfd_->options()->compaction_style == kCompactionStyleLevel) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { preallocation_size = cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); } else { diff --git a/db/db_impl.cc b/db/db_impl.cc index 1d72a1ea4..54c14b455 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -294,24 +294,24 @@ Status SanitizeDBOptionsByCFOptions( return Status::OK(); } -CompressionType GetCompressionFlush(const Options& options) { +CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { // Compressing memtable flushes might not help unless the sequential load // optimization is used for leveled compaction. 
Otherwise the CPU and // latency overhead is not offset by saving much space. bool can_compress; - if (options.compaction_style == kCompactionStyleUniversal) { + if (ioptions.compaction_style == kCompactionStyleUniversal) { can_compress = - (options.compaction_options_universal.compression_size_percent < 0); + (ioptions.compaction_options_universal.compression_size_percent < 0); } else { // For leveled compress when min_level_to_compress == 0. - can_compress = options.compression_per_level.empty() || - options.compression_per_level[0] != kNoCompression; + can_compress = ioptions.compression_per_level.empty() || + ioptions.compression_per_level[0] != kNoCompression; } if (can_compress) { - return options.compression; + return ioptions.compression; } else { return kNoCompression; } @@ -1424,8 +1424,8 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, s = BuildTable( dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), - cfd->options()->compression_opts, Env::IO_HIGH); + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), + cfd->ioptions()->compression_opts, Env::IO_HIGH); LogFlush(db_options_.info_log); mutex_.Lock(); } @@ -1498,8 +1498,8 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, s = BuildTable( dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(*cfd->options()), - cfd->options()->compression_opts, Env::IO_HIGH); + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), + cfd->ioptions()->compression_opts, Env::IO_HIGH); LogFlush(db_options_.info_log); } Log(db_options_.info_log, @@ -1537,7 +1537,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // threads could be concurrently producing compacted files for // that key range. if (base != nullptr && db_options_.max_background_compactions <= 1 && - cfd->options()->compaction_style == kCompactionStyleLevel) { + cfd->ioptions()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), @@ -1666,8 +1666,8 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, // bottom-most level, the output level will be the same as input one. // level 0 can never be the bottommost level (i.e. if all files are in level // 0, we will compact to level 1) - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO || + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO || (level == max_level_with_files && level > 0)) { s = RunManualCompaction(cfd, level, level, target_path_id, begin, end); } else { @@ -1828,16 +1828,16 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // For universal compaction, we enforce every manual compaction to compact // all files. 
if (begin == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.begin = nullptr; } else { begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.end = nullptr; } else { end_storage = InternalKey(*end, 0, static_cast(0)); @@ -2288,7 +2288,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // file if there is alive snapshot pointing to it assert(c->num_input_files(1) == 0); assert(c->level() == 0); - assert(c->column_family_data()->options()->compaction_style == + assert(c->column_family_data()->ioptions()->compaction_style == kCompactionStyleFIFO); for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); @@ -2371,8 +2371,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. // Universal and FIFO compactions should always compact the whole range - assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal); - assert(m->cfd->options()->compaction_style != kCompactionStyleFIFO); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleUniversal); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } @@ -2465,7 +2465,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { compact->builder.reset(NewTableBuilder( *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), compact->compaction->OutputCompressionType(), - cfd->options()->compression_opts)); + cfd->ioptions()->compression_opts)); } LogFlush(db_options_.info_log); return s; @@ -2640,7 +2640,7 @@ Status DBImpl::ProcessKeyValueCompaction( SequenceNumber visible_in_snapshot = kMaxSequenceNumber; ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( - cfd->user_comparator(), cfd->options()->merge_operator.get(), + cfd->user_comparator(), cfd->ioptions()->merge_operator, db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); auto compaction_filter = cfd->options()->compaction_filter; @@ -3673,21 +3673,21 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& options, +Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE // not supported in lite version return nullptr; #else - // TODO(ljin): remove tailing iterator - auto iter = new ForwardIterator(this, options, cfd); - return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber, options.iterate_upper_bound); -// return new TailingIterator(env_, this, options, cfd); + auto iter = new ForwardIterator(this, read_options, cfd); + return 
NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + cfd->options()->max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -3695,8 +3695,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; // Try to generate a DB iterator tree in continuous memory area to be @@ -3742,19 +3743,22 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - snapshot, options.iterate_upper_bound); + env_, *cfd->ioptions(), cfd->user_comparator(), + snapshot, cfd->options()->max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); Iterator* internal_iter = - NewInternalIterator(options, cfd, sv, db_iter->GetArena()); + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } + // To stop compiler from complaining + return nullptr; } Status DBImpl::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { iterators->clear(); @@ -3763,7 +3767,7 @@ Status DBImpl::NewIterators( std::vector super_versions; super_versions.reserve(column_families.size()); - if (!options.tailing) { + if (!read_options.tailing) { mutex_.Lock(); latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { @@ -3773,17 +3777,18 @@ Status DBImpl::NewIterators( mutex_.Unlock(); } - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( "Tailing interator not supported in RocksDB lite"); #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, options, cfd); + auto iter = new ForwardIterator(this, read_options, cfd); iterators->push_back( - NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber)); + NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + cfd->options()->max_sequential_skip_in_iterations)); } #endif } else { @@ -3792,14 +3797,16 @@ Status DBImpl::NewIterators( auto cfd = cfh->cfd(); auto snapshot = - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? 
reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), snapshot); + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + cfd->options()->max_sequential_skip_in_iterations); Iterator* internal_iter = NewInternalIterator( - options, cfd, super_versions[i], db_iter->GetArena()); + read_options, cfd, super_versions[i], db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -3838,7 +3845,7 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { auto cfh = reinterpret_cast(column_family); - if (!cfh->cfd()->options()->merge_operator) { + if (!cfh->cfd()->ioptions()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { return DB::Merge(o, column_family, key, val); @@ -4814,8 +4821,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { Version* current = cfd->current(); for (int i = 1; i < current->NumberLevels(); ++i) { int num_files = current->NumLevelFiles(i); @@ -4827,7 +4834,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } } - if (cfd->options()->merge_operator != nullptr && + if (cfd->ioptions()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index db0718bd1..86fa0852b 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -69,25 +69,27 @@ Status DBImplReadOnly::Get(const ReadOptions& options, return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); - auto internal_iter = - NewInternalIterator(options, cfd, super_version, db_iter->GetArena()); + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? 
reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + cfd->options()->max_sequential_skip_in_iterations); + auto internal_iter = NewInternalIterator( + read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } Status DBImplReadOnly::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { if (iterators == nullptr) { @@ -100,12 +102,14 @@ Status DBImplReadOnly::NewIterators( for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot); + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + cfd->options()->max_sequential_skip_in_iterations); auto internal_iter = NewInternalIterator( - options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } diff --git a/db/db_iter.cc b/db/db_iter.cc index bfdcd4edb..db86ebc2c 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -58,24 +58,25 @@ class DBIter: public Iterator { kReverse }; - DBIter(Env* env, const Options& options, const Comparator* cmp, - Iterator* iter, SequenceNumber s, bool arena_mode, + DBIter(Env* env, const ImmutableCFOptions& ioptions, + const Comparator* cmp, Iterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, const Slice* iterate_upper_bound = nullptr) : arena_mode_(arena_mode), env_(env), - logger_(options.info_log.get()), + logger_(ioptions.info_log), user_comparator_(cmp), - user_merge_operator_(options.merge_operator.get()), + user_merge_operator_(ioptions.merge_operator), iter_(iter), sequence_(s), direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics.get()), + statistics_(ioptions.statistics), iterate_upper_bound_(iterate_upper_bound) { RecordTick(statistics_, NO_ITERATORS); - prefix_extractor_ = options.prefix_extractor.get(); - max_skip_ = options.max_sequential_skip_in_iterations; + prefix_extractor_ = ioptions.prefix_extractor; + max_skip_ = max_sequential_skip_in_iterations; } virtual ~DBIter() { RecordTick(statistics_, NO_ITERATORS, -1); @@ -636,13 +637,15 @@ void DBIter::SeekToLast() { PrevInternal(); } -Iterator* NewDBIterator(Env* env, const Options& options, +Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, const Comparator* user_key_comparator, Iterator* internal_iter, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, const Slice* iterate_upper_bound) { - return new DBIter(env, options, user_key_comparator, internal_iter, sequence, - false, iterate_upper_bound); + return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, + false, max_sequential_skip_in_iterations, + iterate_upper_bound); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -670,14 +673,17 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, + Env* env, const ImmutableCFOptions& ioptions, + 
const Comparator* user_key_comparator, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, const Slice* iterate_upper_bound) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) DBIter(env, options, user_key_comparator, - nullptr, sequence, true, iterate_upper_bound); + DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, + nullptr, sequence, true, max_sequential_skip_in_iterations, + iterate_upper_bound); iter->SetDBIter(db_iter); diff --git a/db/db_iter.h b/db/db_iter.h index ffea34fa9..c676d6cda 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -24,10 +24,11 @@ class DBIter; // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, - const Options& options, + const ImmutableCFOptions& options, const Comparator *user_key_comparator, Iterator* internal_iter, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, const Slice* iterate_upper_bound = nullptr); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB @@ -68,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence, const Slice* iterate_upper_bound = nullptr); + Env* env, const ImmutableCFOptions& options, + const Comparator* user_key_comparator, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr); } // namespace rocksdb diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 4ce79da1b..2aa30e327 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -158,7 +158,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -191,7 +193,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -232,7 +236,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -262,7 +268,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -288,7 +296,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr 
db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -298,7 +308,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -318,7 +330,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -357,7 +371,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -391,7 +407,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -418,7 +436,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 202)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 202, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -449,7 +469,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -464,7 +486,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 200, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -497,7 +521,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -530,7 +556,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { 
internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -570,7 +598,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -590,7 +620,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -609,7 +641,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -628,7 +662,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -654,7 +690,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -675,7 +713,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -696,7 +736,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -717,7 +759,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 3, + options.max_sequential_skip_in_iterations)); 
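Every test update in this file follows the same pattern: NewDBIterator now takes an ImmutableCFOptions built from the test's Options plus an explicit max_sequential_skip_in_iterations value, which DBIter previously read from Options itself. The parameter roughly bounds how many hidden internal entries the iterator steps over with Next() before it falls back to a Seek. A hypothetical helper like the one below, not part of the patch and assumed to live next to these tests in db_iter_test.cc, captures the new call shape once:

    // Hypothetical test-side helper, not part of the patch.
    std::unique_ptr<Iterator> MakeDBIter(Env* env, const Options& options,
                                         Iterator* internal_iter,
                                         SequenceNumber sequence) {
      return std::unique_ptr<Iterator>(NewDBIterator(
          env, ImmutableCFOptions(options), BytewiseComparator(),
          internal_iter, sequence,
          options.max_sequential_skip_in_iterations));
    }
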
db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -738,7 +782,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -759,7 +805,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -780,7 +828,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -803,7 +853,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -824,7 +876,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -845,7 +899,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -866,7 +922,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 3, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -883,7 +941,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -904,7 +964,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); 
+ NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -925,7 +987,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -960,7 +1024,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -993,7 +1059,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1032,7 +1100,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1071,7 +1141,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 5, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1115,7 +1187,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 6, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1160,7 +1234,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 7, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1199,7 +1275,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 9, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1244,7 +1322,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 13)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), 
internal_iter, 13, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1290,7 +1370,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 14)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 14, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1316,7 +1398,9 @@ TEST(DBIteratorTest, DBIterator) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); diff --git a/db/version_set.cc b/db/version_set.cc index 7e9393e3c..9788137af 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -512,7 +512,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) { auto table_cache = cfd_->table_cache(); - auto options = cfd_->options(); + auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, tp, true /* no io */); @@ -530,10 +530,10 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // directly from the properties block in the file. std::unique_ptr file; if (fname != nullptr) { - s = options->env->NewRandomAccessFile( + s = ioptions->env->NewRandomAccessFile( *fname, &file, vset_->storage_options_); } else { - s = options->env->NewRandomAccessFile( + s = ioptions->env->NewRandomAccessFile( TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()), &file, vset_->storage_options_); @@ -548,11 +548,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, s = ReadTableProperties( file.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, options->info_log.get(), &raw_table_properties); + vset_->env_, ioptions->info_log, &raw_table_properties); if (!s.ok()) { return s; } - RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; @@ -619,7 +619,7 @@ void Version::AddIterators(const ReadOptions& read_options, new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), + cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), &file_levels_[level]), merge_iter_builder->GetArena())); } @@ -735,10 +735,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), merge_operator_((cfd == nullptr) ? nullptr - : cfd->options()->merge_operator.get()), - info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()), + : cfd->ioptions()->merge_operator), + info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log), db_statistics_((cfd == nullptr) ? 
nullptr - : cfd->options()->statistics.get()), + : cfd->ioptions()->statistics), // cfd is nullptr if Version is dummy num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), num_non_empty_levels_(num_levels_), @@ -947,7 +947,7 @@ void Version::ComputeCompactionScore( numfiles++; } } - if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / cfd_->options()->compaction_options_fifo.max_table_files_size; } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { @@ -1016,8 +1016,8 @@ void Version::UpdateNumNonEmptyLevels() { } void Version::UpdateFilesBySize() { - if (cfd_->options()->compaction_style == kCompactionStyleFIFO || - cfd_->options()->compaction_style == kCompactionStyleUniversal) { + if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO || + cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { // don't need this return; } diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index 22084f6f0..be7b095e0 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -17,6 +17,10 @@ namespace rocksdb { struct ImmutableCFOptions { explicit ImmutableCFOptions(const Options& options); + CompactionStyle compaction_style; + + CompactionOptionsUniversal compaction_options_universal; + const SliceTransform* prefix_extractor; const Comparator* comparator; @@ -57,6 +61,12 @@ struct ImmutableCFOptions { bool disable_data_sync; bool use_fsync; + + CompressionType compression; + + std::vector compression_per_level; + + CompressionOptions compression_opts; }; } // namespace rocksdb diff --git a/util/options.cc b/util/options.cc index 371ecda78..a4e83cc78 100644 --- a/util/options.cc +++ b/util/options.cc @@ -33,7 +33,9 @@ namespace rocksdb { ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : prefix_extractor(options.prefix_extractor.get()), + : compaction_style(options.compaction_style), + compaction_options_universal(options.compaction_options_universal), + prefix_extractor(options.prefix_extractor.get()), comparator(options.comparator), merge_operator(options.merge_operator.get()), info_log(options.info_log.get()), @@ -50,7 +52,10 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), min_partial_merge_operands(options.min_partial_merge_operands), disable_data_sync(options.disableDataSync), - use_fsync(options.use_fsync) {} + use_fsync(options.use_fsync), + compression(options.compression), + compression_per_level(options.compression_per_level), + compression_opts(options.compression_opts) {} ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), From 659d2d50c344e07e13d9118d1fe4aec99fd207cb Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 8 Sep 2014 15:09:25 -0700 Subject: [PATCH 046/829] move compaction_filter to immutable_options Summary: all shared_ptrs are in immutable_options now. 
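In other words, ImmutableCFOptions now carries plain pointers to objects whose lifetime is still managed by the shared_ptr members of Options. A minimal sketch of that ownership split, using simplified stand-in types rather than the real RocksDB ones:

    #include <memory>

    struct MergeOperatorLike {};

    struct OptionsLike {
      std::shared_ptr<MergeOperatorLike> merge_operator;  // owning handle
    };

    struct ImmutableLike {
      explicit ImmutableLike(const OptionsLike& o)
          : merge_operator(o.merge_operator.get()) {}  // borrow only
      MergeOperatorLike* merge_operator;  // never deleted through this pointer
    };

    // Implied usage rule: whatever owns the shared_ptrs (normally the Options
    // the column family was opened with) has to stay alive for as long as any
    // ImmutableLike built from it is still in use.
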
This will also make options assignment a little cheaper Test Plan: make release Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23001 --- db/db_impl.cc | 10 +++++----- include/rocksdb/immutable_options.h | 10 +++++++++- util/options.cc | 3 +++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 54c14b455..a7c9206f9 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2643,12 +2643,12 @@ Status DBImpl::ProcessKeyValueCompaction( cfd->user_comparator(), cfd->ioptions()->merge_operator, db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, false /* internal key corruption is expected */); - auto compaction_filter = cfd->options()->compaction_filter; + auto compaction_filter = cfd->ioptions()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; if (!compaction_filter) { auto context = compact->GetFilterContextV1(); compaction_filter_from_factory = - cfd->options()->compaction_filter_factory->CreateCompactionFilter( + cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); compaction_filter = compaction_filter_from_factory.get(); } @@ -3085,8 +3085,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, = nullptr; auto context = compact->GetFilterContext(); compaction_filter_from_factory_v2 = - cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2( - context); + cfd->ioptions()->compaction_filter_factory_v2-> + CreateCompactionFilterV2(context); auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); @@ -3116,7 +3116,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, continue; } else { const SliceTransform* transformer = - cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor(); + cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); const auto key_prefix = transformer->Transform(ikey.user_key); if (!prefix_initialized) { compact->cur_prefix_ = key_prefix.ToString(); diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index be7b095e0..f3e41c89e 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -13,7 +13,9 @@ namespace rocksdb { // ImmutableCFOptions is a data struct used by RocksDB internal. It contains a // subset of Options that should not be changed during the entire lifetime // of DB. You shouldn't need to access this data structure unless you are -// implementing a new TableFactory. +// implementing a new TableFactory. Raw pointers defined in this struct do +// not have ownership to the data they point to. Options contains shared_ptr +// to these data. 
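A minimal usage sketch of that contract, written against the public headers only (illustrative, not part of the patch):

    #include "rocksdb/immutable_options.h"
    #include "rocksdb/options.h"

    // Build the immutable view once from the user's Options and read it from
    // then on; the Options, as owner of the shared_ptr members, stays alive
    // for the lifetime of the view.
    void UseImmutableView(const rocksdb::Options& options) {
      rocksdb::ImmutableCFOptions ioptions(options);
      // Internal readers consult fields such as ioptions.compaction_style,
      // ioptions.compression and ioptions.merge_operator without touching
      // the mutable Options again.
      (void)ioptions;
    }
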
struct ImmutableCFOptions { explicit ImmutableCFOptions(const Options& options); @@ -27,6 +29,12 @@ struct ImmutableCFOptions { MergeOperator* merge_operator; + const CompactionFilter* compaction_filter; + + CompactionFilterFactory* compaction_filter_factory; + + CompactionFilterFactoryV2* compaction_filter_factory_v2; + Logger* info_log; Statistics* statistics; diff --git a/util/options.cc b/util/options.cc index a4e83cc78..4def58ffe 100644 --- a/util/options.cc +++ b/util/options.cc @@ -38,6 +38,9 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) prefix_extractor(options.prefix_extractor.get()), comparator(options.comparator), merge_operator(options.merge_operator.get()), + compaction_filter(options.compaction_filter), + compaction_filter_factory(options.compaction_filter_factory.get()), + compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()), info_log(options.info_log.get()), statistics(options.statistics.get()), env(options.env), From 2d57828d0ecb55eaf7a90b8f13568884d48938c7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 15:23:58 -0700 Subject: [PATCH 047/829] Check stop level trigger-0 before slowdown level-0 trigger Summary: ... Test Plan: Can't repro the test failure, but let's see what jenkins says Reviewers: zagfox, sdong, ljin Reviewed By: sdong, ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23061 --- db/column_family.cc | 14 +++++++------- db/db_test.cc | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index eb2f21e9f..b10b800b4 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -326,6 +326,13 @@ void ColumnFamilyData::RecalculateWriteStallConditions() { "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush)", name_.c_str(), imm()->size()); + } else if (current_->NumLevelFiles(0) >= + options_.level0_stop_writes_trigger) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); + Log(options_.info_log, + "[%s] Stopping writes because we have %d level-0 files", + name_.c_str(), current_->NumLevelFiles(0)); } else if (options_.level0_slowdown_writes_trigger >= 0 && current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger) { @@ -338,13 +345,6 @@ void ColumnFamilyData::RecalculateWriteStallConditions() { "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 "us)", name_.c_str(), current_->NumLevelFiles(0), slowdown); - } else if (current_->NumLevelFiles(0) >= - options_.level0_stop_writes_trigger) { - write_controller_token_ = write_controller->GetStopToken(); - internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); - Log(options_.info_log, - "[%s] Stopping writes because we have %d level-0 files", - name_.c_str(), current_->NumLevelFiles(0)); } else if (options_.hard_rate_limit > 1.0 && score > options_.hard_rate_limit) { uint64_t kHardLimitSlowdown = 1000; diff --git a/db/db_test.cc b/db/db_test.cc index 140e87078..96f7e208a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -7827,6 +7827,26 @@ TEST(DBTest, MTRandomTimeoutTest) { } } +TEST(DBTest, Level0StopWritesTest) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.disable_auto_compactions = 4; + options.max_mem_compaction_level = 0; + Reopen(&options); + + // create 4 level0 tables + for (int i = 0; i < 4; ++i) { + Put("a", "b"); + 
Flush(); + } + + WriteOptions woptions; + woptions.timeout_hint_us = 30 * 1000; // 30 ms + Status s = Put("a", "b", woptions); + ASSERT_TRUE(s.IsTimedOut()); +} + } // anonymous namespace /* From 9b0f7ffa1ca38927897669858836c2c6370d67d1 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 8 Sep 2014 15:25:01 -0700 Subject: [PATCH 048/829] rename version_set options_ to db_options_ to avoid confusion Summary: as title Test Plan: make release Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23007 --- db/version_set.cc | 104 +++++++++++++++++++++++----------------------- db/version_set.h | 22 +++++----- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 9788137af..bd3d1b81c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -514,7 +514,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, + vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, tp, true /* no io */); if (s.ok()) { return s; @@ -531,12 +531,12 @@ Status Version::GetTableProperties(std::shared_ptr* tp, std::unique_ptr file; if (fname != nullptr) { s = ioptions->env->NewRandomAccessFile( - *fname, &file, vset_->storage_options_); + *fname, &file, vset_->env_options_); } else { s = ioptions->env->NewRandomAccessFile( - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()), - &file, vset_->storage_options_); + &file, vset_->env_options_); } if (!s.ok()) { return s; @@ -562,7 +562,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { for (int level = 0; level < num_levels_; level++) { for (const auto& file_meta : files_[level]) { auto fname = - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); // 1. If the table is already present in table cache, load table // properties from there. 
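Returning to the RecalculateWriteStallConditions() change in db/column_family.cc above: stop and slowdown triggers are normally configured with stop >= slowdown, so with the old order the slowdown branch matched first and the hard stop on level-0 file count was effectively unreachable. The corrected decision order, condensed into a standalone sketch with simplified names rather than the real RocksDB code:

    enum class WriteStall { kNone, kSlowdown, kStop };

    // Order matters: test the stop trigger before the slowdown trigger,
    // otherwise a file count past both thresholds only slows writes down
    // instead of stopping them.
    WriteStall ClassifyLevel0(int num_l0_files, int slowdown_trigger,
                              int stop_trigger) {
      if (num_l0_files >= stop_trigger) {
        return WriteStall::kStop;
      }
      if (slowdown_trigger >= 0 && num_l0_files >= slowdown_trigger) {
        return WriteStall::kSlowdown;
      }
      return WriteStall::kNone;
    }

The new Level0StopWritesTest above drives exactly this case: slowdown at 2, stop at 4, auto compactions disabled, and after the fourth level-0 file a Put with a 30 ms timeout hint is expected to time out rather than merely slow down.
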
@@ -584,7 +584,7 @@ size_t Version::GetMemoryUsageByTableReaders() { for (auto& file_level : file_levels_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - vset_->storage_options_, cfd_->internal_comparator(), + vset_->env_options_, cfd_->internal_comparator(), file_level.files[i].fd); } } @@ -864,7 +864,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { Status s = GetTableProperties(&tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { - Log(vset_->options_->info_log, + Log(vset_->db_options_->info_log, "Unable to load table properties for file %" PRIu64 " --- %s\n", file_meta->fd.GetNumber(), s.ToString().c_str()); return false; @@ -1677,7 +1677,7 @@ class VersionSet::Builder { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); cfd_->table_cache()->FindTable( - base_->vset_->storage_options_, cfd_->internal_comparator(), + base_->vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader @@ -1705,14 +1705,14 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache, +VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, + const EnvOptions& env_options, Cache* table_cache, WriteController* write_controller) - : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, + : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options, table_cache, write_controller)), - env_(options->env), + env_(db_options->env), dbname_(dbname), - options_(options), + db_options_(db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() pending_manifest_file_number_(0), @@ -1720,8 +1720,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - storage_options_(storage_options), - storage_options_compactions_(storage_options_) {} + env_options_(env_options), + env_options_compactions_(env_options_) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -1823,7 +1823,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || - manifest_file_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > db_options_->max_manifest_file_size) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_); new_descriptor_log = true; @@ -1851,7 +1851,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Unlock(); - if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { + if (!edit->IsColumnFamilyManipulation() && + db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
builder->LoadTableHandlers(); @@ -1861,15 +1862,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // only one thread can be here at the same time if (new_descriptor_log) { // create manifest file - Log(options_->info_log, + Log(db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); + &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( - options_->manifest_preallocation_size); + db_options_->manifest_preallocation_size); descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); } @@ -1891,18 +1892,19 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } } if (s.ok()) { - if (options_->use_fsync) { - StopWatch sw(env_, options_->statistics.get(), + if (db_options_->use_fsync) { + StopWatch sw(env_, db_options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Fsync(); } else { - StopWatch sw(env_, options_->statistics.get(), + StopWatch sw(env_, db_options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); s = descriptor_log_->file()->Sync(); } } if (!s.ok()) { - Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + Log(db_options_->info_log, "MANIFEST write: %s\n", + s.ToString().c_str()); bool all_records_in = true; for (auto& e : batch_edits) { std::string record; @@ -1913,7 +1915,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } } if (all_records_in) { - Log(options_->info_log, + Log(db_options_->info_log, "MANIFEST contains log record despite error; advancing to new " "version to prevent mismatch between in-memory and logged state" " If paranoid is set, then the db is now in readonly mode."); @@ -1929,7 +1931,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file - Log(options_->info_log, + Log(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care @@ -1943,7 +1945,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - LogFlush(options_->info_log); + LogFlush(db_options_->info_log); mu->Lock(); } @@ -1979,12 +1981,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(options_->info_log, "Error in committing version %lu to [%s]", + Log(db_options_->info_log, "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { - Log(options_->info_log, + Log(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); descriptor_log_.reset(); @@ -2076,13 +2078,13 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(options_->info_log, "Recovering from manifest file: %s\n", + Log(db_options_->info_log, "Recovering from manifest file: %s\n", 
manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; unique_ptr manifest_file; s = env_->NewSequentialFile(manifest_filename, &manifest_file, - storage_options_); + env_options_); if (!s.ok()) { return s; } @@ -2209,7 +2211,7 @@ Status VersionSet::Recover( if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { - Log(options_->info_log, + Log(db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { @@ -2285,7 +2287,7 @@ Status VersionSet::Recover( assert(builders_iter != builders.end()); auto builder = builders_iter->second; - if (options_->max_open_files == -1) { + if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. builder->LoadTableHandlers(); @@ -2306,7 +2308,7 @@ Status VersionSet::Recover( last_sequence_ = last_sequence; prev_log_number_ = prev_log_number; - Log(options_->info_log, + Log(db_options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," @@ -2318,7 +2320,7 @@ Status VersionSet::Recover( column_family_set_->GetMaxColumnFamily()); for (auto cfd : *column_family_set_) { - Log(options_->info_log, + Log(db_options_->info_log, "Column family [%s] (ID %u), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } @@ -2401,7 +2403,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, #ifndef ROCKSDB_LITE Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels) { if (new_levels <= 1) { return Status::InvalidArgument( @@ -2413,7 +2415,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); WriteController wc; - VersionSet versions(dbname, options, storage_options, tc.get(), &wc); + VersionSet versions(dbname, options, env_options, tc.get(), &wc); Status status; std::vector dummy; @@ -2484,7 +2486,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { // Open the specified manifest file. 
unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + Status s = options.env->NewSequentialFile(dscname, &file, env_options_); if (!s.ok()) { return s; } @@ -2726,12 +2728,12 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, const std::string& record) const { std::string fname = DescriptorFileName(dbname_, manifest_file_number); - Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, storage_options_); + Status s = env_->NewSequentialFile(fname, &file, env_options_); if (!s.ok()) { - Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - Log(options_->info_log, + Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); + Log(db_options_->info_log, "ManifestContains: is unable to reopen the manifest file %s", fname.c_str()); return false; @@ -2746,7 +2748,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, break; } } - Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); return result; } @@ -2774,7 +2776,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + ReadOptions(), env_options_, v->cfd_->internal_comparator(), files[i]->fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); @@ -2836,14 +2838,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { const FileLevel* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( - read_options, storage_options_compactions_, + read_options, env_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( - cfd->table_cache(), read_options, storage_options_, + cfd->table_cache(), read_options, env_options_, cfd->internal_comparator(), true /* for_compaction */, false /* prefix enabled */), new Version::LevelFileNumIterator(cfd->internal_comparator(), @@ -2864,7 +2866,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); if (c->input_version() != version) { - Log(options_->info_log, + Log(db_options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", c->column_family_data()->GetName().c_str()); } @@ -2935,11 +2937,11 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); - if (path_id < options_->db_paths.size()) { - filemetadata.db_path = options_->db_paths[path_id].path; + if (path_id < db_options_->db_paths.size()) { + filemetadata.db_path = db_options_->db_paths[path_id].path; } else { - assert(!options_->db_paths.empty()); - filemetadata.db_path = options_->db_paths.back().path; + assert(!db_options_->db_paths.empty()); + filemetadata.db_path = 
db_options_->db_paths.back().path; } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; diff --git a/db/version_set.h b/db/version_set.h index bfb567036..353adbfec 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -256,7 +256,7 @@ class Version { class LevelFileNumIterator; class LevelFileIteratorState; - bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, + bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, const Slice& internal_prefix) const; // Update num_non_empty_levels_. @@ -357,8 +357,8 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache, + VersionSet(const std::string& dbname, const DBOptions* db_options, + const EnvOptions& env_options, Cache* table_cache, WriteController* write_controller); ~VersionSet(); @@ -397,7 +397,7 @@ class VersionSet { // among [4-6] contains files. static Status ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels); // printf contents (for debugging) @@ -506,14 +506,14 @@ class VersionSet { bool ManifestContains(uint64_t manifest_file_number, const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, + ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; - const DBOptions* const options_; + const DBOptions* const db_options_; uint64_t next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; @@ -534,12 +534,12 @@ class VersionSet { std::vector obsolete_files_; - // storage options for all reads and writes except compactions - const EnvOptions& storage_options_; + // env options for all reads and writes except compactions + const EnvOptions& env_options_; - // storage options used for compactions. This is a copy of - // storage_options_ but with readaheads set to readahead_compactions_. - const EnvOptions storage_options_compactions_; + // env options used for compactions. This is a copy of + // env_options_ but with readaheads set to readahead_compactions_. + const EnvOptions env_options_compactions_; // No copying allowed VersionSet(const VersionSet&); From 171d4ff4a29fec153d9e53d44bb39657044927a7 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 8 Sep 2014 15:39:53 -0700 Subject: [PATCH 049/829] remove TailingIterator reference in db_impl.h Summary: as title Test Plan: make release Reviewers: igor Differential Revision: https://reviews.facebook.net/D23073 --- db/db_impl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/db/db_impl.h b/db/db_impl.h index c2bb48597..4d6ba0495 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -316,7 +316,6 @@ class DBImpl : public DB { friend class DB; friend class InternalStats; #ifndef ROCKSDB_LITE - friend class TailingIterator; friend class ForwardIterator; #endif friend struct SuperVersion; From 55114e7f40470bb12bbdadca4b8100e612a41da2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 17:00:43 -0700 Subject: [PATCH 050/829] Some updates for SpatialDB Summary: 1. add db statistics 2. 
write out batch every millionth write Test Plan: unit tests Reviewers: ljin, sdong, yinwang Reviewed By: yinwang Differential Revision: https://reviews.facebook.net/D22755 --- utilities/spatialdb/spatial_db.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index f0aed8faa..cdddbd85b 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -22,6 +22,7 @@ #include "rocksdb/options.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/db.h" #include "rocksdb/utilities/stackable_db.h" @@ -516,6 +517,7 @@ class SpatialDBImpl : public SpatialDB { return Status::InvalidArgument("Spatial indexes can't be empty"); } + const int kWriteOutEveryBytes = 1024 * 1024; // 1MB uint64_t id = next_id_.fetch_add(1); for (const auto& si : spatial_indexes) { @@ -537,6 +539,13 @@ class SpatialDBImpl : public SpatialDB { &key, GetQuadKeyFromTile(x, y, spatial_index.tile_bits)); PutFixed64BigEndian(&key, id); batch.Put(itr->second.column_family, key, Slice()); + if (batch.GetDataSize() >= kWriteOutEveryBytes) { + Status s = Write(write_options, &batch); + batch.Clear(); + if (!s.ok()) { + return s; + } + } } } } @@ -553,6 +562,7 @@ class SpatialDBImpl : public SpatialDB { } virtual Status Compact() override { + // TODO(icanadi) maybe do this in parallel? Status s, t; for (auto& iter : name_to_index_) { t = Flush(FlushOptions(), iter.second.column_family); @@ -625,6 +635,7 @@ class SpatialDBImpl : public SpatialDB { namespace { DBOptions GetDBOptions(const SpatialDBOptions& options) { DBOptions db_options; + db_options.max_open_files = 50000; db_options.max_background_compactions = 3 * options.num_threads / 4; db_options.max_background_flushes = options.num_threads - db_options.max_background_compactions; @@ -632,8 +643,12 @@ DBOptions GetDBOptions(const SpatialDBOptions& options) { Env::LOW); db_options.env->SetBackgroundThreads(db_options.max_background_flushes, Env::HIGH); + db_options.statistics = CreateDBStatistics(); if (options.bulk_load) { + db_options.stats_dump_period_sec = 600; db_options.disableDataSync = true; + } else { + db_options.stats_dump_period_sec = 1800; // 30min } return db_options; } @@ -643,6 +658,8 @@ ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, ColumnFamilyOptions column_family_options; column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB column_family_options.max_write_buffer_number = 4; + column_family_options.max_bytes_for_level_base = 256 * 1024 * 1024; // 256MB + column_family_options.target_file_size_base = 64 * 1024 * 1024; // 64MB column_family_options.level0_file_num_compaction_trigger = 2; column_family_options.level0_slowdown_writes_trigger = 16; column_family_options.level0_slowdown_writes_trigger = 32; From 1d284db2135ecb10d8bcd7cd4c1c10fc9f57f621 Mon Sep 17 00:00:00 2001 From: Naveen Date: Mon, 8 Sep 2014 17:44:52 -0700 Subject: [PATCH 051/829] Addressing review comments --- Makefile | 27 ++++++++++++----------- java/org/rocksdb/NativeLibraryLoader.java | 10 ++++----- java/org/rocksdb/Options.java | 2 +- java/org/rocksdb/RocksDB.java | 9 +++----- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index fd7c8c7d1..4c58e0b0a 100644 --- a/Makefile +++ b/Makefile @@ -245,7 +245,7 @@ unity: unity.cc unity.o clean: -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) 
build_config.mk unity.cc -rm -rf ios-x86/* ios-arm/* - -find . -name "*.[od]" -exec rm {} \; + -find . -name "*.[oda]" -exec rm {} \; -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; tags: ctags * -R @@ -480,29 +480,30 @@ ROCKSDBJNILIB = librocksdbjni.jnilib JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif - -rocksdbjavastatic: - #build zlib +libz.a: + -rm -rf zlib-1.2.8 curl -O http://zlib.net/zlib-1.2.8.tar.gz tar xvzf zlib-1.2.8.tar.gz cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make - cp zlib-1.2.8/libz.a . - rm -rf zlib-1.2.8.tar.gz zlib-1.2.8 - - #build bzip - curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz + cp zlib-1.2.8/libz.a . + +libbz2.a: + -rm -rf bzip2-1.0.6 + curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz tar xvzf bzip2-1.0.6.tar.gz cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' cp bzip2-1.0.6/libbz2.a . - rm -rf bzip2-1.0.6.tar.gz bzip2-1.0.6 - #build snappy +libsnappy.a: + -rm -rf snappy-1.1.1 curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz tar xvzf snappy-1.1.1.tar.gz - cd snappy-1.1.1 && ./configure --with-pic --enable-static + cd snappy-1.1.1 && ./configure --with-pic --enable-static cd snappy-1.1.1 && make cp snappy-1.1.1/.libs/libsnappy.a . - rm -rf snappy-1.1.1 snappy-1.1.1.tar.gz + + +rocksdbjavastatic: libz.a libbz2.a libsnappy.a OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j cd java;$(MAKE) java; rm -f ./java/$(ROCKSDBJNILIB) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index f49f54488..f6b8520f5 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -2,8 +2,7 @@ package org.rocksdb; import java.io.*; -public class NativeLibraryLoader -{ +public class NativeLibraryLoader { private static String sharedLibraryName = "librocksdbjni.so"; private static String tempFilePrefix = "librocksdbjni"; @@ -14,13 +13,14 @@ public class NativeLibraryLoader private NativeLibraryLoader() { } - public static void loadLibraryFromJar() throws IOException { + public static void loadLibraryFromJar() + throws IOException { File temp = File.createTempFile(tempFilePrefix, tempFileSuffix); temp.deleteOnExit(); if (!temp.exists()) { - throw new FileNotFoundException("File " + temp.getAbsolutePath() + " does not exist."); + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); } byte[] buffer = new byte[1024]; @@ -28,7 +28,7 @@ public class NativeLibraryLoader InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName); if (is == null) { - throw new FileNotFoundException(sharedLibraryName + " was not found inside JAR."); + throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); } OutputStream os = new FileOutputStream(temp); diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 420bfebba..4ed0b8025 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -13,7 +13,7 @@ package org.rocksdb; * native resources will be released as part of the process. 
*/ public class Options extends RocksObject { - static{ + static { RocksDB.loadLibrary(); } static final long DEFAULT_CACHE_SIZE = 8 << 20; diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 6825bf3c5..132b9ac39 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -19,8 +19,7 @@ import org.rocksdb.NativeLibraryLoader; * All methods of this class could potentially throw RocksDBException, which * indicates sth wrong at the rocksdb library side and the call failed. */ -public class RocksDB extends RocksObject -{ +public class RocksDB extends RocksObject { public static final int NOT_FOUND = -1; private static final String[] compressionLibs_ = { @@ -35,8 +34,7 @@ public class RocksDB extends RocksObject * Loads the necessary library files. * Calling this method twice will have no effect. */ - public static synchronized void loadLibrary() - { + public static synchronized void loadLibrary() { // loading possibly necessary libraries. for (String lib : compressionLibs_) { try { @@ -45,14 +43,13 @@ public class RocksDB extends RocksObject // since it may be optional, we ignore its loading failure here. } } - try { NativeLibraryLoader.loadLibraryFromJar(); } catch (IOException e) { - e.printStackTrace(); + throw new RuntimeException("Unable to load the RocksDB shared library" + e); } } From 52311463e96c7a8d24009b69752b24608f0cfc89 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 8 Sep 2014 18:46:52 -0700 Subject: [PATCH 052/829] MemTableOptions Summary: removed reference to options in WriteBatch and DBImpl::Get() Test Plan: make all check Reviewers: yhchiang, igor, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23049 --- db/column_family.cc | 3 +- db/db_impl.cc | 15 +++-- db/db_impl_readonly.cc | 11 ++-- db/memtable.cc | 87 +++++++++++++++++------------ db/memtable.h | 34 +++++++++-- db/memtable_list.cc | 5 +- db/memtable_list.h | 2 +- db/repair.cc | 2 +- db/write_batch.cc | 42 +++++++------- db/write_batch_test.cc | 3 +- include/rocksdb/immutable_options.h | 2 + table/table_test.cc | 17 ++++-- util/options.cc | 1 + 13 files changed, 136 insertions(+), 88 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index b10b800b4..94aef3819 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -383,7 +383,8 @@ void ColumnFamilyData::CreateNewMemtable() { if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, options_); + mem_ = new MemTable(internal_comparator_, ioptions_, + MemTableOptions(options_)); mem_->Ref(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index a7c9206f9..fc98b2abd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3434,10 +3434,10 @@ Status DBImpl::GetImpl(const ReadOptions& options, LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { + if (sv->mem->Get(lkey, value, &s, merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { + } else if (sv->imm->Get(lkey, value, &s, merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); } else { @@ -3522,12 +3522,9 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - auto cfd = mgd->cfd; - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if (super_version->mem->Get(lkey, value, &s, 
merge_context)) { // Done - } else if (super_version->imm->Get(lkey, value, &s, merge_context, - *cfd->options())) { + } else if (super_version->imm->Get(lkey, value, &s, merge_context)) { // Done } else { super_version->current->Get(options, lkey, value, &s, &merge_context); @@ -4294,7 +4291,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_mem = new MemTable(cfd->internal_comparator(), + *cfd->ioptions(), + MemTableOptions(*cfd->options())); new_superversion = new SuperVersion(); } } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 86fa0852b..b1fae82cf 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -41,9 +41,9 @@ namespace rocksdb { -DBImplReadOnly::DBImplReadOnly(const DBOptions& options, +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(options, dbname) { + : DBImpl(db_options, dbname) { Log(db_options_.info_log, "Opening the db in read only mode"); } @@ -51,7 +51,7 @@ DBImplReadOnly::~DBImplReadOnly() { } // Implementations of the DB interface -Status DBImplReadOnly::Get(const ReadOptions& options, +Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { Status s; @@ -61,10 +61,9 @@ Status DBImplReadOnly::Get(const ReadOptions& options, SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if (super_version->mem->Get(lkey, value, &s, merge_context)) { } else { - super_version->current->Get(options, lkey, value, &s, &merge_context); + super_version->current->Get(read_options, lkey, value, &s, &merge_context); } return s; } diff --git a/db/memtable.cc b/db/memtable.cc index 1ed0e2cea..d7923711a 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -31,35 +31,51 @@ namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) +MemTableOptions::MemTableOptions(const Options& options) + : write_buffer_size(options.write_buffer_size), + arena_block_size(options.arena_block_size), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + options.memtable_prefix_bloom_huge_page_tlb_size), + inplace_update_support(options.inplace_update_support), + inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + max_successive_merges(options.max_successive_merges), + filter_deletes(options.filter_deletes) {} + +MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableCFOptions& ioptions, + const MemTableOptions& moptions) : comparator_(cmp), + ioptions_(ioptions), + moptions_(moptions), refs_(0), - kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), - kWriteBufferSize(options.write_buffer_size), - arena_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep( - comparator_, &arena_, options.prefix_extractor.get(), - options.info_log.get())), + kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)), + arena_(moptions.arena_block_size), + table_(ioptions.memtable_factory->CreateMemTableRep( + comparator_, &arena_, ioptions.prefix_extractor, + ioptions.info_log)), num_entries_(0), flush_in_progress_(false), 
flush_completed_(false), file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - locks_(options.inplace_update_support ? options.inplace_update_num_locks - : 0), - prefix_extractor_(options.prefix_extractor.get()), + locks_(moptions.inplace_update_support ? moptions.inplace_update_num_locks + : 0), + prefix_extractor_(ioptions.prefix_extractor), should_flush_(ShouldFlushNow()) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. assert(!should_flush_); - if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( &arena_, - options.memtable_prefix_bloom_bits, options.bloom_locality, - options.memtable_prefix_bloom_probes, nullptr, - options.memtable_prefix_bloom_huge_page_tlb_size, - options.info_log.get())); + moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality, + moptions.memtable_prefix_bloom_probes, nullptr, + moptions.memtable_prefix_bloom_huge_page_tlb_size, + ioptions.info_log)); } } @@ -97,14 +113,16 @@ bool MemTable::ShouldFlushNow() const { // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. if (allocated_memory + kArenaBlockSize < - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return false; } - // if user keeps adding entries that exceeds kWriteBufferSize, we need to - // flush earlier even though we still have much available memory left. - if (allocated_memory > - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + // if user keeps adding entries that exceeds moptions.write_buffer_size, + // we need to flush earlier even though we still have much available + // memory left. + if (allocated_memory > moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return true; } @@ -175,12 +193,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: MemTableIterator( - const MemTable& mem, const ReadOptions& options, Arena* arena) + const MemTable& mem, const ReadOptions& read_options, Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), valid_(false), arena_mode_(arena != nullptr) { - if (prefix_extractor_ != nullptr && !options.total_order_seek) { + if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { bloom_ = mem.prefix_bloom_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { @@ -248,10 +266,10 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) { +Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) MemTableIterator(*this, options, arena); + return new (mem) MemTableIterator(*this, read_options, arena); } port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -412,7 +430,7 @@ static bool SaveValue(void* arg, const char* entry) { } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options) { + MergeContext& merge_context) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. 
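// [Editorial note, not part of the patch] A minimal sketch of the pattern the
// MemTableOptions change above applies: copy the handful of fields a component
// needs out of the large Options object into a small immutable struct at
// construction time, so that later calls such as Get() no longer take an
// Options parameter. The names below (FullOptions, SmallTableOptions,
// SmallTable) are hypothetical stand-ins, not RocksDB types.
#include <cstddef>

struct FullOptions {
  size_t write_buffer_size = 4 << 20;
  bool inplace_update_support = false;
};

struct SmallTableOptions {
  explicit SmallTableOptions(const FullOptions& options)
      : write_buffer_size(options.write_buffer_size),
        inplace_update_support(options.inplace_update_support) {}
  size_t write_buffer_size;
  bool inplace_update_support;
};

class SmallTable {
 public:
  explicit SmallTable(const SmallTableOptions& moptions)
      : moptions_(moptions) {}
  // Consults the snapshot taken at construction; no FullOptions needed here.
  bool ShouldFlush(size_t allocated_bytes) const {
    return allocated_bytes > moptions_.write_buffer_size;
  }
 private:
  const SmallTableOptions moptions_;
};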
@@ -437,10 +455,10 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.status = s; saver.mem = this; saver.merge_context = &merge_context; - saver.merge_operator = options.merge_operator.get(); - saver.logger = options.info_log.get(); - saver.inplace_update_support = options.inplace_update_support; - saver.statistics = options.statistics.get(); + saver.merge_operator = ioptions_.merge_operator; + saver.logger = ioptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = ioptions_.statistics; table_->Get(key, &saver, SaveValue); } @@ -512,8 +530,7 @@ void MemTable::Update(SequenceNumber seq, bool MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options) { + const Slice& delta) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -548,8 +565,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, std::string str_value; WriteLock wl(GetLock(lkey.user_key())); - auto status = options.inplace_callback(prev_buffer, &new_prev_size, - delta, &str_value); + auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); if (status == UpdateStatus::UPDATED_INPLACE) { // Value already updated by callback. assert(new_prev_size <= prev_size); @@ -562,12 +579,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq, memcpy(p, prev_buffer, new_prev_size); } } - RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); - RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { diff --git a/db/memtable.h b/db/memtable.h index 80dcdd42e..26772e0f5 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -16,6 +16,7 @@ #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/immutable_options.h" #include "util/arena.h" #include "util/dynamic_bloom.h" @@ -26,6 +27,23 @@ class Mutex; class MemTableIterator; class MergeContext; +struct MemTableOptions { + explicit MemTableOptions(const Options& options); + size_t write_buffer_size; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + bool inplace_update_support; + size_t inplace_update_num_locks; + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + size_t max_successive_merges; + bool filter_deletes; +}; + class MemTable { public: struct KeyComparator : public MemTableRep::KeyComparator { @@ -40,7 +58,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, - const Options& options); + const ImmutableCFOptions& ioptions, + const MemTableOptions& moptions); ~MemTable(); @@ -81,7 +100,7 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. 
- Iterator* NewIterator(const ReadOptions& options, Arena* arena); + Iterator* NewIterator(const ReadOptions& read_options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. @@ -99,7 +118,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext& merge_context); // Attempts to update the new_value inplace, else does normal Add // Pseudocode @@ -123,8 +142,7 @@ class MemTable { // else return false bool UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options); + const Slice& delta); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -172,6 +190,9 @@ class MemTable { const Arena& TEST_GetArena() const { return arena_; } + const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; } + const MemTableOptions* GetMemTableOptions() const { return &moptions_; } + private: // Dynamically check if we can add more incoming entries. bool ShouldFlushNow() const; @@ -181,9 +202,10 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; + const ImmutableCFOptions& ioptions_; + const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; - const size_t kWriteBufferSize; Arena arena_; unique_ptr table_; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 418aae230..ced03dc82 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -62,10 +62,9 @@ int MemTableList::size() const { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext& merge_context, - const Options& options) { + Status* s, MergeContext& merge_context) { for (auto& memtable : memlist_) { - if (memtable->Get(key, value, s, merge_context, options)) { + if (memtable->Get(key, value, s, merge_context)) { return true; } } diff --git a/db/memtable_list.h b/db/memtable_list.h index 997834e78..042ffc5cf 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -46,7 +46,7 @@ class MemTableListVersion { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. 
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext& merge_context); void AddIterators(const ReadOptions& options, std::vector* iterator_list, Arena* arena); diff --git a/db/repair.cc b/db/repair.cc index ea6cdd642..bff81991e 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -219,7 +219,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_); + MemTable* mem = new MemTable(icmp_, ioptions_, MemTableOptions(options_)); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; diff --git a/db/write_batch.cc b/db/write_batch.cc index bfa5e3f6f..cacb4a5e3 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -23,7 +23,6 @@ // data: uint8[len] #include "rocksdb/write_batch.h" -#include "rocksdb/options.h" #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" @@ -350,14 +349,15 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!options->inplace_update_support) { + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); + if (!moptions->inplace_update_support) { mem->Add(sequence_, kTypeValue, key, value); - } else if (options->inplace_callback == nullptr) { + } else if (moptions->inplace_callback == nullptr) { mem->Update(sequence_, key, value); - RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED); } else { - if (mem->UpdateCallback(sequence_, key, value, *options)) { + if (mem->UpdateCallback(sequence_, key, value)) { } else { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; @@ -376,17 +376,17 @@ class MemTableInserter : public WriteBatch::Handler { char* prev_buffer = const_cast(prev_value.c_str()); uint32_t prev_size = prev_value.size(); - auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); + auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. 
mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); } } } @@ -405,17 +405,18 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); bool perform_merge = false; - if (options->max_successive_merges > 0 && db_ != nullptr) { + if (moptions->max_successive_merges > 0 && db_ != nullptr) { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head // of the key in the memtable size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); - if (num_merges >= options->max_successive_merges) { + if (num_merges >= moptions->max_successive_merges) { perform_merge = true; } } @@ -439,16 +440,16 @@ class MemTableInserter : public WriteBatch::Handler { Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = options->merge_operator.get(); + auto merge_operator = ioptions->merge_operator; assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; if (!merge_operator->FullMerge(key, &get_value_slice, operands, - &new_value, options->info_log.get())) { + &new_value, ioptions->info_log)) { // Failed to merge! - RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); + RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; @@ -474,8 +475,9 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!dont_filter_deletes_ && options->filter_deletes) { + auto* ioptions = mem->GetImmutableOptions(); + auto* moptions = mem->GetMemTableOptions(); + if (!dont_filter_deletes_ && moptions->filter_deletes) { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; @@ -486,7 +488,7 @@ class MemTableInserter : public WriteBatch::Handler { cf_handle = db_->DefaultColumnFamily(); } if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); + RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES); return Status::OK(); } } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index aefb01e79..0c69b6af9 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -27,7 +27,8 @@ static std::string PrintContents(WriteBatch* b) { auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - MemTable* mem = new MemTable(cmp, options); + MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options), + MemTableOptions(options)); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index f3e41c89e..de4480cff 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -51,6 +51,8 @@ struct ImmutableCFOptions { std::vector db_paths; + MemTableRepFactory* memtable_factory; + TableFactory* table_factory; Options::TablePropertiesCollectorFactories diff --git a/table/table_test.cc b/table/table_test.cc index a0f844014..118291daa 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -437,21 +437,25 @@ class MemTableConstructor: 
public Constructor { table_factory_(new SkipListFactory) { Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, options); + memtable_ = new MemTable(internal_comparator_, + ImmutableCFOptions(options), + MemTableOptions(options)); memtable_->Ref(); } ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, + virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) { delete memtable_->Unref(); - Options memtable_options; - memtable_options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, memtable_options); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, + ImmutableCFOptions(options), + MemTableOptions(options)); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1859,7 +1863,8 @@ TEST(MemTableTest, Simple) { auto table_factory = std::make_shared(); Options options; options.memtable_factory = table_factory; - MemTable* memtable = new MemTable(cmp, options); + MemTable* memtable = new MemTable(cmp, ImmutableCFOptions(options), + MemTableOptions(options)); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); diff --git a/util/options.cc b/util/options.cc index 4def58ffe..a61d9d633 100644 --- a/util/options.cc +++ b/util/options.cc @@ -47,6 +47,7 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) allow_mmap_reads(options.allow_mmap_reads), allow_mmap_writes(options.allow_mmap_writes), db_paths(options.db_paths), + memtable_factory(options.memtable_factory.get()), table_factory(options.table_factory.get()), table_properties_collector_factories( options.table_properties_collector_factories), From 88841bd007a66417055c1dcad97f86263262de00 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 18:57:40 -0700 Subject: [PATCH 053/829] Explicitly cast char to signed char in Hash() Summary: The compilers we use treat char as signed. However, this is not guarantee of C standard and some compilers (for ARM platform for example), treat char as unsigned. Code that assumes that char is either signed or unsigned is wrong. This change explicitly casts the char to signed version. This will not break any of our use cases on x86, which, I believe are all of them. In case somebody out there is using RocksDB on ARM AND using bloom filters, they're going to have a bad time. However, it is very unlikely that this is the case. Test Plan: sanity test with previous commit (with new sanity test) Reviewers: yhchiang, ljin, sdong Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D22767 --- HISTORY.md | 2 ++ tools/auto_sanity_test.sh | 10 ++++++++++ tools/db_sanity_test.cc | 29 ++++++++++++++--------------- util/hash.cc | 22 +++++++++++++++++----- 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ca117b273..80cac265b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,8 @@ # Rocksdb Change Log ## Unreleased (will be released with 3.6) +### Disk format changes +* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. 
compact the whole database, 3. turn on filter policy ### Behavior changes * We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh index 2d63c0a85..138c855c0 100755 --- a/tools/auto_sanity_test.sh +++ b/tools/auto_sanity_test.sh @@ -37,6 +37,11 @@ echo "Running db sanity check with commits $commit_new and $commit_old." echo "=============================================================" echo "Making build $commit_new" +git checkout $commit_new +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_new" + exit 1 +fi makestuff mv db_sanity_test new_db_sanity_test echo "Creating db based on the new commit --- $commit_new" @@ -44,6 +49,11 @@ echo "Creating db based on the new commit --- $commit_new" echo "=============================================================" echo "Making build $commit_old" +git checkout $commit_old +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_old" + exit 1 +fi makestuff mv db_sanity_test old_db_sanity_test echo "Creating db based on the old commit --- $commit_old" diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 7cf7c1cca..237ef07d0 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -8,15 +8,15 @@ #include #include -#include "include/rocksdb/db.h" -#include "include/rocksdb/options.h" -#include "include/rocksdb/env.h" -#include "include/rocksdb/slice.h" -#include "include/rocksdb/status.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/table.h" -#include "include/rocksdb/slice_transform.h" -#include "include/rocksdb/filter_policy.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/filter_policy.h" namespace rocksdb { @@ -50,7 +50,7 @@ class SanityTest { return s; } } - return Status::OK(); + return db->Flush(FlushOptions()); } Status Verify() { DB* db; @@ -149,10 +149,10 @@ class SanityTestPlainTableFactory : public SanityTest { class SanityTestBloomFilter : public SanityTest { public: - explicit SanityTestBloomFilter(const std::string& path) - : SanityTest(path) { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); - options_.table_factory.reset(NewBlockBasedTableFactory(table_options_)); + explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) { + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } ~SanityTestBloomFilter() {} virtual Options GetOptions() const { return options_; } @@ -160,7 +160,6 @@ class SanityTestBloomFilter : public SanityTest { private: Options options_; - BlockBasedTableOptions table_options_; }; namespace { diff --git a/util/hash.cc b/util/hash.cc index e38c186c3..37eaa4057 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -31,14 +31,26 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { // Pick up remaining bytes switch (limit - data) { + // Note: It would be better if this was cast to unsigned char, but that + // would be a disk format change since we previously didn't have any 
cast + // at all (so gcc used signed char). + // To understand the difference between shifting unsigned and signed chars, + // let's use 250 as an example. unsigned char will be 250, while signed char + // will be -6. Bit-wise, they are equivalent: 11111010. However, when + // converting negative number (signed char) to int, it will be converted + // into negative int (of equivalent value, which is -6), while converting + // positive number (unsigned char) will be converted to 250. Bitwise, + // this looks like this: + // signed char 11111010 -> int 11111111111111111111111111111010 + // unsigned char 11111010 -> int 00000000000000000000000011111010 case 3: - h += data[2] << 16; - // fall through + h += static_cast(static_cast(data[2]) << 16); + // fall through case 2: - h += data[1] << 8; - // fall through + h += static_cast(static_cast(data[1]) << 8); + // fall through case 1: - h += data[0]; + h += static_cast(static_cast(data[0])); h *= m; h ^= (h >> r); break; From 6bb7e3ef253b466baa4bd9df9ff47205f818a284 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 8 Sep 2014 22:24:40 -0700 Subject: [PATCH 054/829] Merger test Summary: I abandoned https://reviews.facebook.net/D18789, but I wrote a good unit test there, so let's check it in. :) Test Plan: this is test Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22827 --- Makefile | 4 + table/merger_test.cc | 197 +++++++++++++++++++++++++++++++++++++++++++ util/testutil.cc | 9 ++ util/testutil.h | 2 + 4 files changed, 212 insertions(+) create mode 100644 table/merger_test.cc diff --git a/Makefile b/Makefile index da85ae2fc..a438230cb 100644 --- a/Makefile +++ b/Makefile @@ -97,6 +97,7 @@ TESTS = \ manual_compaction_test \ memenv_test \ merge_test \ + merger_test \ redis_test \ reduce_levels_test \ plain_table_db_test \ @@ -434,6 +435,9 @@ write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) diff --git a/table/merger_test.cc b/table/merger_test.cc new file mode 100644 index 000000000..3a10527f4 --- /dev/null +++ b/table/merger_test.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
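// [Editorial note, not part of the patch] Conceptually, the merging iterator
// exercised by this test performs a k-way merge over several individually
// sorted sources, which the test verifies by comparing it against a single
// iterator over all keys. A minimal standalone sketch of such a merge using a
// min-heap; the real MergingIterator API differs, this only shows the idea.
#include <queue>
#include <string>
#include <utility>
#include <vector>

std::vector<std::string> KWayMerge(
    const std::vector<std::vector<std::string> >& sorted_inputs) {
  typedef std::pair<std::string, size_t> Entry;  // (key, index of its source)
  struct MinOnKey {
    bool operator()(const Entry& a, const Entry& b) const {
      return a.first > b.first;  // invert so priority_queue acts as a min-heap
    }
  };
  std::priority_queue<Entry, std::vector<Entry>, MinOnKey> heap;
  std::vector<size_t> pos(sorted_inputs.size(), 0);
  // Seed the heap with the first key of every non-empty source.
  for (size_t i = 0; i < sorted_inputs.size(); ++i) {
    if (!sorted_inputs[i].empty()) {
      heap.push(Entry(sorted_inputs[i][0], i));
    }
  }
  std::vector<std::string> merged;
  while (!heap.empty()) {
    Entry top = heap.top();
    heap.pop();
    merged.push_back(top.first);
    size_t src = top.second;
    if (++pos[src] < sorted_inputs[src].size()) {
      heap.push(Entry(sorted_inputs[src][pos[src]], src));
    }
  }
  return merged;  // same contents as sorting the concatenation of all inputs
}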
+ +#include +#include +#include + +#include "rocksdb/iterator.h" +#include "table/merger.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class VectorIterator : public Iterator { + public: + explicit VectorIterator(const std::vector& keys) + : keys_(keys), current_(keys.size()) { + std::sort(keys_.begin(), keys_.end()); + } + + virtual bool Valid() const { return current_ < keys_.size(); } + + virtual void SeekToFirst() { current_ = 0; } + virtual void SeekToLast() { current_ = keys_.size() - 1; } + + virtual void Seek(const Slice& target) { + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + virtual void Next() { current_++; } + virtual void Prev() { current_--; } + + virtual Slice key() const { return Slice(keys_[current_]); } + virtual Slice value() const { return Slice(); } + + virtual Status status() const { return Status::OK(); } + + private: + std::vector keys_; + size_t current_; +}; + +class MergerTest { + public: + MergerTest() + : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {} + ~MergerTest() = default; + std::vector GenerateStrings(int len, int string_len) { + std::vector ret; + for (int i = 0; i < len; ++i) { + ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); + } + return ret; + } + + void AssertEquivalence() { + auto a = merging_iterator_.get(); + auto b = single_iterator_.get(); + if (!a->Valid()) { + ASSERT_TRUE(!b->Valid()); + } else { + ASSERT_TRUE(b->Valid()); + ASSERT_EQ(b->key().ToString(), a->key().ToString()); + ASSERT_EQ(b->value().ToString(), a->value().ToString()); + } + } + + void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); } + + void Seek(std::string target) { + merging_iterator_->Seek(target); + single_iterator_->Seek(target); + } + + void SeekToFirst() { + merging_iterator_->SeekToFirst(); + single_iterator_->SeekToFirst(); + } + + void SeekToLast() { + merging_iterator_->SeekToLast(); + single_iterator_->SeekToLast(); + } + + void Next(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Next(); + single_iterator_->Next(); + } + AssertEquivalence(); + } + + void Prev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Prev(); + single_iterator_->Prev(); + } + AssertEquivalence(); + } + + void NextAndPrev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + if (rnd_.OneIn(2)) { + merging_iterator_->Prev(); + single_iterator_->Prev(); + } else { + merging_iterator_->Next(); + single_iterator_->Next(); + } + } + AssertEquivalence(); + } + + void Generate(size_t num_iterators, size_t strings_per_iterator, + size_t letters_per_string) { + std::vector small_iterators; + for (size_t i = 0; i < num_iterators; ++i) { + auto strings = GenerateStrings(strings_per_iterator, letters_per_string); + small_iterators.push_back(new VectorIterator(strings)); + all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); + } + + merging_iterator_.reset(NewMergingIterator( + BytewiseComparator(), &small_iterators[0], small_iterators.size())); + single_iterator_.reset(new VectorIterator(all_keys_)); + } + + Random rnd_; + std::unique_ptr merging_iterator_; + std::unique_ptr single_iterator_; + std::vector all_keys_; +}; + +TEST(MergerTest, SeekToRandomNextTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); 
+ AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomNextSmallStringsTest) { + Generate(1000, 50, 2); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomPrevTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Prev(50000); + } +} + +TEST(MergerTest, SeekToRandomRandomTest) { + Generate(200, 50, 50); + for (int i = 0; i < 3; ++i) { + SeekToRandom(); + AssertEquivalence(); + NextAndPrev(5000); + } +} + +TEST(MergerTest, SeekToFirstTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToFirst(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToLastTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToLast(); + AssertEquivalence(); + Prev(50000); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/testutil.cc b/util/testutil.cc index 363b8ff19..20f22c2dc 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -23,6 +23,15 @@ Slice RandomString(Random* rnd, int len, std::string* dst) { return Slice(*dst); } +extern std::string RandomHumanReadableString(Random* rnd, int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast('a' + rnd->Uniform(26)); + } + return ret; +} + std::string RandomKey(Random* rnd, int len) { // Make sure to generate a wide variety of characters so we // test the boundary conditions for short-key optimizations. diff --git a/util/testutil.h b/util/testutil.h index c615fc1e7..eff0d7e7d 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -21,6 +21,8 @@ namespace test { // references the generated data. extern Slice RandomString(Random* rnd, int len, std::string* dst); +extern std::string RandomHumanReadableString(Random* rnd, int len); + // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). extern std::string RandomKey(Random* rnd, int len); From d343c3fe46c2b1db47ca76ee6244c0d07e329c38 Mon Sep 17 00:00:00 2001 From: Stanislau Hlebik Date: Tue, 9 Sep 2014 11:18:50 -0700 Subject: [PATCH 055/829] Improve db recovery Summary: Avoid creating unnecessary sst files while db opening Test Plan: make all check Reviewers: sdong, igor Reviewed By: igor Subscribers: zagfox, yhchiang, ljin, leveldb Differential Revision: https://reviews.facebook.net/D20661 --- db/db_impl.cc | 211 +++++++++++++++++++++++++++----------------------- db/db_impl.h | 5 +- db/db_test.cc | 130 ++++++++++++++++++++++++++++++- 3 files changed, 243 insertions(+), 103 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index fc98b2abd..0708c49a8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1219,14 +1219,16 @@ Status DBImpl::Recover( "flag but a log file already exists"); } - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - for (const auto& log : logs) { - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log); - s = RecoverLogFile(log, &max_sequence, read_only); + if (!logs.empty()) { + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + s = RecoverLogFiles(logs, &max_sequence, read_only); + if (!s.ok()) { + // Clear memtables if recovery failed + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->CreateNewMemtable(); + } + } } SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); } @@ -1239,8 +1241,9 @@ Status DBImpl::Recover( return s; } -Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only) { +// REQUIRES: log_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -1256,7 +1259,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, }; mutex_.AssertHeld(); - + Status status; std::unordered_map version_edits; // no need to refcount because iteration is under mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1265,102 +1268,113 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, version_edits.insert({cfd->GetID(), edit}); } - // Open the log file - std::string fname = LogFileName(db_options_.wal_dir, log_number); - unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, env_options_); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = db_options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = (db_options_.paranoid_checks && - !db_options_.skip_log_error_on_recovery ? &status - : nullptr); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); - Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption(record.size(), - Status::Corruption("log record too small")); - continue; + for (auto log_number : log_numbers) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsed(log_number); + // Open the log file + std::string fname = LogFileName(db_options_.wal_dir, log_number); + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } } - WriteBatchInternal::SetContents(&batch, record); - // If column family was not found, it might mean that the WAL write - // batch references to the column family that was dropped after the - // insert. We don't want to fail the whole write batch in that case -- we - // just ignore the update. 
That's why we set ignore missing column families - // to true - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), - true /* ignore missing column families */, log_number); + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = + (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery + ? &status + : nullptr); + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). + log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + while (reader.ReadRecord(&record, &scratch)) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); - MaybeIgnoreError(&status); - if (!status.ok()) { - return status; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. + // That's why we set ignore missing column families to true + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), true, log_number); - if (!read_only) { - // no need to refcount since client still doesn't have access - // to the DB and can not drop column families while we iterate - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->ShouldFlush()) { - // If this asserts, it means that InsertInto failed in - // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); - auto iter = version_edits.find(cfd->GetID()); - assert(iter != version_edits.end()); - VersionEdit* edit = &iter->second; - status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. 
- return status; + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } + const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (!read_only) { + // no need to refcount since client still doesn't have access + // to the DB and can not drop column families while we iterate + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->mem()->ShouldFlush()) { + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= log_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; + } + cfd->CreateNewMemtable(); } } } } - } - if (versions_->LastSequence() < *max_sequence) { - versions_->SetLastSequence(*max_sequence); + if (versions_->LastSequence() < *max_sequence) { + versions_->SetLastSequence(*max_sequence); + } } if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate + auto max_log_number = log_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > log_number) { + if (cfd->GetLogNumber() > max_log_number) { // Column family cfd has already flushed the data - // from log_number. Memtable has to be empty because + // from all logs. Memtable has to be empty because // we filter the updates based on log_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); @@ -1371,28 +1385,29 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - } - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); - if (!status.ok()) { - return status; + if (!status.ok()) { + // Recovery failed + break; + } + cfd->CreateNewMemtable(); } // write MANIFEST with update - // writing log number in the manifest means that any log file + // writing log_number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. - // Since we already recovered log_number, we want all logs - // with numbers `<= log_number` (includes this one) to be ignored - edit->SetLogNumber(log_number + 1); + // Since we already recovered max_log_number, we want all logs + // with numbers `<= max_log_number` (includes this one) to be ignored + edit->SetLogNumber(max_log_number + 1); // we must mark the next log number as used, even though it's // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(log_number + 1); + versions_->MarkFileNumberUsed(max_log_number + 1); status = versions_->LogAndApply(cfd, edit, &mutex_); if (!status.ok()) { - return status; + // Recovery failed + break; } } } diff --git a/db/db_impl.h b/db/db_impl.h index 4d6ba0495..cf7914fec 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -344,8 +344,9 @@ class DBImpl : public DB { DeletionState& deletion_state, LogBuffer* log_buffer); - Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only); + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used atdatabase RecoveryTime (when the diff --git a/db/db_test.cc b/db/db_test.cc index 96f7e208a..92d623468 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6120,18 +6120,18 @@ namespace { std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; - std::vector log_files; + std::vector file_numbers; env->GetChildren(path, &files); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { - log_files.push_back(number); + file_numbers.push_back(number); } } } - return std::move(log_files); + return std::move(file_numbers); } std::vector ListLogFiles(Env* env, const std::string& path) { @@ -6141,6 +6141,17 @@ std::vector ListLogFiles(Env* env, const std::string& path) { std::vector ListTableFiles(Env* env, const std::string& path) { return ListSpecificFiles(env, path, kTableFile); } + +std::uint64_t GetNumberOfSstFilesForColumnFamily( + DB* db, std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} } // namespace TEST(DBTest, FlushOneColumnFamily) { @@ -6165,6 +6176,119 @@ TEST(DBTest, FlushOneColumnFamily) { } } +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it was empty. 
Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { + Options options; + options.write_buffer_size = 5000000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + + // Since we will reopen DB with smaller write_buffer_size, + // each key will go to new SST file + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + + ASSERT_OK(Put(3, Key(10), DummyString(1))); + // Make 'dobrynia' to be flushed and new WAL file to be created + ASSERT_OK(Put(2, Key(10), DummyString(7500000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), 1); + // Make sure 'dobrynia' was flushed: check sst files amount + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1); + } + // New WAL file + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + + options.write_buffer_size = 10; + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + &options); + { + // No inserts => default is empty + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 0); + // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 5); + // 1 SST for big key + 1 SST for small one + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 2); + // 1 SST for all keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1); + } +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it wasn't empty. 
Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmount) { + Options options; + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Make 'nikitich' memtable to be flushed + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // 4 memtable are not flushed, 1 sst file + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1); + } + // Memtable for 'nikitich' has flushed, new WAL file has opened + // 4 memtable still not flushed + + // Write to new WAL file + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Fill up 'nikitich' one more time + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + // make it flush + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // There are still 4 memtable not flushed, and 2 sst tables + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), 2); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 2); + } + + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + &options); + { + std::vector table_files = ListTableFiles(env_, dbname_); + // Check, that records for 'default', 'dobrynia' and 'pikachu' from + // first, second and third WALs went to the same SST. + // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for + // 'dobrynia', one for 'pikachu' + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 3); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1); + } +} + TEST(DBTest, WALArchivalTtl) { do { Options options = CurrentOptions(); From 06d986252a6e998251f3bb921c872ef13d61b51f Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 8 Sep 2014 17:45:06 -0700 Subject: [PATCH 056/829] Always pass MergeContext as pointer, not reference Summary: To follow the coding convention and make sure when passing reference as a parameter it is also const, pass MergeContext as a pointer to mem tables. 
Test Plan: make all check Reviewers: ljin, igor Reviewed By: igor Subscribers: leveldb, dhruba, yhchiang Differential Revision: https://reviews.facebook.net/D23085 --- db/db_impl.cc | 8 ++++---- db/db_impl_readonly.cc | 2 +- db/memtable.cc | 4 ++-- db/memtable.h | 2 +- db/memtable_list.cc | 2 +- db/memtable_list.h | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 0708c49a8..f6634b6c4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3449,10 +3449,10 @@ Status DBImpl::GetImpl(const ReadOptions& options, LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, merge_context)) { + if (sv->mem->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, merge_context)) { + } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); } else { @@ -3537,9 +3537,9 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - if (super_version->mem->Get(lkey, value, &s, merge_context)) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { // Done - } else if (super_version->imm->Get(lkey, value, &s, merge_context)) { + } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { // Done } else { super_version->current->Get(options, lkey, value, &s, &merge_context); diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index b1fae82cf..8cea58736 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -61,7 +61,7 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if (super_version->mem->Get(lkey, value, &s, merge_context)) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { } else { super_version->current->Get(read_options, lkey, value, &s, &merge_context); } diff --git a/db/memtable.cc b/db/memtable.cc index d7923711a..23cc62270 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -430,7 +430,7 @@ static bool SaveValue(void* arg, const char* entry) { } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context) { + MergeContext* merge_context) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -454,7 +454,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.value = value; saver.status = s; saver.mem = this; - saver.merge_context = &merge_context; + saver.merge_context = merge_context; saver.merge_operator = ioptions_.merge_operator; saver.logger = ioptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; diff --git a/db/memtable.h b/db/memtable.h index 26772e0f5..0371dc3cf 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -118,7 +118,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. 
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context); + MergeContext* merge_context); // Attempts to update the new_value inplace, else does normal Add // Pseudocode diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ced03dc82..728b1c0a0 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -62,7 +62,7 @@ int MemTableList::size() const { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext& merge_context) { + Status* s, MergeContext* merge_context) { for (auto& memtable : memlist_) { if (memtable->Get(key, value, s, merge_context)) { return true; diff --git a/db/memtable_list.h b/db/memtable_list.h index 042ffc5cf..92688825a 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -46,7 +46,7 @@ class MemTableListVersion { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context); + MergeContext* merge_context); void AddIterators(const ReadOptions& options, std::vector* iterator_list, Arena* arena); From 0a42295a248742fe5058492095d4ea59e543aa34 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 9 Sep 2014 11:50:05 -0700 Subject: [PATCH 057/829] Fix SimpleWriteTimeoutTest Summary: In column family's SanitizeOptions() [1], we make sure that min_write_buffer_number_to_merge is normal value. However, this test depended on the fact that setting min_write_buffer_number_to_merge to be bigger than max_write_buffer_number will cause a deadlock. I'm not sure how it worked before. This diff fixes it by scheduling sleeping background task, which will actually block any attempts of flushing. 
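Roughly, such a helper parks the only LOW-priority background thread on a condition variable until the test releases it. The sketch below shows the general shape using standard C++ synchronization; it is not the exact SleepingBackgroundTask implementation from db_test.cc.

#include <condition_variable>
#include <mutex>

// Sketch of a task that occupies the single LOW-priority background thread
// until the test wakes it, so no flush or compaction can run meanwhile.
class SleepingTaskSketch {
 public:
  static void DoSleepTask(void* arg) {
    static_cast<SleepingTaskSketch*>(arg)->Sleep();
  }
  void Sleep() {
    std::unique_lock<std::mutex> l(mu_);
    while (!should_wake_) cv_.wait(l);   // hold the thread until WakeUp()
    done_ = true;
    cv_.notify_all();
  }
  void WakeUp() {
    std::lock_guard<std::mutex> l(mu_);
    should_wake_ = true;
    cv_.notify_all();
  }
  void WaitUntilDone() {
    std::unique_lock<std::mutex> l(mu_);
    while (!done_) cv_.wait(l);
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool should_wake_ = false;
  bool done_ = false;
};
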
[1] https://github.com/facebook/rocksdb/blob/master/db/column_family.cc#L104 Test Plan: the test works now Reviewers: yhchiang, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23103 --- db/db_test.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 92d623468..2c06c241d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -7832,25 +7832,39 @@ TEST(DBTest, FIFOCompactionTest) { } TEST(DBTest, SimpleWriteTimeoutTest) { + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + Options options; options.env = env_; options.create_if_missing = true; options.write_buffer_size = 100000; options.max_background_flushes = 0; options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 3; options.max_total_wal_size = std::numeric_limits::max(); WriteOptions write_opt = WriteOptions(); write_opt.timeout_hint_us = 0; DestroyAndReopen(&options); - // fill the two write buffer + // fill the two write buffers ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); + // this will switch the previous memtable, but will not cause block because + // DelayWrite() is called before MakeRoomForWrite() + // TODO(icanadi) remove this as part of https://reviews.facebook.net/D23067 + ASSERT_OK(Put(Key(3), Key(3), write_opt)); // As the only two write buffers are full in this moment, the third // Put is expected to be timed-out. write_opt.timeout_hint_us = 50; ASSERT_TRUE( Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut()); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } // Multi-threaded Timeout Test From 6cc12860f0cb90d502cdf18d7c4991909412d0ae Mon Sep 17 00:00:00 2001 From: Xiaozheng Tie Date: Tue, 9 Sep 2014 13:44:42 -0700 Subject: [PATCH 058/829] Added a few statistics for BackupableDB Summary: Added the following statistics to BackupableDB: 1. Number of successful and failed backups in class BackupStatistics 2. Time taken to do a backup 3. 
Number of files in a backup 1 is implemented in the BackupStatistics class 2 and 3 are added in the BackupMeta and BackupInfo class Test Plan: 1 can be tested using BackupStatistics::ToString(), 2 and 3 can be tested in the BackupInfo class Reviewers: sdong, igor2, ljin, igor Reviewed By: igor Differential Revision: https://reviews.facebook.net/D22785 --- include/rocksdb/utilities/backupable_db.h | 36 +++++++++++++++- utilities/backupable/backupable_db.cc | 50 ++++++++++++++++++++++- 2 files changed, 82 insertions(+), 4 deletions(-) diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index bf3f919ae..57a8accdf 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -130,9 +130,41 @@ struct BackupInfo { int64_t timestamp; uint64_t size; + uint32_t number_files; + BackupInfo() {} - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) - : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files) + : backup_id(_backup_id), timestamp(_timestamp), size(_size), + number_files(_number_files) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; }; class BackupEngineReadOnly { diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 4d1a9b76b..71248b47c 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -72,6 +72,27 @@ class BackupRateLimiter { }; } // namespace +void BackupStatistics::IncrementNumberSuccessBackup() { + number_success_backup++; +} +void BackupStatistics::IncrementNumberFailBackup() { + number_fail_backup++; +} + +uint32_t BackupStatistics::GetNumberSuccessBackup() const { + return number_success_backup; +} +uint32_t BackupStatistics::GetNumberFailBackup() const { + return number_fail_backup; +} + +std::string BackupStatistics::ToString() const { + char result[50]; + snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", + GetNumberSuccessBackup(), GetNumberFailBackup()); + return result; +} + void BackupableDBOptions::Dump(Logger* logger) const { Log(logger, " Options.backup_dir: %s", backup_dir.c_str()); Log(logger, " Options.backup_env: %p", backup_env); @@ -144,6 +165,9 @@ class BackupEngineImpl : public BackupEngine { uint64_t GetSize() const { return size_; } + uint32_t GetNumberFiles() { + return files_.size(); + } void SetSequenceNumber(uint64_t sequence_number) { sequence_number_ = sequence_number; } @@ -288,6 +312,7 @@ class BackupEngineImpl : public BackupEngine { static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; bool read_only_; + BackupStatistics backup_statistics_; }; BackupEngine* BackupEngine::NewBackupEngine( @@ -443,6 +468,8 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { new_backup.RecordTimestamp(); 
new_backup.SetSequenceNumber(sequence_number); + auto start_backup = backup_env_-> NowMicros(); + Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); @@ -507,6 +534,8 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); } + auto backup_time = backup_env_->NowMicros() - start_backup; + if (s.ok()) { // persist the backup metadata on the disk s = new_backup.StoreToFile(options_.sync); @@ -537,9 +566,15 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { } } + if (s.ok()) { + backup_statistics_.IncrementNumberSuccessBackup(); + } if (!s.ok()) { + backup_statistics_.IncrementNumberFailBackup(); // clean all the files we might have created Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + Log(options_.info_log, "Backup Statistics %s\n", + backup_statistics_.ToString().c_str()); backups_.erase(new_backup_id); GarbageCollection(true); return s; @@ -549,6 +584,16 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // in the LATEST_BACKUP file latest_backup_id_ = new_backup_id; Log(options_.info_log, "Backup DONE. All is good"); + + // backup_speed is in byte/second + double backup_speed = new_backup.GetSize() / (1.048576 * backup_time); + Log(options_.info_log, "Backup number of files: %u", + new_backup.GetNumberFiles()); + Log(options_.info_log, "Backup size: %lu bytes", new_backup.GetSize()); + Log(options_.info_log, "Backup time: %lu microseconds", backup_time); + Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); + Log(options_.info_log, "Backup Statistics %s", + backup_statistics_.ToString().c_str()); return s; } @@ -584,8 +629,9 @@ void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { backup_info->reserve(backups_.size()); for (auto& backup : backups_) { if (!backup.second.Empty()) { - backup_info->push_back(BackupInfo( - backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + backup_info->push_back(BackupInfo( + backup.first, backup.second.GetTimestamp(), backup.second.GetSize(), + backup.second.GetNumberFiles())); } } } From 092f97e2198905d6ad1829f198924eb835cbbcb0 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Tue, 9 Sep 2014 15:20:49 -0700 Subject: [PATCH 059/829] Fix comments and typos Summary: Correct some comments and typos in RocksDB. Test Plan: Inspection Reviewers: sdong, igor Reviewed By: igor Differential Revision: https://reviews.facebook.net/D23133 --- db/column_family.h | 2 +- db/compaction_picker.cc | 4 ++-- include/rocksdb/db.h | 2 +- include/rocksdb/options.h | 15 +++++---------- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/db/column_family.h b/db/column_family.h index b5363fe30..42e65afee 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -133,7 +133,7 @@ class ColumnFamilyData { void Ref() { ++refs_; } // will just decrease reference count to 0, but will not delete it. returns // true if the ref count was decreased to zero. 
in that case, it can be - // deleted by the caller immediatelly, or later, by calling + // deleted by the caller immediately, or later, by calling // FreeDeadColumnFamilies() bool Unref() { assert(refs_ > 0); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 6e9a46ed4..04d5c6f47 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -42,13 +42,13 @@ CompressionType GetCompressionType(const Options& options, int level, return kNoCompression; } // If the use has specified a different compression level for each level, - // then pick the compresison for that level. + // then pick the compression for that level. if (!options.compression_per_level.empty()) { const int n = options.compression_per_level.size() - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file - // belongs to. Likewise, if level_ is beyond the end of the + // belongs to. Likewise, if level is beyond the end of the // specified compression levels, use the last value. return options.compression_per_level[std::max(0, std::min(level, n))]; } else { diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index d9be6b427..47372c42b 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -123,7 +123,7 @@ class DB { // Open DB with column families. // db_options specify database specific options - // column_families is the vector of all column families in the databse, + // column_families is the vector of all column families in the database, // containing column family name and options. You need to open ALL column // families in the database. To get the list of column families, you can use // ListColumnFamilies(). Also, you can open only a subset of column families diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index fc5e039a7..acab6f992 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -224,17 +224,12 @@ struct ColumnFamilyOptions { CompressionType compression; // Different levels can have different compression policies. There - // are cases where most lower levels would like to quick compression - // algorithm while the higher levels (which have more data) use + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use // compression algorithms that have better compression but could - // be slower. This array, if non nullptr, should have an entry for - // each level of the database. This array, if non nullptr, overides the - // value specified in the previous field 'compression'. The caller is - // reponsible for allocating memory and initializing the values in it - // before invoking Open(). The caller is responsible for freeing this - // array and it could be freed anytime after the return from Open(). - // This could have been a std::vector but that makes the equivalent - // java/C api hard to construct. + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. 
std::vector compression_per_level; // different options for compression algorithms From a52cecb56c652a9ede84fed47ceb746a3f5e767a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 9 Sep 2014 18:42:35 -0700 Subject: [PATCH 060/829] Fix Mac compile --- db/db_test.cc | 43 +++++++++++++++++---------- utilities/backupable/backupable_db.cc | 5 ++-- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 2c06c241d..74e664fc1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6142,8 +6142,8 @@ std::vector ListTableFiles(Env* env, const std::string& path) { return ListSpecificFiles(env, path, kTableFile); } -std::uint64_t GetNumberOfSstFilesForColumnFamily( - DB* db, std::string column_family_name) { +uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name) { std::vector metadata; db->GetLiveFilesMetaData(&metadata); uint64_t result = 0; @@ -6200,9 +6200,10 @@ TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { dbfull()->TEST_WaitForFlushMemTable(handles_[2]); { auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), 1); + ASSERT_EQ(tables.size(), static_cast(1)); // Make sure 'dobrynia' was flushed: check sst files amount - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); } // New WAL file ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -6216,13 +6217,17 @@ TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { &options); { // No inserts => default is empty - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 0); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 5); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(5)); // 1 SST for big key + 1 SST for small one - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 2); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); // 1 SST for all keys - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); } } @@ -6247,8 +6252,9 @@ TEST(DBTest, RecoverCheckFileAmount) { // 4 memtable are not flushed, 1 sst file { auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), 1); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1); + ASSERT_EQ(tables.size(), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); } // Memtable for 'nikitich' has flushed, new WAL file has opened // 4 memtable still not flushed @@ -6270,8 +6276,9 @@ TEST(DBTest, RecoverCheckFileAmount) { { auto tables = ListTableFiles(env_, dbname_); - ASSERT_EQ(tables.size(), 2); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 2); + ASSERT_EQ(tables.size(), static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); } ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, @@ -6282,10 +6289,14 @@ TEST(DBTest, RecoverCheckFileAmount) { // first, second and third WALs went to the same SST. 
// So, there is 6 SSTs: three for 'nikitich', one for 'default', one for // 'dobrynia', one for 'pikachu' - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 1); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 3); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); } } diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 71248b47c..20ec9db85 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -589,8 +589,9 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { double backup_speed = new_backup.GetSize() / (1.048576 * backup_time); Log(options_.info_log, "Backup number of files: %u", new_backup.GetNumberFiles()); - Log(options_.info_log, "Backup size: %lu bytes", new_backup.GetSize()); - Log(options_.info_log, "Backup time: %lu microseconds", backup_time); + Log(options_.info_log, "Backup size: %" PRIu64 " bytes", + new_backup.GetSize()); + Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time); Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); Log(options_.info_log, "Backup Statistics %s", backup_statistics_.ToString().c_str()); From 53404d9fb7907fafed43096050dbbda76d9b4174 Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Wed, 10 Sep 2014 09:46:56 -0700 Subject: [PATCH 061/829] add_qps_info_in cache bench Summary: print qps in summary Test Plan: ./cache_bench Reviewers: yhchiang, ljin, sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23079 --- util/cache_bench.cc | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/util/cache_bench.cc b/util/cache_bench.cc index ccaf5ce5b..985eb06a3 100644 --- a/util/cache_bench.cc +++ b/util/cache_bench.cc @@ -29,14 +29,15 @@ using GFLAGS::ParseCommandLineFlags; static const uint32_t KB = 1024; -DEFINE_int32(threads, 10, "Number of concurrent threads to run."); -DEFINE_int64(cache_size, 2 * KB * KB * KB, +DEFINE_int32(threads, 16, "Number of concurrent threads to run."); +DEFINE_int64(cache_size, 8 * KB * KB, "Number of bytes to use as a cache of uncompressed data."); DEFINE_int32(num_shard_bits, 4, "shard_bits."); -DEFINE_int64(max_key, 1 * KB* KB, "Max number of key to place in cache"); +DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache"); DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); +DEFINE_bool(populate_cache, false, "Populate cache before operations"); DEFINE_int32(insert_percent, 40, "Ratio of insert to total workload (expressed as a percentage)"); DEFINE_int32(lookup_percent, 50, @@ -135,6 +136,18 @@ class CacheBench { ~CacheBench() {} + void PopulateCache() { + Random rnd(1); + for (int64_t i = 0; i < FLAGS_cache_size; i++) { + uint64_t rand_key = rnd.Next() % FLAGS_max_key; + // Cast uint64* to be char*, data would be copied to cache + Slice key(reinterpret_cast(&rand_key), 8); + // do insert + auto handle = cache_->Insert(key, new char[10], 1, &deleter); + cache_->Release(handle); + } + } + bool Run() { 
rocksdb::Env* env = rocksdb::Env::Default(); @@ -164,7 +177,10 @@ class CacheBench { // Record end time uint64_t end_time = env->NowMicros(); - fprintf(stdout, "Complete in %" PRIu64 "ms\n", end_time - start_time); + double elapsed = static_cast(end_time - start_time) * 1e-6; + uint32_t qps = static_cast( + static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); + fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); } return true; } @@ -230,6 +246,7 @@ class CacheBench { printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); printf("Num shard bits : %d\n", FLAGS_num_shard_bits); printf("Max key : %" PRIu64 "\n", FLAGS_max_key); + printf("Populate cache : %d\n", FLAGS_populate_cache); printf("Insert percentage : %d%%\n", FLAGS_insert_percent); printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); printf("Erase percentage : %d%%\n", FLAGS_erase_percent); @@ -247,6 +264,9 @@ int main(int argc, char** argv) { } rocksdb::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + } if (bench.Run()) { return 0; } else { From dd641b2117f4f8d820e572dce379f5663453473f Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 10 Sep 2014 12:00:32 -0700 Subject: [PATCH 062/829] fix RocksDB java build Summary: as title Test Plan: make rocksdbjava Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23193 --- java/rocksjni/write_batch.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 9a4eb70fd..a2cb67016 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -12,6 +12,7 @@ #include "include/org_rocksdb_WriteBatchTest.h" #include "rocksjni/portal.h" #include "rocksdb/db.h" +#include "rocksdb/immutable_options.h" #include "db/memtable.h" #include "rocksdb/write_batch.h" #include "db/write_batch_internal.h" @@ -203,16 +204,18 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( auto factory = std::make_shared(); rocksdb::Options options; options.memtable_factory = factory; - rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); + rocksdb::MemTable* mem = new rocksdb::MemTable( + cmp, rocksdb::ImmutableCFOptions(options), + rocksdb::MemTableOptions(options)); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Arena arena; - ScopedArenaIterator iter(mem->NewIterator( - rocksdb::ReadOptions(), false /*don't enforce total order*/, &arena)); + rocksdb::Arena arena; + rocksdb::ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); From 059e584dd395015d687a6d84a91856b866332ef1 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 10 Sep 2014 17:00:00 -0700 Subject: [PATCH 063/829] [unit test] CompactRange should fail if we don't have space Summary: See t5106397. Also, few more changes: 1. in unit tests, the assumption is that writes will be dropped when there is no space left on device. I changed the wording around it. 2. InvalidArgument() errors are only when user-provided arguments are invalid. 
When the file is corrupted, we need to return Status::Corruption Test Plan: make check Reviewers: sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23145 --- db/db_test.cc | 44 +++++++++++++++++++++++++++++++++++++------- table/format.cc | 8 ++++---- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 74e664fc1..f4f7c2c40 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -123,6 +123,9 @@ class SpecialEnv : public EnvWrapper { // sstable Sync() calls are blocked while this pointer is non-nullptr. port::AtomicPointer delay_sstable_sync_; + // Drop writes on the floor while this pointer is non-nullptr. + port::AtomicPointer drop_writes_; + // Simulate no-space errors while this pointer is non-nullptr. port::AtomicPointer no_space_; @@ -150,6 +153,7 @@ class SpecialEnv : public EnvWrapper { explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(nullptr); + drop_writes_.Release_Store(nullptr); no_space_.Release_Store(nullptr); non_writable_.Release_Store(nullptr); count_random_reads_ = false; @@ -173,9 +177,11 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { - if (env_->no_space_.Acquire_Load() != nullptr) { + if (env_->drop_writes_.Acquire_Load() != nullptr) { // Drop writes on the floor return Status::OK(); + } else if (env_->no_space_.Acquire_Load() != nullptr) { + return Status::IOError("No space left on device"); } else { env_->bytes_written_ += data.size(); return base_->Append(data); @@ -5573,8 +5579,8 @@ TEST(DBTest, DestroyDBMetaDatabase) { ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); } -// Check that number of files does not grow when we are out of space -TEST(DBTest, NoSpace) { +// Check that number of files does not grow when writes are dropped +TEST(DBTest, DropWrites) { do { Options options = CurrentOptions(); options.env = env_; @@ -5585,7 +5591,7 @@ TEST(DBTest, NoSpace) { ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); const int num_files = CountFiles(); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->drop_writes_.Release_Store(env_); // Force out-of-space errors env_->sleep_counter_.Reset(); for (int i = 0; i < 5; i++) { for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { @@ -5597,7 +5603,7 @@ TEST(DBTest, NoSpace) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("5", property_value); - env_->no_space_.Release_Store(nullptr); + env_->drop_writes_.Release_Store(nullptr); ASSERT_LT(CountFiles(), num_files + 3); // Check that compaction attempts slept after errors @@ -5606,7 +5612,7 @@ TEST(DBTest, NoSpace) { } // Check background error counter bumped on flush failures. -TEST(DBTest, NoSpaceFlush) { +TEST(DBTest, DropWritesFlush) { do { Options options = CurrentOptions(); options.env = env_; @@ -5614,7 +5620,7 @@ TEST(DBTest, NoSpaceFlush) { Reopen(&options); ASSERT_OK(Put("foo", "v1")); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + env_->drop_writes_.Release_Store(env_); // Force out-of-space errors std::string property_value; // Background error count is 0 now. 
@@ -5638,6 +5644,30 @@ TEST(DBTest, NoSpaceFlush) { } ASSERT_EQ("1", property_value); + env_->drop_writes_.Release_Store(nullptr); + } while (ChangeCompactOptions()); +} + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST(DBTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(&options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); + } + + env_->no_space_.Release_Store(env_); // Force out-of-space errors + + Status s = db_->CompactRange(nullptr, nullptr); + ASSERT_TRUE(s.IsIOError()); + env_->no_space_.Release_Store(nullptr); } while (ChangeCompactOptions()); } diff --git a/table/format.cc b/table/format.cc index 46105247f..70cc6eb83 100644 --- a/table/format.cc +++ b/table/format.cc @@ -135,7 +135,7 @@ Status Footer::DecodeFrom(Slice* input) { snprintf(buffer, sizeof(buffer) - 1, "not an sstable (bad magic number --- %lx)", (long)magic); - return Status::InvalidArgument(buffer); + return Status::Corruption(buffer); } } else { set_table_magic_number(magic); @@ -156,7 +156,7 @@ Status Footer::DecodeFrom(Slice* input) { // It consists of the checksum type, two block handles, padding, // a version number, and a magic number if (input->size() < kVersion1EncodedLength) { - return Status::InvalidArgument("input is too short to be an sstable"); + return Status::Corruption("input is too short to be an sstable"); } else { input->remove_prefix(input->size() - kVersion1EncodedLength); } @@ -183,7 +183,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, Footer* footer) { if (file_size < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } char footer_space[Footer::kMaxEncodedLength]; @@ -198,7 +198,7 @@ Status ReadFooterFromFile(RandomAccessFile* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. if (footer_input.size() < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } return footer->DecodeFrom(&footer_input); From 3d9e6f77598aa7927df1c428a400549258411e3f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 10 Sep 2014 18:46:09 -0700 Subject: [PATCH 064/829] Push model for flushing memtables Summary: When memtable is full it calls the registered callback. That callback then registers column family as needing the flush. Every write checks if there are some column families that need to be flushed. This completely eliminates the need for MakeRoomForWrite() function and simplifies our Write code-path. There is some complexity with the concurrency when the column family is dropped. I made it a bit less complex by dropping the column family from the write thread in https://reviews.facebook.net/D22965. Let me know if you want to discuss this. Test Plan: make check works. I'll also run db_stress with creating and dropping column families for a while. 
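Reduced to a standalone sketch (simplified placeholder types; the real code is in db/flush_scheduler.*, db/column_family.cc and db/write_batch.cc in the diff below), the push model looks roughly like this: the write path marks a full memtable once, queues its column family, and the write thread later drains the queue to switch memtables and schedule background flushes.

#include <deque>

// Sketch only: placeholder types, not the actual RocksDB classes.
struct ColumnFamilySketch {
  bool memtable_full = false;
  bool flush_scheduled = false;
};

class FlushSchedulerSketch {
 public:
  // Called from the write path when a memtable insert fills it up.
  void ScheduleFlush(ColumnFamilySketch* cf) { queue_.push_back(cf); }
  // Drained by the write thread, which then switches memtables and schedules
  // background flushes -- no MakeRoomForWrite()-style polling is needed.
  ColumnFamilySketch* GetNext() {
    if (queue_.empty()) return nullptr;
    ColumnFamilySketch* cf = queue_.front();
    queue_.pop_front();
    return cf;
  }
  bool Empty() const { return queue_.empty(); }
 private:
  std::deque<ColumnFamilySketch*> queue_;
};

// Writer-side hook, analogous to ColumnFamilyMemTablesImpl::CheckMemtableFull().
void CheckMemtableFullSketch(ColumnFamilySketch* cf,
                             FlushSchedulerSketch* scheduler) {
  if (cf->memtable_full && !cf->flush_scheduled) {
    cf->flush_scheduled = true;   // avoid scheduling the same memtable twice
    scheduler->ScheduleFlush(cf);
  }
}
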
Reviewers: yhchiang, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23067 --- db/column_family.cc | 12 +++ db/column_family.h | 11 ++- db/db_impl.cc | 151 +++++++++++++++++--------------------- db/db_impl.h | 5 +- db/db_test.cc | 63 ++++++++++++---- db/flush_scheduler.cc | 62 ++++++++++++++++ db/flush_scheduler.h | 39 ++++++++++ db/log_and_apply_bench.cc | 1 + db/memtable.cc | 10 +-- db/memtable.h | 13 +++- db/write_batch.cc | 3 + db/write_batch_internal.h | 3 + 12 files changed, 262 insertions(+), 111 deletions(-) create mode 100644 db/flush_scheduler.cc create mode 100644 db/flush_scheduler.h diff --git a/db/column_family.cc b/db/column_family.cc index 94aef3819..c8ea7accf 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -660,6 +660,11 @@ bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { column_family_set_->Lock(); current_ = column_family_set_->GetColumnFamily(column_family_id); column_family_set_->Unlock(); + // TODO(icanadi) Maybe remove column family from the hash table when it's + // dropped? + if (current_ != nullptr && current_->IsDropped()) { + current_ = nullptr; + } } handle_.SetCFD(current_); return current_ != nullptr; @@ -685,6 +690,13 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { return &handle_; } +void ColumnFamilyMemTablesImpl::CheckMemtableFull() { + if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) { + flush_scheduler_->ScheduleFlush(current_); + current_->mem()->MarkFlushScheduled(); + } +} + uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { diff --git a/db/column_family.h b/db/column_family.h index 42e65afee..e7b21036f 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -22,6 +22,7 @@ #include "db/write_controller.h" #include "db/table_cache.h" #include "util/thread_local.h" +#include "db/flush_scheduler.h" namespace rocksdb { @@ -394,8 +395,11 @@ class ColumnFamilySet { // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { public: - explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) - : column_family_set_(column_family_set), current_(nullptr) {} + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set, + FlushScheduler* flush_scheduler) + : column_family_set_(column_family_set), + current_(nullptr), + flush_scheduler_(flush_scheduler) {} // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist @@ -414,9 +418,12 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // Returns column family handle for the selected column family virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + virtual void CheckMemtableFull() override; + private: ColumnFamilySet* column_family_set_; ColumnFamilyData* current_; + FlushScheduler* flush_scheduler_; ColumnFamilyHandleInternal handle_; }; diff --git a/db/db_impl.cc b/db/db_impl.cc index f6634b6c4..74c114bfd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -361,8 +361,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_controller_)); - column_family_memtables_.reset( - new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( + 
versions_->GetColumnFamilySet(), &flush_scheduler_)); DumpLeveldbBuildVersion(db_options_.info_log.get()); DumpDBFileSummary(db_options_, dbname_); @@ -392,6 +392,8 @@ DBImpl::~DBImpl() { bg_cv_.Wait(); } + flush_scheduler_.Clear(); + if (default_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); @@ -1336,28 +1338,30 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } if (!read_only) { - // no need to refcount since client still doesn't have access - // to the DB and can not drop column families while we iterate - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->ShouldFlush()) { - // If this asserts, it means that InsertInto failed in - // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); - auto iter = version_edits.find(cfd->GetID()); - assert(iter != version_edits.end()); - VersionEdit* edit = &iter->second; - status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - return status; - } - cfd->CreateNewMemtable(); + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + cfd->Unref(); + // If this asserts, it means that InsertInto failed in + // filtering updates to already-flushed column families + assert(cfd->GetLogNumber() <= log_number); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + VersionEdit* edit = &iter->second; + status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); + if (!status.ok()) { + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. 
+ return status; } + cfd->CreateNewMemtable(); } } } + flush_scheduler_.Clear(); if (versions_->LastSequence() < *max_sequence) { versions_->SetLastSequence(*max_sequence); } @@ -2201,7 +2205,7 @@ void DBImpl::BackgroundCallCompaction() { } if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { // signal if - // * madeProgress -- need to wakeup MakeRoomForWrite + // * madeProgress -- need to wakeup DelayWrite // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is @@ -2622,7 +2626,7 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, cfd->Ref(); FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); cfd->Unref(); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary } mutex_.Unlock(); log_buffer->FlushBufferToLog(); @@ -3959,10 +3963,12 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.timeout_hint_us = options.timeout_hint_us; uint64_t expiration_time = 0; + bool has_timeout = false; if (w.timeout_hint_us == 0) { w.timeout_hint_us = kNoTimeOut; } else { expiration_time = env_->NowMicros() + w.timeout_hint_us; + has_timeout = true; } if (!options.disableWAL) { @@ -3997,56 +4003,48 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); - uint64_t flush_column_family_if_log_file = 0; uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 4 * max_total_in_memory_state_ : db_options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { - flush_column_family_if_log_file = alive_log_files_.begin()->number; + uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; Log(db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); - } - - if (write_controller_.IsStopped() || write_controller_.GetDelay() > 0) { - DelayWrite(expiration_time); - } - - if (LIKELY(single_column_family_mode_)) { - // fast path - status = MakeRoomForWrite(default_cf_handle_->cfd(), - &context, expiration_time); - } else { - // refcounting cfd in iteration - bool dead_cfd = false; + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->Ref(); - if (flush_column_family_if_log_file != 0 && - cfd->GetLogNumber() <= flush_column_family_if_log_file) { - // log size excedded limit and we need to do flush - // SetNewMemtableAndNewLogFie may temporarily unlock and wait + if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { status = SetNewMemtableAndNewLogFile(cfd, &context); + if (!status.ok()) { + break; + } cfd->imm()->FlushRequested(); - MaybeScheduleFlushOrCompaction(); - } else { - // May temporarily unlock and wait. 
- status = MakeRoomForWrite(cfd, &context, expiration_time); - } - - if (cfd->Unref()) { - dead_cfd = true; - } - if (!status.ok()) { - break; } } - if (dead_cfd) { - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - } + MaybeScheduleFlushOrCompaction(); + } + + if (UNLIKELY(status.ok() && !bg_error_.ok())) { + status = bg_error_; + } + + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + status = ScheduleFlushes(&context); + } + + if (UNLIKELY(status.ok()) && + (write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) { + DelayWrite(expiration_time); + } + + if (UNLIKELY(status.ok() && has_timeout && + env_->NowMicros() > expiration_time)) { + status = Status::TimedOut(); } uint64_t last_sequence = versions_->LastSequence(); @@ -4241,36 +4239,23 @@ void DBImpl::DelayWrite(uint64_t expiration_time) { } } -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, - uint64_t expiration_time) { - mutex_.AssertHeld(); - assert(!writers_.empty()); - Status s; - bool has_timeout = (expiration_time > 0); - - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (has_timeout && env_->NowMicros() > expiration_time) { - s = Status::TimedOut(); - break; - } else if (!cfd->mem()->ShouldFlush()) { - // There is room in current memtable - break; - } else { - s = SetNewMemtableAndNewLogFile(cfd, context); - if (!s.ok()) { - break; - } - MaybeScheduleFlushOrCompaction(); +Status DBImpl::ScheduleFlushes(WriteContext* context) { + bool schedule_bg_work = false; + ColumnFamilyData* cfd; + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + schedule_bg_work = true; + auto status = SetNewMemtableAndNewLogFile(cfd, context); + if (cfd->Unref()) { + delete cfd; + } + if (!status.ok()) { + return status; } } - return s; + if (schedule_bg_work) { + MaybeScheduleFlushOrCompaction(); + } + return Status::OK(); } // REQUIRES: mutex_ is held diff --git a/db/db_impl.h b/db/db_impl.h index cf7914fec..0336b3af5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -33,6 +33,7 @@ #include "util/scoped_arena_iterator.h" #include "db/internal_stats.h" #include "db/write_controller.h" +#include "db/flush_scheduler.h" namespace rocksdb { @@ -399,8 +400,7 @@ class DBImpl : public DB { void DelayWrite(uint64_t expiration_time); - Status MakeRoomForWrite(ColumnFamilyData* cfd, WriteContext* context, - uint64_t expiration_time); + Status ScheduleFlushes(WriteContext* context); Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, WriteContext* context); @@ -557,6 +557,7 @@ class DBImpl : public DB { WriteBatch tmp_batch_; WriteController write_controller_; + FlushScheduler flush_scheduler_; SnapshotList snapshots_; diff --git a/db/db_test.cc b/db/db_test.cc index f4f7c2c40..dcfdb2aae 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1151,6 +1151,17 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); } + +uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} } // namespace TEST(DBTest, Empty) { @@ -2777,6 +2788,41 @@ TEST(DBTest, 
RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } +TEST(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number = 2; + options.write_buffer_size = 100 * 1000; + CreateAndReopenWithCF({"pikachu"}, &options); + std::vector threads; + + std::atomic thread_num; + // each column family will have 5 thread, each thread generating 2 memtables. + // each column family should end up with 10 table files + for (int i = 0; i < 10; ++i) { + threads.emplace_back([&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + Put(a & 1, RandomString(&rnd, 13), ""); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(10)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(10)); +} + TEST(DBTest, MinorCompactionsHappen) { do { Options options; @@ -6171,17 +6217,6 @@ std::vector ListLogFiles(Env* env, const std::string& path) { std::vector ListTableFiles(Env* env, const std::string& path) { return ListSpecificFiles(env, path, kTableFile); } - -uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, - std::string column_family_name) { - std::vector metadata; - db->GetLiveFilesMetaData(&metadata); - uint64_t result = 0; - for (auto& fileMetadata : metadata) { - result += (fileMetadata.column_family_name == column_family_name); - } - return result; -} } // namespace TEST(DBTest, FlushOneColumnFamily) { @@ -6465,7 +6500,7 @@ TEST(DBTest, PurgeInfoLogs) { ASSERT_EQ(5, info_log_count); Destroy(&options); - // For mode (1), test DestoryDB() to delete all the logs under DB dir. + // For mode (1), test DestroyDB() to delete all the logs under DB dir. // For mode (2), no info log file should have been put under DB dir. std::vector db_files; env_->GetChildren(dbname_, &db_files); @@ -7894,10 +7929,6 @@ TEST(DBTest, SimpleWriteTimeoutTest) { // fill the two write buffers ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); - // this will switch the previous memtable, but will not cause block because - // DelayWrite() is called before MakeRoomForWrite() - // TODO(icanadi) remove this as part of https://reviews.facebook.net/D23067 - ASSERT_OK(Put(Key(3), Key(3), write_opt)); // As the only two write buffers are full in this moment, the third // Put is expected to be timed-out. write_opt.timeout_hint_us = 50; diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc new file mode 100644 index 000000000..636ff5a98 --- /dev/null +++ b/db/flush_scheduler.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/flush_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace rocksdb { + +void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { +#ifndef NDEBUG + assert(column_families_set_.find(cfd) == column_families_set_.end()); + column_families_set_.insert(cfd); +#endif // NDEBUG + cfd->Ref(); + column_families_.push_back(cfd); +} + +ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { + ColumnFamilyData* cfd = nullptr; + while (column_families_.size() > 0) { + cfd = column_families_.front(); + column_families_.pop_front(); + if (cfd->IsDropped()) { + if (cfd->Unref()) { + delete cfd; + } + } else { + break; + } + } +#ifndef NDEBUG + if (cfd != nullptr) { + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); + } +#endif // NDEBUG + return cfd; +} + +bool FlushScheduler::Empty() { return column_families_.empty(); } + +void FlushScheduler::Clear() { + for (auto cfd : column_families_) { +#ifndef NDEBUG + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); +#endif // NDEBUG + if (cfd->Unref()) { + delete cfd; + } + } + column_families_.clear(); +} + +} // namespace rocksdb diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h new file mode 100644 index 000000000..201e4a13c --- /dev/null +++ b/db/flush_scheduler.h @@ -0,0 +1,39 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include + +namespace rocksdb { + +class ColumnFamilyData; + +// This class is thread-compatible. It's should only be accessed from single +// write thread (between BeginWrite() and EndWrite()) +class FlushScheduler { + public: + FlushScheduler() = default; + ~FlushScheduler() = default; + + void ScheduleFlush(ColumnFamilyData* cfd); + // Returns Ref()-ed column family. Client needs to Unref() + ColumnFamilyData* GetNextColumnFamily(); + + bool Empty(); + + void Clear(); + + private: + std::deque column_families_; +#ifndef NDEBUG + std::set column_families_set_; +#endif // NDEBUG +}; + +} // namespace rocksdb diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index 60baeb5ec..3a5535d2d 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -9,6 +9,7 @@ #include "util/testharness.h" #include "util/benchharness.h" #include "db/version_set.h" +#include "db/write_controller.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/db/memtable.cc b/db/memtable.cc index 23cc62270..804404bb8 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -54,8 +54,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)), arena_(moptions.arena_block_size), table_(ioptions.memtable_factory->CreateMemTableRep( - comparator_, &arena_, ioptions.prefix_extractor, - ioptions.info_log)), + comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)), num_entries_(0), flush_in_progress_(false), flush_completed_(false), @@ -65,7 +64,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, locks_(moptions.inplace_update_support ? 
moptions.inplace_update_num_locks : 0), prefix_extractor_(ioptions.prefix_extractor), - should_flush_(ShouldFlushNow()) { + should_flush_(ShouldFlushNow()), + flush_scheduled_(false) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. assert(!should_flush_); @@ -79,9 +79,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, } } -MemTable::~MemTable() { - assert(refs_ == 0); -} +MemTable::~MemTable() { assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { size_t arena_usage = arena_.ApproximateMemoryUsage(); diff --git a/db/memtable.h b/db/memtable.h index 0371dc3cf..fa6db6fe1 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -10,7 +10,9 @@ #pragma once #include #include +#include #include +#include #include "db/dbformat.h" #include "db/skiplist.h" #include "db/version_edit.h" @@ -86,7 +88,11 @@ class MemTable { // This method heuristically determines if the memtable should continue to // host more data. - bool ShouldFlush() const { return should_flush_; } + bool ShouldScheduleFlush() const { + return flush_scheduled_ == false && should_flush_; + } + + void MarkFlushScheduled() { flush_scheduled_ = true; } // Return an iterator that yields the contents of the memtable. // @@ -194,7 +200,7 @@ class MemTable { const MemTableOptions* GetMemTableOptions() const { return &moptions_; } private: - // Dynamically check if we can add more incoming entries. + // Dynamically check if we can add more incoming entries bool ShouldFlushNow() const; friend class MemTableIterator; @@ -238,6 +244,9 @@ class MemTable { // a flag indicating if a memtable has met the criteria to flush bool should_flush_; + + // a flag indicating if flush has been scheduled + bool flush_scheduled_; }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/write_batch.cc b/db/write_batch.cc index cacb4a5e3..b8d0322d8 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -394,6 +394,7 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -465,6 +466,7 @@ class MemTableInserter : public WriteBatch::Handler { } sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -494,6 +496,7 @@ class MemTableInserter : public WriteBatch::Handler { } mem->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } }; diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 615a47f5e..568cd70d8 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -28,6 +28,7 @@ class ColumnFamilyMemTables { virtual MemTable* GetMemTable() const = 0; virtual const Options* GetOptions() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; + virtual void CheckMemtableFull() = 0; }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { @@ -54,6 +55,8 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + void CheckMemtableFull() override {} + private: bool ok_; MemTable* mem_; From d1f24dc7eef7c55984f66ebd32ce4ada08293b90 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 10 Sep 2014 19:14:17 -0700 Subject: [PATCH 065/829] Relax FlushSchedule test Summary: The test makes sure that we don't call flush too often. 
For that, it's ok to check if we have less than 10 table files. Otherwise, the test is flaky because it's hard to estimate number of entries in the memtable before it gets flushed (any ideas?) Test Plan: Still works, but hopefully less flaky. Reviewers: ljin, sdong, yhchiang Reviewed by: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23241 --- db/db_test.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index dcfdb2aae..f79167adb 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2817,10 +2817,12 @@ TEST(DBTest, FlushSchedule) { t.join(); } - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(10)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), - static_cast(10)); + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); } TEST(DBTest, MinorCompactionsHappen) { From a9639bda8471ff57ed5bcefefa25ba67207dd96f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 11 Sep 2014 15:36:30 -0700 Subject: [PATCH 066/829] Fix valgrind test Summary: Get valgrind to stop complaining about uninitialized value Test Plan: valgrind not complaining anymore Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23289 --- db/db_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index f79167adb..a7c88eddd 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2799,16 +2799,17 @@ TEST(DBTest, FlushSchedule) { CreateAndReopenWithCF({"pikachu"}, &options); std::vector threads; - std::atomic thread_num; + std::atomic thread_num(0); // each column family will have 5 thread, each thread generating 2 memtables. // each column family should end up with 10 table files for (int i = 0; i < 10; ++i) { threads.emplace_back([&]() { int a = thread_num.fetch_add(1); Random rnd(a); + WriteOptions wo; // this should fill up 2 memtables for (int k = 0; k < 5000; ++k) { - Put(a & 1, RandomString(&rnd, 13), ""); + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); } }); } From 9c0e66ce9811bf16e56254d85231a5b980ef4571 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 11 Sep 2014 16:24:16 -0700 Subject: [PATCH 067/829] Don't run background jobs (flush, compactions) when bg_error_ is set Summary: If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set. On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed. This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related. Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done. 
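To make the lifetime argument concrete, here is a small self-contained analogue using plain C++11 threads (hypothetical names, not RocksDB types): the waiter stack-allocates the request and returns only once the worker has marked it done, even when the worker refuses the job because of a stored background error. This mirrors the manual_compaction_ handling in the db_impl.cc hunks below.

    // Illustrative sketch only -- plain std:: primitives, not RocksDB code.
    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <string>
    #include <thread>

    struct Request {
      bool done = false;
      std::string status = "ok";
    };

    int main() {
      std::mutex mu;
      std::condition_variable cv;
      Request req;                        // lives on the waiter's stack
      std::string bg_error = "IO error";  // pretend a background error is set

      std::thread worker([&] {
        std::lock_guard<std::mutex> l(mu);
        // The worker bails out because of the stored error, but still marks
        // the request done so the waiter is allowed to destroy it.
        req.status = bg_error;
        req.done = true;
        cv.notify_all();
      });

      {
        std::unique_lock<std::mutex> l(mu);
        // Wait on req.done alone; returning on any other condition could
        // destruct req while the worker still holds a pointer to it.
        cv.wait(l, [&] { return req.done; });
      }
      worker.join();
      std::cout << "request finished with status: " << req.status << "\n";
      return 0;
    }
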
Test Plan: make check Reviewers: sdong, ljin, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23223 --- db/db_impl.cc | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 74c114bfd..3a9596686 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1891,7 +1891,10 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, Log(db_options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); - while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + // We don't check bg_error_ here, because if we get the error in compaction, + // the compaction will set manual.status to bg_error_ and set manual.done to + // true. + while (!manual.done) { assert(bg_manual_only_ > 0); if (manual_compaction_ != nullptr) { // Running either this or some other manual compaction @@ -2041,6 +2044,11 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { mutex_.AssertHeld(); + + if (!bg_error_.ok()) { + return bg_error_; + } + // call_status is failure if at least one flush was a failure. even if // flushing one column family reports a failure, we will continue flushing // other column families. however, call_status will be a failure in that case. @@ -2228,6 +2236,16 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); + if (!bg_error_.ok()) { + if (is_manual) { + manual_compaction_->status = bg_error_; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return bg_error_; + } + if (is_manual) { // another thread cannot pick up the same work manual_compaction_->in_progress = true; From 0352a9fa913120e0dbdc923f5fd614780585a7b0 Mon Sep 17 00:00:00 2001 From: Feng Zhu Date: Thu, 11 Sep 2014 16:33:46 -0700 Subject: [PATCH 068/829] add_wrapped_bloom_test Summary: 1. wrap a filter policy like what fbcode/multifeed/rocksdb/MultifeedRocksDbKey.h to ensure that rocksdb works fine after filterpolicy interface change Test Plan: 1. 
valgrind ./bloom_test Reviewers: ljin, igor, yhchiang, dhruba, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23229 --- db/db_test.cc | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index a7c88eddd..6f56f3688 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5980,6 +5980,80 @@ TEST(DBTest, BloomFilterReverseCompatibility) { ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); } +namespace { +// A wrapped bloom over default FilterPolicy +class WrappedBloom : public FilterPolicy { + public: + explicit WrappedBloom(int bits_per_key) : + filter_(NewBloomFilterPolicy(bits_per_key)), + counter_(0) {} + + ~WrappedBloom() { delete filter_; } + + const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + + void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) + const override { + std::unique_ptr user_keys(new rocksdb::Slice[n]); + for (int i = 0; i < n; ++i) { + user_keys[i] = convertKey(keys[i]); + } + return filter_->CreateFilter(user_keys.get(), n, dst); + } + + bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) + const override { + counter_++; + return filter_->KeyMayMatch(convertKey(key), filter); + } + + uint32_t GetCounter() { return counter_; } + + private: + const FilterPolicy* filter_; + mutable uint32_t counter_; + + rocksdb::Slice convertKey(const rocksdb::Slice key) const { + return key; + } +}; +} // namespace + +TEST(DBTest, BloomFilterWrapper) { + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + WrappedBloom* policy = new WrappedBloom(10); + table_options.filter_policy.reset(policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, &options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + ASSERT_EQ(0, policy->GetCounter()); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ(maxKey, policy->GetCounter()); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + ASSERT_EQ(2 * maxKey, policy->GetCounter()); +} + TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); From ebb5c65e6088f2a8d7bf22d3e3dbfafd590fb48d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 11 Sep 2014 20:41:02 -0700 Subject: [PATCH 069/829] Add make install Summary: Add make install. If INSTALL_PATH is not set, then rocksdb will be installed under "/usr/local" directory (/usr/local/include for headers and /usr/local/lib for library file(s).) Test Plan: Develop a simple rocksdb app, called test.cc, and do the followings. 
make clean make static_lib -j32 sudo make install g++ -std=c++11 test.cc -lrocksdb -lbz2 -lz -o test ./test sudo make uninstall make clean make shared_lib -j32 sudo make install g++ -std=c++11 test.cc -lrocksdb -lbz2 -lz -o test ./test make INSTALL_PATH=/tmp/path install make INSTALL_PATH=/tmp/path uninstall and make sure things are installed / uninstalled in the specified path. Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23211 --- Makefile | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a438230cb..f37bcf5cc 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ # found in the LICENSE file. See the AUTHORS file for names of contributors. # Inherit some settings from environment variables, if available -INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- @@ -49,6 +48,33 @@ else PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC endif +#------------------------------------------------- +# make install related stuff +INSTALL_PATH ?= /usr/local + +uninstall: + rm -rf $(INSTALL_PATH)/include/rocksdb + if [ -a $(LIBRARY) ]; then \ + rm -rf $(INSTALL_PATH)/lib/$(LIBRARY); \ + fi + if [ -a $(SHARED) ]; then \ + rm -rf $(INSTALL_PATH)/lib/$(SHARED); \ + fi + +install: + install -d $(INSTALL_PATH)/include/rocksdb + install -d $(INSTALL_PATH)/lib + for header in `find "include/rocksdb" -type f -name *.h`; do \ + install -C -m 644 -D $$header $(INSTALL_PATH)/$$header; \ + done + if [ -a $(LIBRARY) ]; then \ + install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib/.; \ + fi; + if [ -a $(SHARED) ]; then \ + install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib/.; \ + fi; +#------------------------------------------------- + WARNING_FLAGS = -Wall -Werror -Wsign-compare CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual @@ -178,7 +204,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg + dbg install uninstall all: $(LIBRARY) $(PROGRAMS) $(TESTS) From 49fe329e5e9b20118fb20cd5c6bb2523819c697c Mon Sep 17 00:00:00 2001 From: Chilledheart Date: Sat, 13 Sep 2014 05:05:07 +0800 Subject: [PATCH 070/829] Fix build issue under macosx --- db/db_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 6f56f3688..d14dcab71 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6036,7 +6036,7 @@ TEST(DBTest, BloomFilterWrapper) { } // Add a large key to make the file contain wide range ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - ASSERT_EQ(0, policy->GetCounter()); + ASSERT_EQ(0U, policy->GetCounter()); Flush(1); // Check if they can be found @@ -6044,14 +6044,14 @@ TEST(DBTest, BloomFilterWrapper) { ASSERT_EQ(Key(i), Get(1, Key(i))); } ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ(maxKey, policy->GetCounter()); + ASSERT_EQ(1U * maxKey, policy->GetCounter()); // Check if filter is useful for (int i = 0; i < maxKey; i++) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); } ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); - ASSERT_EQ(2 * maxKey, policy->GetCounter()); + ASSERT_EQ(2U * maxKey, policy->GetCounter()); } TEST(DBTest, SnapshotFiles) { From 540a257f2c593ff3875a8d9e74006ac7139fcf77 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 12 Sep 2014 16:15:29 -0700 Subject: [PATCH 071/829] Fix WAL synced Summary: Uhm... Test Plan: nope Reviewers: sdong, yhchiang, tnovak, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23343 --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 3a9596686..13fec6f85 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4109,9 +4109,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { alive_log_files_.back().AddSize(log_entry.size()); log_empty_ = false; log_size = log_entry.size(); - RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { + RecordTick(stats_, WAL_FILE_SYNCED); if (db_options_.use_fsync) { StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); From dee91c259d416e3920f96ad561fcdf93e797dad2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 12 Sep 2014 16:23:58 -0700 Subject: [PATCH 072/829] WriteThread Summary: This diff just moves the write thread control out of the DBImpl. I will need this as I will control column family data concurrency by only accessing some data in the write thread. That way, we won't have to lock our accesses to column family hash table (mappings from IDs to CFDs). 
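To sketch the calling pattern this class encapsulates, here is a self-contained miniature using plain C++11 primitives (the names below are hypothetical stand-ins, not the actual RocksDB API): writers enqueue themselves, only the head of the queue touches shared state, and on exit it wakes the next writer -- the same shape as the EnterWriteThread()/ExitWriteThread() calls in the diff below.

    // Illustrative sketch only -- a toy single-writer queue, not write_thread.cc.
    #include <condition_variable>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <vector>

    std::mutex mu;
    std::condition_variable cv;
    std::deque<int> writers;  // queue of waiting writer ids
    int shared_state = 0;     // stands in for memtable / CF metadata

    void Write(int id) {
      std::unique_lock<std::mutex> lock(mu);
      writers.push_back(id);                                // "EnterWriteThread"
      cv.wait(lock, [&] { return writers.front() == id; });

      ++shared_state;  // safe: only the queue head runs this section

      writers.pop_front();                                  // "ExitWriteThread"
      cv.notify_all();                                      // wake the next writer
    }

    int main() {
      std::vector<std::thread> threads;
      for (int i = 0; i < 8; ++i) {
        threads.emplace_back(Write, i);
      }
      for (auto& t : threads) {
        t.join();
      }
      std::cout << shared_state << std::endl;  // prints 8
      return 0;
    }
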
Test Plan: make check Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23301 --- db/db_impl.cc | 178 +++----------------------------------------- db/db_impl.h | 47 +----------- db/db_impl_debug.cc | 14 +--- db/write_thread.cc | 147 ++++++++++++++++++++++++++++++++++++ db/write_thread.h | 80 ++++++++++++++++++++ 5 files changed, 246 insertions(+), 220 deletions(-) create mode 100644 db/write_thread.cc create mode 100644 db/write_thread.h diff --git a/db/db_impl.cc b/db/db_impl.cc index 13fec6f85..0b332b72f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1915,14 +1915,6 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options) { - Writer w(&mutex_); - w.batch = nullptr; - w.sync = false; - w.disableWAL = false; - w.in_batch_group = false; - w.done = false; - w.timeout_hint_us = kNoTimeOut; - Status s; { WriteContext context; @@ -1933,7 +1925,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, return Status::OK(); } - s = BeginWrite(&w, 0); + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job // SetNewMemtableAndNewLogFile() will release and reacquire mutex @@ -1942,12 +1935,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, cfd->imm()->FlushRequested(); MaybeScheduleFlushOrCompaction(); - assert(!writers_.empty()); - assert(writers_.front() == &w); - EndWrite(&w, &w, s); + write_thread_.ExitWriteThread(&w, &w, s); } - if (s.ok() && options.wait) { // Wait until the compaction completes s = WaitForFlushMemTable(cfd); @@ -3652,13 +3642,6 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); - Writer w(&mutex_); - w.batch = nullptr; - w.sync = false; - w.disableWAL = false; - w.in_batch_group = false; - w.done = false; - w.timeout_hint_us = kNoTimeOut; Status s; { @@ -3668,10 +3651,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { } if (s.ok()) { // we drop column family from a single write thread - s = BeginWrite(&w, 0); + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job s = versions_->LogAndApply(cfd, &edit, &mutex_); - EndWrite(&w, &w, s); + write_thread_.ExitWriteThread(&w, &w, s); } } @@ -3891,88 +3875,12 @@ Status DBImpl::Delete(const WriteOptions& options, return DB::Delete(options, column_family, key); } -// REQUIRES: mutex_ is held -Status DBImpl::BeginWrite(Writer* w, uint64_t expiration_time) { - // the following code block pushes the current writer "w" into the writer - // queue "writers_" and wait until one of the following conditions met: - // 1. the job of "w" has been done by some other writers. - // 2. "w" becomes the first writer in "writers_" - // 3. "w" timed-out. - mutex_.AssertHeld(); - writers_.push_back(w); - - bool timed_out = false; - while (!w->done && w != writers_.front()) { - if (expiration_time == 0) { - w->cv.Wait(); - } else if (w->cv.TimedWait(expiration_time)) { - if (w->in_batch_group) { - // then it means the front writer is currently doing the - // write on behalf of this "timed-out" writer. Then it - // should wait until the write completes. 
- expiration_time = 0; - } else { - timed_out = true; - break; - } - } - } - - if (timed_out) { -#ifndef NDEBUG - bool found = false; -#endif - for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { - if (*iter == w) { - writers_.erase(iter); -#ifndef NDEBUG - found = true; -#endif - break; - } - } -#ifndef NDEBUG - assert(found); -#endif - // writers_.front() might still be in cond_wait without a time-out. - // As a result, we need to signal it to wake it up. Otherwise no - // one else will wake him up, and RocksDB will hang. - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } - return Status::TimedOut(); - } - return Status::OK(); -} - -// REQUIRES: mutex_ is held -void DBImpl::EndWrite(Writer* w, Writer* last_writer, Status status) { - // Pop out the current writer and all writers being pushed before the - // current writer from the writer queue. - mutex_.AssertHeld(); - while (!writers_.empty()) { - Writer* ready = writers_.front(); - writers_.pop_front(); - if (ready != w) { - ready->status = status; - ready->done = true; - ready->cv.Signal(); - } - if (ready == last_writer) break; - } - - // Notify new head of write queue - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } -} - Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } PERF_TIMER_GUARD(write_pre_and_post_process_time); - Writer w(&mutex_); + WriteThread::Writer w(&mutex_); w.batch = my_batch; w.sync = options.sync; w.disableWAL = options.disableWAL; @@ -3983,7 +3891,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t expiration_time = 0; bool has_timeout = false; if (w.timeout_hint_us == 0) { - w.timeout_hint_us = kNoTimeOut; + w.timeout_hint_us = WriteThread::kNoTimeOut; } else { expiration_time = env_->NowMicros() + w.timeout_hint_us; has_timeout = true; @@ -3996,7 +3904,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { WriteContext context; mutex_.Lock(); - Status status = BeginWrite(&w, expiration_time); + Status status = write_thread_.EnterWriteThread(&w, expiration_time); assert(status.ok() || status.IsTimedOut()); if (status.IsTimedOut()) { mutex_.Unlock(); @@ -4066,10 +3974,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } uint64_t last_sequence = versions_->LastSequence(); - Writer* last_writer = &w; + WriteThread::Writer* last_writer = &w; if (status.ok()) { autovector write_batch_group; - BuildBatchGroup(&last_writer, &write_batch_group); + write_thread_.BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -4161,7 +4069,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { bg_error_ = status; // stop compaction & fail any further writes } - EndWrite(&w, last_writer, status); + write_thread_.ExitWriteThread(&w, last_writer, status); mutex_.Unlock(); if (status.IsTimedOut()) { @@ -4171,68 +4079,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { return status; } -// This function will be called only when the first writer succeeds. -// All writers in the to-be-built batch group will be processed. 
-// -// REQUIRES: Writer list must be non-empty -// REQUIRES: First writer must have a non-nullptr batch -void DBImpl::BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group) { - assert(!writers_.empty()); - Writer* first = writers_.front(); - assert(first->batch != nullptr); - - size_t size = WriteBatchInternal::ByteSize(first->batch); - write_batch_group->push_back(first->batch); - - // Allow the group to grow up to a maximum size, but if the - // original write is small, limit the growth so we do not slow - // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128<<10)) { - max_size = size + (128<<10); - } - - *last_writer = first; - std::deque::iterator iter = writers_.begin(); - ++iter; // Advance past "first" - for (; iter != writers_.end(); ++iter) { - Writer* w = *iter; - if (w->sync && !first->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (!w->disableWAL && first->disableWAL) { - // Do not include a write that needs WAL into a batch that has - // WAL disabled. - break; - } - - if (w->timeout_hint_us < first->timeout_hint_us) { - // Do not include those writes with shorter timeout. Otherwise, we might - // execute a write that should instead be aborted because of timeout. - break; - } - - if (w->batch == nullptr) { - // Do not include those writes with nullptr batch. Those are not writes, - // those are something else. They want to be alone - break; - } - - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; - } - - write_batch_group->push_back(w->batch); - w->in_batch_group = true; - *last_writer = w; - } -} - // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue void DBImpl::DelayWrite(uint64_t expiration_time) { diff --git a/db/db_impl.h b/db/db_impl.h index 0336b3af5..d2b0dfc94 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -34,6 +34,7 @@ #include "db/internal_stats.h" #include "db/write_controller.h" #include "db/flush_scheduler.h" +#include "db/write_thread.h" namespace rocksdb { @@ -359,44 +360,6 @@ class DBImpl : public DB { Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); - // Information kept for every waiting writer - struct Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) : cv(mu) {} - }; - - // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) - // thread should grab the mutex_ and be the first on writers queue. - // BeginWrite is used for it. - // Be aware! Writer's job can be done by other thread (see DBImpl::Write - // for examples), so check it via w.done before applying changes. - // - // Writer* w: writer to be placed in the queue - // uint64_t expiration_time: maximum time to be in the queue - // See also: EndWrite - Status BeginWrite(Writer* w, uint64_t expiration_time); - - // After doing write job, we need to remove already used writers from - // writers_ queue and notify head of the queue about it. - // EndWrite is used for this. 
- // - // Writer* w: Writer, that was added by BeginWrite function - // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write - // does) - // we should pass last_writer as a parameter to - // EndWrite - // (if you don't touch other writers, just pass w) - // Status status: Status of write operation - // See also: BeginWrite - void EndWrite(Writer* w, Writer* last_writer, Status status); void DelayWrite(uint64_t expiration_time); @@ -405,9 +368,6 @@ class DBImpl : public DB { Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, WriteContext* context); - void BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group); - // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); @@ -552,8 +512,8 @@ class DBImpl : public DB { std::unique_ptr db_directory_; - // Queue of writers. - std::deque writers_; + WriteThread write_thread_; + WriteBatch tmp_batch_; WriteController write_controller_; @@ -627,7 +587,6 @@ class DBImpl : public DB { bool flush_on_destroy_; // Used when disableWAL is true. static const int KEEP_LOG_FILE_NUM = 1000; - static const uint64_t kNoTimeOut = std::numeric_limits::max(); std::string db_absolute_path_; // The options to access storage files diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 3446571eb..6c073d4d5 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -140,21 +140,15 @@ void DBImpl::TEST_UnlockMutex() { } void* DBImpl::TEST_BeginWrite() { - auto w = new Writer(&mutex_); - w->batch = nullptr; - w->sync = false; - w->disableWAL = false; - w->in_batch_group = false; - w->done = false; - w->timeout_hint_us = kNoTimeOut; - Status s = BeginWrite(w, 0); + auto w = new WriteThread::Writer(&mutex_); + Status s = write_thread_.EnterWriteThread(w, 0); assert(s.ok() && !w->done); // No timeout and nobody should do our job return reinterpret_cast(w); } void DBImpl::TEST_EndWrite(void* w) { - auto writer = reinterpret_cast(w); - EndWrite(writer, writer, Status::OK()); + auto writer = reinterpret_cast(w); + write_thread_.ExitWriteThread(writer, writer, Status::OK()); delete writer; } diff --git a/db/write_thread.cc b/db/write_thread.cc new file mode 100644 index 000000000..052e1209e --- /dev/null +++ b/db/write_thread.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/write_thread.h" + +namespace rocksdb { + +Status WriteThread::EnterWriteThread(WriteThread::Writer* w, + uint64_t expiration_time) { + // the following code block pushes the current writer "w" into the writer + // queue "writers_" and wait until one of the following conditions met: + // 1. the job of "w" has been done by some other writers. + // 2. "w" becomes the first writer in "writers_" + // 3. "w" timed-out. + writers_.push_back(w); + + bool timed_out = false; + while (!w->done && w != writers_.front()) { + if (expiration_time == 0) { + w->cv.Wait(); + } else if (w->cv.TimedWait(expiration_time)) { + if (w->in_batch_group) { + // then it means the front writer is currently doing the + // write on behalf of this "timed-out" writer. Then it + // should wait until the write completes. 
+ expiration_time = 0; + } else { + timed_out = true; + break; + } + } + } + + if (timed_out) { +#ifndef NDEBUG + bool found = false; +#endif + for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { + if (*iter == w) { + writers_.erase(iter); +#ifndef NDEBUG + found = true; +#endif + break; + } + } +#ifndef NDEBUG + assert(found); +#endif + // writers_.front() might still be in cond_wait without a time-out. + // As a result, we need to signal it to wake it up. Otherwise no + // one else will wake him up, and RocksDB will hang. + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + return Status::TimedOut(); + } + return Status::OK(); +} + +void WriteThread::ExitWriteThread(WriteThread::Writer* w, + WriteThread::Writer* last_writer, + Status status) { + // Pop out the current writer and all writers being pushed before the + // current writer from the writer queue. + while (!writers_.empty()) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } +} + +// This function will be called only when the first writer succeeds. +// All writers in the to-be-built batch group will be processed. +// +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->timeout_hint_us < first->timeout_hint_us) { + // Do not include those writes with shorter timeout. Otherwise, we might + // execute a write that should instead be aborted because of timeout. + break; + } + + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. They want to be alone + break; + } + + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + w->in_batch_group = true; + *last_writer = w; + } +} + +} // namespace rocksdb diff --git a/db/write_thread.h b/db/write_thread.h new file mode 100644 index 000000000..8c5baa664 --- /dev/null +++ b/db/write_thread.h @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include "rocksdb/status.h" +#include "db/write_batch_internal.h" +#include "util/autovector.h" +#include "port/port.h" + +namespace rocksdb { + +class WriteThread { + public: + static const uint64_t kNoTimeOut = std::numeric_limits::max(); + // Information kept for every waiting writer + struct Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + port::CondVar cv; + + explicit Writer(port::Mutex* mu) + : batch(nullptr), + sync(false), + disableWAL(false), + in_batch_group(false), + done(false), + timeout_hint_us(kNoTimeOut), + cv(mu) {} + }; + + WriteThread() = default; + ~WriteThread() = default; + + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) + // thread should grab the mutex_ and be the first on writers queue. + // EnterWriteThread is used for it. + // Be aware! Writer's job can be done by other thread (see DBImpl::Write + // for examples), so check it via w.done before applying changes. + // + // Writer* w: writer to be placed in the queue + // uint64_t expiration_time: maximum time to be in the queue + // See also: ExitWriteThread + // REQUIRES: db mutex held + Status EnterWriteThread(Writer* w, uint64_t expiration_time); + + // After doing write job, we need to remove already used writers from + // writers_ queue and notify head of the queue about it. + // ExitWriteThread is used for this. + // + // Writer* w: Writer, that was added by EnterWriteThread function + // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write + // does) + // we should pass last_writer as a parameter to + // ExitWriteThread + // (if you don't touch other writers, just pass w) + // Status status: Status of write operation + // See also: EnterWriteThread + // REQUIRES: db mutex held + void ExitWriteThread(Writer* w, Writer* last_writer, Status status); + + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + + private: + // Queue of writers. 
+ std::deque writers_; +}; + +} // namespace rocksdb From add22e35159b4edde934740a84b417882b18cddf Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 12 Sep 2014 16:25:35 -0700 Subject: [PATCH 073/829] standardize scripts to run RocksDB benchmarks Summary: Hope these scripts will allow people to run/repro benchmark easily I think it is time to re-run flash benchmarks and report results Please comment if any other benchmark runs are needed Test Plan: ran it Reviewers: yhchiang, igor, sdong Reviewed By: igor Subscribers: dhruba, MarkCallaghan, leveldb Differential Revision: https://reviews.facebook.net/D23139 --- tools/benchmark.sh | 205 +++++++++++++++++++++++++++++++++++++++ tools/run_flash_bench.sh | 45 +++++++++ 2 files changed, 250 insertions(+) create mode 100755 tools/benchmark.sh create mode 100755 tools/run_flash_bench.sh diff --git a/tools/benchmark.sh b/tools/benchmark.sh new file mode 100755 index 000000000..cde545801 --- /dev/null +++ b/tools/benchmark.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# REQUIRE: db_bench binary exists in the current directory + +if [ $# -ne 1 ]; then + echo "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/readrandom/readwhilewriting]" + exit 0 +fi + +# size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +if [ -z $DB_DIR ]; then + echo "DB_DIR is not defined" + exit 0 +fi + +if [ -z $WAL_DIR ]; then + echo "WAL_DIR is not defined" + exit 0 +fi + +output_dir=${OUTPUT_DIR:-/tmp/} +if [ ! -d $output_dir ]; then + mkdir -p $output_dir +fi + +num_read_threads=${NUM_READ_THREADS:-16} +writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) +cache_size=$((16 * G)) +duration=${DURATION:-0} + +num_keys=${NUM_KEYS:-$((1 * G))} +key_size=20 +value_size=800 + +const_params=" + --db=$DB_DIR \ + --wal_dir=$WAL_DIR \ + \ + --num_levels=6 \ + --key_size=$key_size \ + --value_size=$value_size \ + --block_size=4096 \ + --cache_size=$cache_size \ + --cache_numshardbits=6 \ + --compression_type=snappy \ + --compression_ratio=0.5 \ + \ + --hard_rate_limit=2 \ + --rate_limit_delay_max_milliseconds=1000000 \ + --write_buffer_size=$((128 * M)) \ + --max_write_buffer_number=2 \ + --target_file_size_base=$((128 * M)) \ + --max_bytes_for_level_base=$((1 * G)) \ + \ + --sync=0 \ + --disable_data_sync=1 \ + --verify_checksum=1 \ + --delete_obsolete_files_period_micros=$((60 * M)) \ + --max_grandparent_overlap_factor=10 \ + \ + --statistics=1 \ + --stats_per_interval=1 \ + --stats_interval=$((1 * M)) \ + --histogram=1 \ + \ + --memtablerep=skip_list \ + --bloom_bits=10 \ + --open_files=$((20 * K))" + +l0_config=" + --level0_file_num_compaction_trigger=8 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=24" + +if [ $duration -gt 0 ]; then + const_params="$const_params --duration=$duration" +fi + +params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1" +params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16" +params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \ + --level0_file_num_compaction_trigger=$((100 * M)) \ + --level0_slowdown_writes_trigger=$((100 * M)) \ + --level0_stop_writes_trigger=$((100 * M))" + +function run_bulkload { + echo "Bulk loading $num_keys random keys into database..." 
+ cmd="./db_bench $params_bulkload --benchmarks=fillrandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --disable_data_sync=1 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" + echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log + eval $cmd + echo "Compacting..." + cmd="./db_bench $params_w --benchmarks=compact \ + --use_existing_db=1 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --disable_data_sync=1 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" + echo $cmd | tee $output_dir/benchmark_bulkload_compact.log + eval $cmd +} + +function run_fillseq { + echo "Loading $num_keys keys sequentially into database..." + cmd="./db_bench $params_w --benchmarks=fillseq \ + --use_existing_db=0 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_fillseq.log" + echo $cmd | tee $output_dir/benchmark_fillseq.log + eval $cmd +} + +function run_overwrite { + echo "Loading $num_keys keys sequentially into database..." + cmd="./db_bench $params_w --benchmarks=overwrite \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_overwrite.log" + echo $cmd | tee $output_dir/benchmark_overwrite.log + eval $cmd +} + +function run_filluniquerandom { + echo "Loading $num_keys unique keys randomly into database..." + cmd="./db_bench $params_w --benchmarks=filluniquerandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --threads=1 2>&1 | tee $output_dir/benchmark_filluniquerandom.log" + echo $cmd | tee $output_dir/benchmark_filluniquerandom.log + eval $cmd +} + +function run_readrandom { + echo "Reading $num_keys random keys from database..." + cmd="./db_bench $params_r --benchmarks=readrandom \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=$num_read_threads \ + --disable_auto_compactions=1 \ + 2>&1 | tee $output_dir/benchmark_readrandom.log" + echo $cmd | tee $output_dir/benchmark_readrandom.log + eval $cmd +} + +function run_readwhilewriting { + echo "Reading $num_keys random keys from database whiling writing.." + cmd="./db_bench $params_r --benchmarks=readwhilewriting \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=$num_read_threads \ + --writes_per_second=$writes_per_second \ + 2>&1 | tee $output_dir/benchmark_readwhilewriting.log" + echo $cmd | tee $output_dir/benchmark_readwhilewriting.log + eval $cmd +} + +function now() { + echo `date +"%s"` +} + +report="$output_dir/report.txt" + +# print start time +echo "===== Benchmark =====" + +# Run!!! 
+IFS=',' read -a jobs <<< $1 +for job in ${jobs[@]}; do + echo "Start $job at `date`" | tee -a $report + start=$(now) + if [ $job = bulkload ]; then + run_bulkload + elif [ $job = fillseq ]; then + run_fillseq + elif [ $job = overwrite ]; then + run_overwrite + elif [ $job = filluniquerandom ]; then + run_filluniquerandom + elif [ $job = readrandom ]; then + run_readrandom + elif [ $job = readwhilewriting ]; then + run_readwhilewriting + else + echo "unknown job $job" + exit + fi + end=$(now) + + echo "Complete $job in $((end-start)) seconds" | tee -a $report + if [[ $job = readrandom || $job = readwhilewriting ]]; then + qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') + line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log") + p50=$(echo $line | awk '{print $7}') + p99=$(echo $line | awk '{print $13}') + echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report + echo "QPS = $qps ops/sec" | tee -a $report + fi +done diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh new file mode 100755 index 000000000..be7d1631f --- /dev/null +++ b/tools/run_flash_bench.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# REQUIRE: benchmark.sh exists in the current directory +# After execution of this script, log files are generated in $output_dir. +# report.txt provides a high level statistics + +# Size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +n=$((1 * G)) +wps=$((80 * K)) +duration=$((6 * 60 * 60)) +num_read_threads=24 + +# Update these parameters before execution !!! +db_dir="/tmp/rocksdb/" +wal_dir="/tmp/rocksdb/" +output_dir="/tmp/output" + +# Test 1: bulk load +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh bulkload + +# Test 2: sequential fill +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh fillseq + +# Test 3: overwrite +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh overwrite + +# Prepare: populate DB with random data +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh filluniquerandom + +# Test 4: random read +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads \ + ./benchmark.sh readrandom + +# Test 5: random read while writing +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ + ./benchmark.sh readwhilewriting From 04ce1b25f3d7623c62cd5771cd8391d02da7e31a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sat, 13 Sep 2014 14:06:22 -0700 Subject: [PATCH 074/829] Fix #284 --- db/write_controller.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/write_controller.h b/db/write_controller.h index 4ed221df1..32e1d58f1 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -47,7 +47,7 @@ class WriteControllerToken { public: explicit WriteControllerToken(WriteController* controller) : controller_(controller) {} - virtual ~WriteControllerToken() = default; + virtual ~WriteControllerToken() {} protected: WriteController* controller_; @@ -62,14 +62,14 @@ class StopWriteToken : public WriteControllerToken { public: explicit StopWriteToken(WriteController* controller) : WriteControllerToken(controller) {} - ~StopWriteToken(); + virtual ~StopWriteToken(); }; class DelayWriteToken : public WriteControllerToken { public: DelayWriteToken(WriteController* controller, uint64_t 
delay_us) : WriteControllerToken(controller), delay_us_(delay_us) {} - ~DelayWriteToken(); + virtual ~DelayWriteToken(); private: uint64_t delay_us_; From 28be16b1dc60db6cb29503ce726a582d7f52a915 Mon Sep 17 00:00:00 2001 From: yinqiwen Date: Mon, 15 Sep 2014 20:43:38 +0800 Subject: [PATCH 075/829] fix rate limiter crash #286 --- util/rate_limiter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index cde86f3c9..47f96de84 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -60,7 +60,7 @@ GenericRateLimiter::~GenericRateLimiter() { } void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri) { - assert(bytes < refill_bytes_per_period_); + assert(bytes <= refill_bytes_per_period_); MutexLock g(&request_mutex_); if (stop_) { From 4a27a2f19393af173afd016c7bf6d0b69a99ea79 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 15 Sep 2014 11:32:01 -0700 Subject: [PATCH 076/829] Don't sync manifest when disableDataSync = true Summary: As we discussed offline Test Plan: compiles Reviewers: yhchiang, sdong, ljin, dhruba Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22989 --- HISTORY.md | 1 + db/db_test.cc | 32 +++++++++++++++++++++++++++++++- db/version_set.cc | 4 ++-- include/rocksdb/options.h | 2 +- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 80cac265b..7a05c54e8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ ### Behavior changes * We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. +* When disableDataSync=true, we no longer sync the MANIFEST file. 
----- Past Releases ----- diff --git a/db/db_test.cc b/db/db_test.cc index d14dcab71..f33ab5ad2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -151,6 +151,8 @@ class SpecialEnv : public EnvWrapper { std::atomic bytes_written_; + std::atomic sync_counter_; + explicit SpecialEnv(Env* base) : EnvWrapper(base) { delay_sstable_sync_.Release_Store(nullptr); drop_writes_.Release_Store(nullptr); @@ -162,6 +164,7 @@ class SpecialEnv : public EnvWrapper { manifest_write_error_.Release_Store(nullptr); log_write_error_.Release_Store(nullptr); bytes_written_ = 0; + sync_counter_ = 0; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -190,6 +193,7 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { + ++env_->sync_counter_; while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { env_->SleepForMicroseconds(100000); } @@ -216,6 +220,7 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { + ++env_->sync_counter_; if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { return Status::IOError("simulated sync error"); } else { @@ -239,7 +244,10 @@ class SpecialEnv : public EnvWrapper { } Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } - Status Sync() { return base_->Sync(); } + Status Sync() { + ++env_->sync_counter_; + return base_->Sync(); + } }; if (non_writable_.Acquire_Load() != nullptr) { @@ -8379,6 +8387,28 @@ TEST(DBTest, WriteSingleThreadEntry) { } } +TEST(DBTest, DisableDataSyncTest) { + // iter 0 -- no sync + // iter 1 -- sync + for (int iter = 0; iter < 2; ++iter) { + Options options = CurrentOptions(); + options.disableDataSync = iter == 0; + options.create_if_missing = true; + options.env = env_; + Reopen(&options); + CreateAndReopenWithCF({"pikachu"}, &options); + + MakeTables(10, "a", "z"); + Compact("a", "z"); + + if (iter == 0) { + ASSERT_EQ(env_->sync_counter_.load(), 0); + } else { + ASSERT_GT(env_->sync_counter_.load(), 0); + } + Destroy(&options); + } +} } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index bd3d1b81c..dbf055ce2 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1891,7 +1891,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } } - if (s.ok()) { + if (s.ok() && db_options_->disableDataSync == false) { if (db_options_->use_fsync) { StopWatch sw(env_, db_options_->statistics.get(), MANIFEST_FILE_SYNC_MICROS); @@ -1928,7 +1928,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // new CURRENT file that points to it. if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_directory); + db_options_->disableDataSync ? nullptr : db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file Log(db_options_->info_log, diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index acab6f992..36c1108b1 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -610,7 +610,7 @@ struct DBOptions { // it does not use any locks to prevent concurrent updates. std::shared_ptr statistics; - // If true, then the contents of data files are not synced + // If true, then the contents of manifest and data files are not synced // to stable storage. Their contents remain in the OS buffers till the // OS decides to flush them. 
This option is good for bulk-loading // of data. Once the bulk-loading is complete, please issue a From acb9348ff3de48218bbd27068923849d8cee342d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 15 Sep 2014 12:34:12 -0700 Subject: [PATCH 077/829] [Java] Include WriteBatch into RocksDBSample.java, fix how DbBenchmark.java handles WriteBatch. Summary: Include WriteBatch into RocksDBSample.java, fix how DbBenchmark.java handles WriteBatch. Previously DbBenchmark.java does not use WriteBatch when benchmarks is set to fillbatch. Test Plan: make rocksdbjava -j32 make jtest make jdb_bench cd java ./jdb_bench.sh --benchmarks=fillbatch Reviewers: naveenatceg, ljin, sdong, ankgup87 Reviewed By: ankgup87 Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D22983 --- java/RocksDBSample.java | 25 +++++++++++++++++++++ java/org/rocksdb/benchmark/DbBenchmark.java | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index 72da4b5e8..9ec3d8345 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -72,6 +72,8 @@ public class RocksDBSample { assert(options.memTableFactoryName().equals("SkipListFactory")); options.setTableFormatConfig(new PlainTableConfig()); + // Plain-Table requires mmap read + options.setAllowMmapReads(true); assert(options.tableFactoryName().equals("PlainTable")); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); @@ -121,6 +123,29 @@ public class RocksDBSample { System.out.println(""); } + // write batch test + WriteOptions writeOpt = new WriteOptions(); + for (int i = 10; i <= 19; ++i) { + WriteBatch batch = new WriteBatch(); + for (int j = 10; j <= 19; ++j) { + batch.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + db.write(writeOpt, batch); + batch.dispose(); + } + for (int i = 10; i <= 19; ++i) { + for (int j = 10; j <= 19; ++j) { + assert(new String( + db.get(String.format("%dx%d", i, j).getBytes())).equals( + String.format("%d", i * j))); + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); + } + writeOpt.dispose(); + value = db.get("1x1".getBytes()); assert(value != null); value = db.get("world".getBytes()); diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index b715f9af1..686d39445 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -255,7 +255,7 @@ public class DbBenchmark { for (long j = 0; j < entriesPerBatch_; j++) { getKey(key, i + j, keyRange_); DbBenchmark.this.gen_.generate(value); - db_.put(writeOpt_, key, value); + batch.put(key, value); stats_.finishedSingleOp(keySize_ + valueSize_); } db_.write(writeOpt_, batch); From 49aacd8d2b84cf92658983cabe6fc62491c143ba Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 15 Sep 2014 15:30:17 -0700 Subject: [PATCH 078/829] Fix make install Summary: See https://github.com/facebook/rocksdb/issues/283 Test Plan: make install/uninstall Reviewers: ljin, sdong, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23373 --- Makefile | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index f37bcf5cc..260a51d1a 100644 --- a/Makefile +++ b/Makefile @@ -53,26 +53,20 @@ endif INSTALL_PATH ?= /usr/local uninstall: - rm -rf $(INSTALL_PATH)/include/rocksdb - if [ -a $(LIBRARY) ]; 
then \ - rm -rf $(INSTALL_PATH)/lib/$(LIBRARY); \ - fi - if [ -a $(SHARED) ]; then \ - rm -rf $(INSTALL_PATH)/lib/$(SHARED); \ - fi + @rm -rf $(INSTALL_PATH)/include/rocksdb + @rm -rf $(INSTALL_PATH)/lib/$(LIBRARY) + @rm -rf $(INSTALL_PATH)/lib/$(SHARED) install: - install -d $(INSTALL_PATH)/include/rocksdb - install -d $(INSTALL_PATH)/lib - for header in `find "include/rocksdb" -type f -name *.h`; do \ - install -C -m 644 -D $$header $(INSTALL_PATH)/$$header; \ + @install -d $(INSTALL_PATH)/lib + @for header_dir in `find "include/rocksdb" -type d`; do \ + install -d $(INSTALL_PATH)/$$header_dir; \ done - if [ -a $(LIBRARY) ]; then \ - install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib/.; \ - fi; - if [ -a $(SHARED) ]; then \ - install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib/.; \ - fi; + @for header in `find "include/rocksdb" -type f -name *.h`; do \ + install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ + done + @[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib + @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- WARNING_FLAGS = -Wall -Werror -Wsign-compare From faad439ac40d2c4591faf61d314caeac4f9adb1e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 16 Sep 2014 10:30:32 -0700 Subject: [PATCH 079/829] Fix #284 Summary: This work on my compiler, but it turns out some compilers don't implicitly add constness, see: https://github.com/facebook/rocksdb/issues/284. This diff adds constness explicitly. Test Plan: still compiles Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23409 --- table/full_filter_block.cc | 4 ++-- table/full_filter_block.h | 4 ++-- util/bloom.cc | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 8a481b7d0..4ccc2e2b4 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -47,7 +47,7 @@ inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { Slice FullFilterBlockBuilder::Finish() { if (num_added_ != 0) { num_added_ = 0; - return filter_bits_builder_->Finish(&filter_data); + return filter_bits_builder_->Finish(&filter_data_); } return Slice(); } @@ -64,7 +64,7 @@ FullFilterBlockReader::FullFilterBlockReader( filter_bits_reader_.reset(filter_bits_reader); if (delete_contents_after_use) { - filter_data.reset(contents.data()); + filter_data_.reset(contents.data()); } } diff --git a/table/full_filter_block.h b/table/full_filter_block.h index 24d20e032..46ba5d1de 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -56,7 +56,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { uint32_t num_added_; std::unique_ptr filter_bits_builder_; - std::unique_ptr filter_data; + std::unique_ptr filter_data_; void AddKey(const Slice& key); void AddPrefix(const Slice& key); @@ -95,7 +95,7 @@ class FullFilterBlockReader : public FilterBlockReader { std::unique_ptr filter_bits_reader_; Slice contents_; - std::unique_ptr filter_data; + std::unique_ptr filter_data_; bool MayMatch(const Slice& entry); diff --git a/util/bloom.cc b/util/bloom.cc index f19e2a670..19d8edead 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -54,9 +54,8 @@ class FullFilterBitsBuilder : public FilterBitsBuilder { // | ... 
| num_probes : 1 byte | num_lines : 4 bytes | // +----------------------------------------------------------------+ virtual Slice Finish(std::unique_ptr* buf) override { - char* data = nullptr; uint32_t total_bits, num_lines; - data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); + char* data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); assert(data); if (total_bits != 0 && num_lines != 0) { @@ -67,7 +66,8 @@ class FullFilterBitsBuilder : public FilterBitsBuilder { data[total_bits/8] = static_cast(num_probes_); EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); - buf->reset(data); + const char* const_data = data; + buf->reset(const_data); hash_entries_.clear(); return Slice(data, total_bits / 8 + 5); From f090575e4307cdffaa2be8b31f4f24d4e827c4de Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 17 Sep 2014 01:16:17 -0700 Subject: [PATCH 080/829] Replaced "built on on earlier work" by "built on earlier work" in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bda801fd7..916bdecde 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast From f9eaaa66e6e087d542f4fb2c325df02ab5cab916 Mon Sep 17 00:00:00 2001 From: Saghm Rossi Date: Wed, 17 Sep 2014 15:15:53 -0400 Subject: [PATCH 081/829] added include for inttypes.h to fix nonworking printf statements --- util/cache_bench.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/util/cache_bench.cc b/util/cache_bench.cc index 985eb06a3..3d006ecf8 100644 --- a/util/cache_bench.cc +++ b/util/cache_bench.cc @@ -14,6 +14,7 @@ int main() { } #else +#include #include #include #include From 94e43a1dfec18c00679deaaab8ed90003355405a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 17 Sep 2014 12:30:06 -0700 Subject: [PATCH 082/829] [Java] Fixed 32-bit overflowing issue when converting jlong to size_t Summary: Fixed 32-bit overflowing issue when converting jlong to size_t by capping jlong to std::numeric_limits::max(). Test Plan: make rocksdbjava make jtest Reviewers: ankgup87, ljin, sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23511 --- java/rocksjni/memtablejni.cc | 7 ++++--- java/rocksjni/options.cc | 19 ++++++++++--------- java/rocksjni/portal.h | 7 +++++++ java/rocksjni/write_batch.cc | 2 +- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index a0d50f5f5..9b0dc252c 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -5,6 +5,7 @@ // // This file implements the "bridge" between Java and C++ for MemTables. 
+#include "rocksjni/portal.h" #include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h" #include "include/org_rocksdb_VectorMemTableConfig.h" @@ -20,7 +21,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count, jint jheight, jint jbranching_factor) { return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - static_cast(jbucket_count), + rocksdb::jlong_to_size_t(jbucket_count), static_cast(jheight), static_cast(jbranching_factor))); } @@ -33,7 +34,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - static_cast(jbucket_count))); + rocksdb::jlong_to_size_t(jbucket_count))); } /* @@ -44,7 +45,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jreserved_size) { return reinterpret_cast(new rocksdb::VectorRepFactory( - static_cast(jreserved_size))); + rocksdb::jlong_to_size_t(jreserved_size))); } /* diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index da420c78f..a72eecd28 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -70,7 +70,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing( void Java_org_rocksdb_Options_setWriteBufferSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { reinterpret_cast(jhandle)->write_buffer_size = - static_cast(jwrite_buffer_size); + rocksdb::jlong_to_size_t(jwrite_buffer_size); } @@ -362,7 +362,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( void Java_org_rocksdb_Options_setMaxLogFileSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { reinterpret_cast(jhandle)->max_log_file_size = - static_cast(max_log_file_size); + rocksdb::jlong_to_size_t(max_log_file_size); } /* @@ -383,7 +383,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll( void Java_org_rocksdb_Options_setLogFileTimeToRoll( JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { reinterpret_cast(jhandle)->log_file_time_to_roll = - static_cast(log_file_time_to_roll); + rocksdb::jlong_to_size_t(log_file_time_to_roll); } /* @@ -404,7 +404,7 @@ jlong Java_org_rocksdb_Options_keepLogFileNum( void Java_org_rocksdb_Options_setKeepLogFileNum( JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { reinterpret_cast(jhandle)->keep_log_file_num = - static_cast(keep_log_file_num); + rocksdb::jlong_to_size_t(keep_log_file_num); } /* @@ -509,7 +509,8 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit( void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform(static_cast(jprefix_length))); + rocksdb::NewFixedPrefixTransform( + rocksdb::jlong_to_size_t(jprefix_length))); } /* @@ -573,7 +574,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( void Java_org_rocksdb_Options_setManifestPreallocationSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { reinterpret_cast(jhandle)->manifest_preallocation_size = - static_cast(preallocation_size); + rocksdb::jlong_to_size_t(preallocation_size); } /* @@ -1245,7 +1246,7 @@ jlong 
Java_org_rocksdb_Options_arenaBlockSize( void Java_org_rocksdb_Options_setArenaBlockSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { reinterpret_cast(jhandle)->arena_block_size = - static_cast(jarena_block_size); + rocksdb::jlong_to_size_t(jarena_block_size); } /* @@ -1410,7 +1411,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( jlong jinplace_update_num_locks) { reinterpret_cast( jhandle)->inplace_update_num_locks = - static_cast(jinplace_update_num_locks); + rocksdb::jlong_to_size_t(jinplace_update_num_locks); } /* @@ -1501,7 +1502,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_successive_merges) { reinterpret_cast(jhandle)->max_successive_merges = - static_cast(jmax_successive_merges); + rocksdb::jlong_to_size_t(jmax_successive_merges); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 28fe754f0..4c7a8b9b9 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -11,12 +11,19 @@ #define JAVA_ROCKSJNI_PORTAL_H_ #include +#include #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/utilities/backupable_db.h" namespace rocksdb { +inline size_t jlong_to_size_t(const jlong& jvalue) { + return static_cast(jvalue) <= + static_cast(std::numeric_limits::max()) ? + static_cast(jvalue) : std::numeric_limits::max(); +} + // The portal class for org.rocksdb.RocksDB class RocksDBJni { public: diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index a2cb67016..ff94309fe 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -30,7 +30,7 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( JNIEnv* env, jobject jobj, jint jreserved_bytes) { rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( - static_cast(jreserved_bytes)); + rocksdb::jlong_to_size_t(jreserved_bytes)); rocksdb::WriteBatchJni::setHandle(env, jobj, wb); } From 60a4aa175e5ce4fc3fa9ba47c52e12ac9483b152 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 17 Sep 2014 12:31:53 -0700 Subject: [PATCH 083/829] Test use_mmap_reads Summary: We currently don't test mmap reads as part of db_test. Piggyback it on kWalDir test config. 
Test Plan: make check Reviewers: ljin, sdong, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23337 --- db/db_test.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index f33ab5ad2..796792b22 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -342,7 +342,7 @@ class DBTest { kUncompressed = 11, kNumLevel_3 = 12, kDBLogDir = 13, - kWalDir = 14, + kWalDirAndMmapReads = 14, kManifestFileSize = 15, kCompactOnFlush = 16, kPerfOptions = 17, @@ -377,6 +377,7 @@ class DBTest { kSkipNoSeekToLast = 32, kSkipHashCuckoo = 64, kSkipFIFOCompaction = 128, + kSkipMmapReads = 256, }; @@ -436,6 +437,10 @@ class DBTest { option_config_ == kFIFOCompaction) { continue; } + if ((skip_mask & kSkipMmapReads) && + option_config_ == kWalDirAndMmapReads) { + continue; + } break; } @@ -539,8 +544,11 @@ class DBTest { case kDBLogDir: options.db_log_dir = test::TmpDir(); break; - case kWalDir: + case kWalDirAndMmapReads: options.wal_dir = test::TmpDir() + "/wal"; + // mmap reads should be orthogonal to WalDir setting, so we piggyback to + // this option config to test mmap reads as well + options.allow_mmap_reads = true; break; case kManifestFileSize: options.max_manifest_file_size = 50; // 50 bytes @@ -1675,8 +1683,8 @@ TEST(DBTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | - kSkipHashCuckoo)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); } // A delete is skipped for key if KeyMayExist(key) returns False From e4eca6a1e5738f670deef6ded4c9020567e855ed Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 17 Sep 2014 12:46:32 -0700 Subject: [PATCH 084/829] Options conversion function for convenience Summary: as title Test Plan: options_test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23283 --- include/rocksdb/options.h | 7 + util/options_helper.cc | 292 ++++++++++++++++++++++++++++++++++++++ util/options_test.cc | 172 ++++++++++++++++++++++ 3 files changed, 471 insertions(+) create mode 100644 util/options_helper.cc diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 36c1108b1..2c9734d24 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "rocksdb/version.h" #include "rocksdb/universal_compaction.h" @@ -1012,6 +1013,12 @@ extern Options GetOptions(size_t total_write_buffer_limit, int read_amplification_threshold = 8, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); + +bool GetOptionsFromStrings( + const Options& base_options, + const std::unordered_map& options_map, + Options* new_options); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/util/options_helper.cc b/util/options_helper.cc new file mode 100644 index 000000000..a4d46ccb0 --- /dev/null +++ b/util/options_helper.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +namespace { +CompressionType ParseCompressionType(const std::string& type) { + if (type == "kNoCompression") { + return kNoCompression; + } else if (type == "kSnappyCompression") { + return kSnappyCompression; + } else if (type == "kZlibCompression") { + return kZlibCompression; + } else if (type == "kBZip2Compression") { + return kBZip2Compression; + } else if (type == "kLZ4Compression") { + return kLZ4Compression; + } else if (type == "kLZ4HCCompression") { + return kLZ4HCCompression; + } else { + throw "unknown compression type: " + type; + } + return kNoCompression; +} + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } else { + throw type; + } +} +uint32_t ParseInt(const std::string& value) { + return std::stoi(value); +} + +uint32_t ParseUint32(const std::string& value) { + return std::stoul(value); +} + +uint64_t ParseUint64(const std::string& value) { + return std::stoull(value); +} + +int64_t ParseInt64(const std::string& value) { + return std::stol(value); +} + +double ParseDouble(const std::string& value) { + return std::stod(value); +} + +CompactionStyle ParseCompactionStyle(const std::string& type) { + if (type == "kCompactionStyleLevel") { + return kCompactionStyleLevel; + } else if (type == "kCompactionStyleUniversal") { + return kCompactionStyleUniversal; + } else if (type == "kCompactionStyleFIFO") { + return kCompactionStyleFIFO; + } else { + throw "unknown compaction style: " + type; + } + return kCompactionStyleLevel; +} +} // anonymouse namespace + +bool GetOptionsFromStrings( + const Options& base_options, + const std::unordered_map& options_map, + Options* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : options_map) { + try { + if (o.first == "write_buffer_size") { + new_options->write_buffer_size = ParseInt64(o.second); + } else if (o.first == "max_write_buffer_number") { + new_options->max_write_buffer_number = ParseInt(o.second); + } else if (o.first == "min_write_buffer_number_to_merge") { + new_options->min_write_buffer_number_to_merge = ParseInt(o.second); + } else if (o.first == "compression") { + new_options->compression = ParseCompressionType(o.second); + } else if (o.first == "compression_per_level") { + new_options->compression_per_level.clear(); + size_t start = 0; + while (true) { + size_t end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start))); + break; + } else { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start, end - start))); + start = end + 1; + } + } + } else if (o.first == "compression_opts") { + size_t start = 0; + size_t end = o.second.find_first_of(':'); + if (end == std::string::npos) { + throw o.first; + } + new_options->compression_opts.window_bits = + ParseInt(o.second.substr(start, end - start)); + start = end + 1; + end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + throw o.first; + } + new_options->compression_opts.level = + ParseInt(o.second.substr(start, end - start)); + start = 
end + 1; + if (start >= o.second.size()) { + throw o.first; + } + new_options->compression_opts.strategy = + ParseInt(o.second.substr(start, o.second.size() - start)); + } else if (o.first == "num_levels") { + new_options->num_levels = ParseInt(o.second); + } else if (o.first == "level0_file_num_compaction_trigger") { + new_options->level0_file_num_compaction_trigger = ParseInt(o.second); + } else if (o.first == "level0_slowdown_writes_trigger") { + new_options->level0_slowdown_writes_trigger = ParseInt(o.second); + } else if (o.first == "level0_stop_writes_trigger") { + new_options->level0_stop_writes_trigger = ParseInt(o.second); + } else if (o.first == "max_mem_compaction_level") { + new_options->max_mem_compaction_level = ParseInt(o.second); + } else if (o.first == "target_file_size_base") { + new_options->target_file_size_base = ParseInt(o.second); + } else if (o.first == "target_file_size_multiplier") { + new_options->target_file_size_multiplier = ParseInt(o.second); + } else if (o.first == "max_bytes_for_level_base") { + new_options->max_bytes_for_level_base = ParseUint64(o.second); + } else if (o.first == "max_bytes_for_level_multiplier") { + new_options->max_bytes_for_level_multiplier = ParseInt(o.second); + } else if (o.first == "max_bytes_for_level_multiplier_additional") { + new_options->max_bytes_for_level_multiplier_additional.clear(); + size_t start = 0; + while (true) { + size_t end = o.second.find_first_of(':', start); + if (end == std::string::npos) { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(o.second.substr(start))); + break; + } else { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(o.second.substr(start, end - start))); + start = end + 1; + } + } + } else if (o.first == "expanded_compaction_factor") { + new_options->expanded_compaction_factor = ParseInt(o.second); + } else if (o.first == "source_compaction_factor") { + new_options->source_compaction_factor = ParseInt(o.second); + } else if (o.first == "max_grandparent_overlap_factor") { + new_options->max_grandparent_overlap_factor = ParseInt(o.second); + } else if (o.first == "soft_rate_limit") { + new_options->soft_rate_limit = ParseDouble(o.second); + } else if (o.first == "hard_rate_limit") { + new_options->hard_rate_limit = ParseDouble(o.second); + } else if (o.first == "arena_block_size") { + new_options->arena_block_size = ParseInt64(o.second); + } else if (o.first == "disable_auto_compactions") { + new_options->disable_auto_compactions = ParseBoolean(o.first, o.second); + } else if (o.first == "purge_redundant_kvs_while_flush") { + new_options->purge_redundant_kvs_while_flush = + ParseBoolean(o.first, o.second); + } else if (o.first == "compaction_style") { + new_options->compaction_style = ParseCompactionStyle(o.second); + } else if (o.first == "verify_checksums_in_compaction") { + new_options->verify_checksums_in_compaction = + ParseBoolean(o.first, o.second); + } else if (o.first == "compaction_options_universal") { + // TODO(ljin): add support + throw o.first; + } else if (o.first == "compaction_options_fifo") { + new_options->compaction_options_fifo.max_table_files_size + = ParseUint64(o.second); + } else if (o.first == "filter_deletes") { + new_options->filter_deletes = ParseBoolean(o.first, o.second); + } else if (o.first == "max_sequential_skip_in_iterations") { + new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); + } else if (o.first == "inplace_update_support") { + new_options->inplace_update_support = 
ParseBoolean(o.first, o.second); + } else if (o.first == "inplace_update_num_locks") { + new_options->inplace_update_num_locks = ParseInt64(o.second); + } else if (o.first == "memtable_prefix_bloom_bits") { + new_options->memtable_prefix_bloom_bits = stoul(o.second); + } else if (o.first == "memtable_prefix_bloom_probes") { + new_options->memtable_prefix_bloom_probes = stoul(o.second); + } else if (o.first == "memtable_prefix_bloom_huge_page_tlb_size") { + new_options->memtable_prefix_bloom_huge_page_tlb_size = + ParseInt64(o.second); + } else if (o.first == "bloom_locality") { + new_options->bloom_locality = ParseUint32(o.second); + } else if (o.first == "max_successive_merges") { + new_options->max_successive_merges = ParseInt64(o.second); + } else if (o.first == "min_partial_merge_operands") { + new_options->min_partial_merge_operands = ParseUint32(o.second); + } else if (o.first == "create_if_missing") { + new_options->create_if_missing = ParseBoolean(o.first, o.second); + } else if (o.first == "create_missing_column_families") { + new_options->create_missing_column_families = + ParseBoolean(o.first, o.second); + } else if (o.first == "error_if_exists") { + new_options->error_if_exists = ParseBoolean(o.first, o.second); + } else if (o.first == "paranoid_checks") { + new_options->paranoid_checks = ParseBoolean(o.first, o.second); + } else if (o.first == "max_open_files") { + new_options->max_open_files = ParseInt(o.second); + } else if (o.first == "max_total_wal_size") { + new_options->max_total_wal_size = ParseUint64(o.second); + } else if (o.first == "disable_data_sync") { + new_options->disableDataSync = ParseBoolean(o.first, o.second); + } else if (o.first == "use_fsync") { + new_options->use_fsync = ParseBoolean(o.first, o.second); + } else if (o.first == "db_paths") { + // TODO(ljin): add support + throw o.first; + } else if (o.first == "db_log_dir") { + new_options->db_log_dir = o.second; + } else if (o.first == "wal_dir") { + new_options->wal_dir = o.second; + } else if (o.first == "delete_obsolete_files_period_micros") { + new_options->delete_obsolete_files_period_micros = + ParseUint64(o.second); + } else if (o.first == "max_background_compactions") { + new_options->max_background_compactions = ParseInt(o.second); + } else if (o.first == "max_background_flushes") { + new_options->max_background_flushes = ParseInt(o.second); + } else if (o.first == "max_log_file_size") { + new_options->max_log_file_size = ParseInt64(o.second); + } else if (o.first == "log_file_time_to_roll") { + new_options->log_file_time_to_roll = ParseInt64(o.second); + } else if (o.first == "keep_log_file_num") { + new_options->keep_log_file_num = ParseInt64(o.second); + } else if (o.first == "max_manifest_file_size") { + new_options->max_manifest_file_size = ParseUint64(o.second); + } else if (o.first == "table_cache_numshardbits") { + new_options->table_cache_numshardbits = ParseInt(o.second); + } else if (o.first == "table_cache_remove_scan_count_limit") { + new_options->table_cache_remove_scan_count_limit = ParseInt(o.second); + } else if (o.first == "WAL_ttl_seconds") { + new_options->WAL_ttl_seconds = ParseUint64(o.second); + } else if (o.first == "WAL_size_limit_MB") { + new_options->WAL_size_limit_MB = ParseUint64(o.second); + } else if (o.first == "manifest_preallocation_size") { + new_options->manifest_preallocation_size = ParseInt64(o.second); + } else if (o.first == "allow_os_buffer") { + new_options->allow_os_buffer = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_reads") { 
+ new_options->allow_mmap_reads = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_writes") { + new_options->allow_mmap_writes = ParseBoolean(o.first, o.second); + } else if (o.first == "is_fd_close_on_exec") { + new_options->is_fd_close_on_exec = ParseBoolean(o.first, o.second); + } else if (o.first == "skip_log_error_on_recovery") { + new_options->skip_log_error_on_recovery = + ParseBoolean(o.first, o.second); + } else if (o.first == "stats_dump_period_sec") { + new_options->stats_dump_period_sec = ParseUint32(o.second); + } else if (o.first == "advise_random_on_open") { + new_options->advise_random_on_open = ParseBoolean(o.first, o.second); + } else if (o.first == "use_adaptive_mutex") { + new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_thread_local") { + new_options->allow_thread_local = ParseBoolean(o.first, o.second); + } else if (o.first == "bytes_per_sync") { + new_options->bytes_per_sync = ParseUint64(o.second); + } else { + return false; + } + } catch (std::exception) { + return false; + } + } + return true; +} + +} // namespace rocksdb diff --git a/util/options_test.cc b/util/options_test.cc index afe3795f9..c675cb87f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -11,6 +11,7 @@ #define __STDC_FORMAT_MACROS #endif +#include #include #include @@ -75,6 +76,177 @@ TEST(OptionsTest, LooseCondition) { // Both tight amplifications PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } + +TEST(OptionsTest, GetOptionsFromStringsTest) { + std::unordered_map options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + {"compression", "kSnappyCompression"}, + {"compression_per_level", "kNoCompression:" + "kSnappyCompression:" + "kZlibCompression:" + "kBZip2Compression:" + "kLZ4Compression:" + "kLZ4HCCompression"}, + {"compression_opts", "4:5:6"}, + {"num_levels", "7"}, + {"level0_file_num_compaction_trigger", "8"}, + {"level0_slowdown_writes_trigger", "9"}, + {"level0_stop_writes_trigger", "10"}, + {"max_mem_compaction_level", "11"}, + {"target_file_size_base", "12"}, + {"target_file_size_multiplier", "13"}, + {"max_bytes_for_level_base", "14"}, + {"max_bytes_for_level_multiplier", "15"}, + {"max_bytes_for_level_multiplier_additional", "16:17:18"}, + {"expanded_compaction_factor", "19"}, + {"source_compaction_factor", "20"}, + {"max_grandparent_overlap_factor", "21"}, + {"soft_rate_limit", "1.1"}, + {"hard_rate_limit", "2.1"}, + {"arena_block_size", "22"}, + {"disable_auto_compactions", "true"}, + {"purge_redundant_kvs_while_flush", "1"}, + {"compaction_style", "kCompactionStyleLevel"}, + {"verify_checksums_in_compaction", "false"}, + {"compaction_options_fifo", "23"}, + {"filter_deletes", "0"}, + {"max_sequential_skip_in_iterations", "24"}, + {"inplace_update_support", "true"}, + {"inplace_update_num_locks", "25"}, + {"memtable_prefix_bloom_bits", "26"}, + {"memtable_prefix_bloom_probes", "27"}, + {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, + {"bloom_locality", "29"}, + {"max_successive_merges", "30"}, + {"min_partial_merge_operands", "31"}, + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"max_open_files", "32"}, + {"max_total_wal_size", "33"}, + {"disable_data_sync", "false"}, + {"use_fsync", "true"}, + {"db_log_dir", "/db_log_dir"}, + {"wal_dir", "/wal_dir"}, + {"delete_obsolete_files_period_micros", "34"}, + 
{"max_background_compactions", "35"}, + {"max_background_flushes", "36"}, + {"max_log_file_size", "37"}, + {"log_file_time_to_roll", "38"}, + {"keep_log_file_num", "39"}, + {"max_manifest_file_size", "40"}, + {"table_cache_numshardbits", "41"}, + {"table_cache_remove_scan_count_limit", "42"}, + {"WAL_ttl_seconds", "43"}, + {"WAL_size_limit_MB", "44"}, + {"manifest_preallocation_size", "45"}, + {"allow_os_buffer", "false"}, + {"allow_mmap_reads", "true"}, + {"allow_mmap_writes", "false"}, + {"is_fd_close_on_exec", "true"}, + {"skip_log_error_on_recovery", "false"}, + {"stats_dump_period_sec", "46"}, + {"advise_random_on_open", "true"}, + {"use_adaptive_mutex", "false"}, + {"allow_thread_local", "true"}, + {"bytes_per_sync", "47"}, + }; + + Options base_opt; + Options new_opt; + ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); + ASSERT_EQ(new_opt.write_buffer_size, 1); + ASSERT_EQ(new_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_opt.compression, kSnappyCompression); + ASSERT_EQ(new_opt.compression_per_level.size(), 6); + ASSERT_EQ(new_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_opt.compression_opts.level, 5); + ASSERT_EQ(new_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_opt.num_levels, 7); + ASSERT_EQ(new_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_opt.max_mem_compaction_level, 11); + ASSERT_EQ(new_opt.target_file_size_base, 12); + ASSERT_EQ(new_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_opt.max_bytes_for_level_base, 14); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[2], 18); + ASSERT_EQ(new_opt.expanded_compaction_factor, 19); + ASSERT_EQ(new_opt.source_compaction_factor, 20); + ASSERT_EQ(new_opt.max_grandparent_overlap_factor, 21); + ASSERT_EQ(new_opt.soft_rate_limit, 1.1); + ASSERT_EQ(new_opt.hard_rate_limit, 2.1); + ASSERT_EQ(new_opt.arena_block_size, 22); + ASSERT_EQ(new_opt.disable_auto_compactions, true); + ASSERT_EQ(new_opt.purge_redundant_kvs_while_flush, true); + ASSERT_EQ(new_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_opt.verify_checksums_in_compaction, false); + ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, 23); + ASSERT_EQ(new_opt.filter_deletes, false); + ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, 24); + ASSERT_EQ(new_opt.inplace_update_support, true); + ASSERT_EQ(new_opt.inplace_update_num_locks, 25); + ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26); + ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27); + ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28); + ASSERT_EQ(new_opt.bloom_locality, 29); + ASSERT_EQ(new_opt.max_successive_merges, 30); + ASSERT_EQ(new_opt.min_partial_merge_operands, 31); + 
ASSERT_EQ(new_opt.create_if_missing, false); + ASSERT_EQ(new_opt.create_missing_column_families, true); + ASSERT_EQ(new_opt.error_if_exists, false); + ASSERT_EQ(new_opt.paranoid_checks, true); + ASSERT_EQ(new_opt.max_open_files, 32); + ASSERT_EQ(new_opt.max_total_wal_size, 33); + ASSERT_EQ(new_opt.disableDataSync, false); + ASSERT_EQ(new_opt.use_fsync, true); + ASSERT_EQ(new_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, 34); + ASSERT_EQ(new_opt.max_background_compactions, 35); + ASSERT_EQ(new_opt.max_background_flushes, 36); + ASSERT_EQ(new_opt.max_log_file_size, 37); + ASSERT_EQ(new_opt.log_file_time_to_roll, 38); + ASSERT_EQ(new_opt.keep_log_file_num, 39); + ASSERT_EQ(new_opt.max_manifest_file_size, 40); + ASSERT_EQ(new_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_opt.table_cache_remove_scan_count_limit, 42); + ASSERT_EQ(new_opt.WAL_ttl_seconds, 43); + ASSERT_EQ(new_opt.WAL_size_limit_MB, 44); + ASSERT_EQ(new_opt.manifest_preallocation_size, 45); + ASSERT_EQ(new_opt.allow_os_buffer, false); + ASSERT_EQ(new_opt.allow_mmap_reads, true); + ASSERT_EQ(new_opt.allow_mmap_writes, false); + ASSERT_EQ(new_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_opt.stats_dump_period_sec, 46); + ASSERT_EQ(new_opt.advise_random_on_open, true); + ASSERT_EQ(new_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_opt.allow_thread_local, true); + ASSERT_EQ(new_opt.bytes_per_sync, 47); + + options_map["write_buffer_size"] = "hello"; + ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); + options_map["write_buffer_size"] = "1"; + ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); + options_map["unknown_option"] = "1"; + ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); +} + } // namespace rocksdb int main(int argc, char** argv) { From a062e1f2c4ccb0d7208917ce4a57bab9277af31f Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 17 Sep 2014 12:49:13 -0700 Subject: [PATCH 085/829] SetOptions() for memtable related options Summary: as title Test Plan: make all check I will think a way to set up stress test for this Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23055 --- db/column_family.cc | 32 +++++++++++++++---- db/column_family.h | 25 +++++++++++++-- db/db_impl.cc | 31 ++++++++++++++---- db/db_impl.h | 4 +++ db/memtable.cc | 18 ++++++----- db/memtable.h | 5 ++- db/repair.cc | 3 +- db/version_set.cc | 10 ++++-- db/write_batch_test.cc | 2 +- include/rocksdb/db.h | 8 +++++ table/table_test.cc | 12 +++---- util/mutable_cf_options.h | 41 ++++++++++++++++++++++++ util/options_helper.cc | 66 +++++++++++++++++++++++++++------------ util/options_helper.h | 18 +++++++++++ 14 files changed, 219 insertions(+), 56 deletions(-) create mode 100644 util/mutable_cf_options.h create mode 100644 util/options_helper.h diff --git a/db/column_family.cc b/db/column_family.cc index c8ea7accf..ff6b8fe6c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -27,6 +27,7 @@ #include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" +#include "util/options_helper.h" namespace rocksdb { @@ -212,7 +213,7 @@ void SuperVersionUnrefHandle(void* ptr) { ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, - const ColumnFamilyOptions& options, + const 
ColumnFamilyOptions& cf_options, const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set) @@ -222,9 +223,10 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, current_(nullptr), refs_(0), dropped_(false), - internal_comparator_(options.comparator), - options_(*db_options, SanitizeOptions(&internal_comparator_, options)), + internal_comparator_(cf_options.comparator), + options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), ioptions_(options_), + mutable_cf_options_(options_), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -378,13 +380,12 @@ const EnvOptions* ColumnFamilyData::soptions() const { void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } -void ColumnFamilyData::CreateNewMemtable() { +void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) { assert(current_ != nullptr); if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, ioptions_, - MemTableOptions(options_)); + mem_ = new MemTable(internal_comparator_, ioptions_, moptions); mem_->Ref(); } @@ -486,7 +487,15 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { SuperVersion* ColumnFamilyData::InstallSuperVersion( SuperVersion* new_superversion, port::Mutex* db_mutex) { + db_mutex->AssertHeld(); + return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); +} + +SuperVersion* ColumnFamilyData::InstallSuperVersion( + SuperVersion* new_superversion, port::Mutex* db_mutex, + const MutableCFOptions& mutable_cf_options) { new_superversion->db_mutex = db_mutex; + new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; @@ -522,6 +531,17 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +bool ColumnFamilyData::SetOptions( + const std::unordered_map& options_map) { + MutableCFOptions new_mutable_cf_options; + if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + &new_mutable_cf_options)) { + mutable_cf_options_ = new_mutable_cf_options; + return true; + } + return false; +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, diff --git a/db/column_family.h b/db/column_family.h index e7b21036f..f1ef13cf1 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -23,6 +23,7 @@ #include "db/table_cache.h" #include "util/thread_local.h" #include "db/flush_scheduler.h" +#include "util/mutable_cf_options.h" namespace rocksdb { @@ -80,6 +81,7 @@ struct SuperVersion { MemTable* mem; MemTableListVersion* imm; Version* current; + MutableCFOptions mutable_cf_options; std::atomic refs; // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. 
We then @@ -168,11 +170,24 @@ class ColumnFamilyData { void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - // TODO(ljin): make this API thread-safe once we allow updating options_ - const Options* options() const { return &options_; } // thread-safe + const Options* options() const { return &options_; } const EnvOptions* soptions() const; const ImmutableCFOptions* ioptions() const { return &ioptions_; } + // REQUIRES: DB mutex held + // This returns the MutableCFOptions used by current SuperVersion + // You shoul use this API to reference MutableCFOptions most of the time. + const MutableCFOptions* mutable_cf_options() const { + return &(super_version_->mutable_cf_options); + } + // REQUIRES: DB mutex held + // This returns the latest MutableCFOptions, which may be not in effect yet. + const MutableCFOptions* GetLatestMutableCFOptions() const { + return &mutable_cf_options_; + } + // REQUIRES: DB mutex held + bool SetOptions( + const std::unordered_map& options_map); InternalStats* internal_stats() { return internal_stats_.get(); } @@ -182,7 +197,7 @@ class ColumnFamilyData { Version* dummy_versions() { return dummy_versions_; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); - void CreateNewMemtable(); + void CreateNewMemtable(const MemTableOptions& moptions); TableCache* table_cache() const { return table_cache_.get(); } @@ -223,6 +238,9 @@ class ColumnFamilyData { // if its reference count is zero and needs deletion or nullptr if not // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex. + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, + port::Mutex* db_mutex, + const MutableCFOptions& mutable_cf_options); SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, port::Mutex* db_mutex); @@ -255,6 +273,7 @@ class ColumnFamilyData { const Options options_; const ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; std::unique_ptr table_cache_; diff --git a/db/db_impl.cc b/db/db_impl.cc index 0b332b72f..addce91c2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1228,7 +1228,8 @@ Status DBImpl::Recover( if (!s.ok()) { // Clear memtables if recovery failed for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->CreateNewMemtable(); + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); } } } @@ -1356,7 +1357,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // file-systems cause the DB::Open() to fail. 
return status; } - cfd->CreateNewMemtable(); + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); } } } @@ -1393,7 +1395,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // Recovery failed break; } - cfd->CreateNewMemtable(); + cfd->CreateNewMemtable(MemTableOptions( + *cfd->GetLatestMutableCFOptions(), *cfd->options())); } // write MANIFEST with update @@ -1623,6 +1626,7 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } if (s.ok()) { + // Use latest MutableCFOptions InstallSuperVersion(cfd, deletion_state); if (madeProgress) { *madeProgress = 1; @@ -1714,6 +1718,13 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, return s; } +bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { + auto cfh = reinterpret_cast(column_family); + MutexLock l(&mutex_); + return cfh->cfd()->SetOptions(options_map); +} + // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { mutex_.AssertHeld(); @@ -1784,6 +1795,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + // Use latest MutableCFOptions superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); new_superversion = nullptr; @@ -2322,6 +2334,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); + // Use latest MutableCFOptions InstallSuperVersion(c->column_family_data(), deletion_state); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), @@ -2338,6 +2351,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); + // Use latest MutableCFOptions InstallSuperVersion(c->column_family_data(), deletion_state); Version::LevelSummaryStorage tmp; @@ -3322,6 +3336,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { status = InstallCompactionResults(compact, log_buffer); + // Use latest MutableCFOptions InstallSuperVersion(cfd, deletion_state); } Version::LevelSummaryStorage tmp; @@ -3426,6 +3441,7 @@ void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_superversion = (deletion_state.new_superversion != nullptr) ? deletion_state.new_superversion : new SuperVersion(); + // Use latest MutableCFOptions SuperVersion* old_superversion = cfd->InstallSuperVersion(new_superversion, &mutex_); deletion_state.new_superversion = nullptr; @@ -3618,6 +3634,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); + // Use latest MutableCFOptions delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); Log(db_options_.info_log, "Created column family [%s] (ID %u)", @@ -4138,6 +4155,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, uint64_t new_log_number = creating_new_log ? 
versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); mutex_.Unlock(); Status s; { @@ -4156,8 +4174,8 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, if (s.ok()) { new_mem = new MemTable(cfd->internal_comparator(), - *cfd->ioptions(), - MemTableOptions(*cfd->options())); + *cfd->ioptions(), MemTableOptions(mutable_cf_options, + *cfd->options())); new_superversion = new SuperVersion(); } } @@ -4197,7 +4215,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( - cfd->InstallSuperVersion(new_superversion, &mutex_)); + cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options)); return s; } @@ -4672,6 +4690,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + // Use latest MutableCFOptions delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); } impl->alive_log_files_.push_back( diff --git a/db/db_impl.h b/db/db_impl.h index d2b0dfc94..0bc2018b4 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -112,6 +112,10 @@ class DBImpl : public DB { bool reduce_level = false, int target_level = -1, uint32_t target_path_id = 0); + using DB::SetOptions; + bool SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& options_map); + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family); using DB::MaxMemCompactionLevel; diff --git a/db/memtable.cc b/db/memtable.cc index 804404bb8..bdfbc805f 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -31,18 +31,20 @@ namespace rocksdb { -MemTableOptions::MemTableOptions(const Options& options) - : write_buffer_size(options.write_buffer_size), - arena_block_size(options.arena_block_size), - memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), - memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), +MemTableOptions::MemTableOptions( + const MutableCFOptions& mutable_cf_options, const Options& options) + : write_buffer_size(mutable_cf_options.write_buffer_size), + arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes( + mutable_cf_options.memtable_prefix_bloom_probes), memtable_prefix_bloom_huge_page_tlb_size( - options.memtable_prefix_bloom_huge_page_tlb_size), + mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), inplace_update_support(options.inplace_update_support), inplace_update_num_locks(options.inplace_update_num_locks), inplace_callback(options.inplace_callback), - max_successive_merges(options.max_successive_merges), - filter_deletes(options.filter_deletes) {} + max_successive_merges(mutable_cf_options.max_successive_merges), + filter_deletes(mutable_cf_options.filter_deletes) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableCFOptions& ioptions, diff --git a/db/memtable.h b/db/memtable.h index fa6db6fe1..ce6cce7f6 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -21,6 +21,7 @@ #include "rocksdb/immutable_options.h" #include "util/arena.h" #include "util/dynamic_bloom.h" +#include "util/mutable_cf_options.h" namespace rocksdb { @@ -30,7 +31,9 @@ class MemTableIterator; class MergeContext; struct MemTableOptions { - explicit 
MemTableOptions(const Options& options); + explicit MemTableOptions( + const MutableCFOptions& mutable_cf_options, + const Options& options); size_t write_buffer_size; size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; diff --git a/db/repair.cc b/db/repair.cc index bff81991e..2773d4c71 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -219,7 +219,8 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, ioptions_, MemTableOptions(options_)); + MemTable* mem = new MemTable(icmp_, ioptions_, + MemTableOptions(MutableCFOptions(options_), options_)); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; diff --git a/db/version_set.cc b/db/version_set.cc index dbf055ce2..7edfaa788 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2962,17 +2962,21 @@ void VersionSet::GetObsoleteFiles(std::vector* files) { } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& options, VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, VersionEdit* edit) { assert(edit->is_column_family_add_); Version* dummy_versions = new Version(nullptr, this); auto new_cfd = column_family_set_->CreateColumnFamily( - edit->column_family_name_, edit->column_family_, dummy_versions, options); + edit->column_family_name_, edit->column_family_, dummy_versions, + cf_options); Version* v = new Version(new_cfd, this, current_version_number_++); AppendVersion(new_cfd, v); - new_cfd->CreateNewMemtable(); + // GetLatestMutableCFOptions() is safe here without mutex since the + // cfd is not available to client + new_cfd->CreateNewMemtable(MemTableOptions( + *new_cfd->GetLatestMutableCFOptions(), *new_cfd->options())); new_cfd->SetLogNumber(edit->log_number_); return new_cfd; } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 0c69b6af9..d8fa52d40 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -28,7 +28,7 @@ static std::string PrintContents(WriteBatch* b) { Options options; options.memtable_factory = factory; MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options), - MemTableOptions(options)); + MemTableOptions(MutableCFOptions(options), options)); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 47372c42b..0653a8386 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -359,6 +359,14 @@ class DB { return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, target_level, target_path_id); } + virtual bool SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& new_options) { + return true; + } + virtual bool SetOptions( + const std::unordered_map& new_options) { + return SetOptions(DefaultColumnFamily(), new_options); + } // Number of levels used for this DB. 
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; diff --git a/table/table_test.cc b/table/table_test.cc index 118291daa..ed8123352 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -437,9 +437,8 @@ class MemTableConstructor: public Constructor { table_factory_(new SkipListFactory) { Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, - ImmutableCFOptions(options), - MemTableOptions(options)); + memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); memtable_->Ref(); } ~MemTableConstructor() { @@ -453,9 +452,8 @@ class MemTableConstructor: public Constructor { delete memtable_->Unref(); Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, - ImmutableCFOptions(options), - MemTableOptions(options)); + memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), + MemTableOptions(MutableCFOptions(options), options)); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1864,7 +1862,7 @@ TEST(MemTableTest, Simple) { Options options; options.memtable_factory = table_factory; MemTable* memtable = new MemTable(cmp, ImmutableCFOptions(options), - MemTableOptions(options)); + MemTableOptions(MutableCFOptions(options), options)); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h new file mode 100644 index 000000000..39ebe2d85 --- /dev/null +++ b/util/mutable_cf_options.h @@ -0,0 +1,41 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/options.h" + +namespace rocksdb { + +struct MutableCFOptions { + explicit MutableCFOptions(const Options& options) + : write_buffer_size(options.write_buffer_size), + arena_block_size(options.arena_block_size), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + options.memtable_prefix_bloom_huge_page_tlb_size), + max_successive_merges(options.max_successive_merges), + filter_deletes(options.filter_deletes) { + } + MutableCFOptions() + : write_buffer_size(0), + arena_block_size(0), + memtable_prefix_bloom_bits(0), + memtable_prefix_bloom_probes(0), + memtable_prefix_bloom_huge_page_tlb_size(0), + max_successive_merges(0), + filter_deletes(false) {} + + size_t write_buffer_size; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + size_t max_successive_merges; + bool filter_deletes; +}; + +} // namespace rocksdb diff --git a/util/options_helper.cc b/util/options_helper.cc index a4d46ccb0..db066f747 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -1,14 +1,11 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2014, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. #include #include "rocksdb/options.h" +#include "util/options_helper.h" namespace rocksdb { @@ -75,6 +72,49 @@ CompactionStyle ParseCompactionStyle(const std::string& type) { } } // anonymouse namespace +template +bool ParseMemtableOption(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "write_buffer_size") { + new_options->write_buffer_size = ParseInt64(value); + } else if (name == "arena_block_size") { + new_options->arena_block_size = ParseInt64(value); + } else if (name == "memtable_prefix_bloom_bits") { + new_options->memtable_prefix_bloom_bits = stoul(value); + } else if (name == "memtable_prefix_bloom_probes") { + new_options->memtable_prefix_bloom_probes = stoul(value); + } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { + new_options->memtable_prefix_bloom_huge_page_tlb_size = + ParseInt64(value); + } else if (name == "max_successive_merges") { + new_options->max_successive_merges = ParseInt64(value); + } else if (name == "filter_deletes") { + new_options->filter_deletes = ParseBoolean(name, value); + } else { + return false; + } + return true; +} + +bool GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + try { + for (const auto& o : options_map) { + if (ParseMemtableOption(o.first, o.second, new_options)) { + } else { + return false; + } + } + } catch (std::exception) { + return false; + } + return true; +} + bool GetOptionsFromStrings( const Options& base_options, const std::unordered_map& options_map, @@ -83,8 +123,7 @@ bool GetOptionsFromStrings( *new_options = base_options; for (const auto& o : options_map) { try { - if (o.first == "write_buffer_size") { - new_options->write_buffer_size = ParseInt64(o.second); + if (ParseMemtableOption(o.first, o.second, new_options)) { } else if (o.first == "max_write_buffer_number") { new_options->max_write_buffer_number = ParseInt(o.second); } else if (o.first == "min_write_buffer_number_to_merge") { @@ -170,8 +209,6 @@ bool GetOptionsFromStrings( new_options->soft_rate_limit = ParseDouble(o.second); } else if (o.first == "hard_rate_limit") { new_options->hard_rate_limit = ParseDouble(o.second); - } else if (o.first == "arena_block_size") { - new_options->arena_block_size = ParseInt64(o.second); } else if (o.first == "disable_auto_compactions") { new_options->disable_auto_compactions = ParseBoolean(o.first, o.second); } else if (o.first == "purge_redundant_kvs_while_flush") { @@ -188,25 +225,14 @@ bool GetOptionsFromStrings( } else if (o.first == "compaction_options_fifo") { new_options->compaction_options_fifo.max_table_files_size = ParseUint64(o.second); - } else if (o.first == "filter_deletes") { - new_options->filter_deletes = ParseBoolean(o.first, o.second); } else if (o.first == "max_sequential_skip_in_iterations") { new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else if (o.first == "inplace_update_num_locks") { new_options->inplace_update_num_locks = 
ParseInt64(o.second); - } else if (o.first == "memtable_prefix_bloom_bits") { - new_options->memtable_prefix_bloom_bits = stoul(o.second); - } else if (o.first == "memtable_prefix_bloom_probes") { - new_options->memtable_prefix_bloom_probes = stoul(o.second); - } else if (o.first == "memtable_prefix_bloom_huge_page_tlb_size") { - new_options->memtable_prefix_bloom_huge_page_tlb_size = - ParseInt64(o.second); } else if (o.first == "bloom_locality") { new_options->bloom_locality = ParseUint32(o.second); - } else if (o.first == "max_successive_merges") { - new_options->max_successive_merges = ParseInt64(o.second); } else if (o.first == "min_partial_merge_operands") { new_options->min_partial_merge_operands = ParseUint32(o.second); } else if (o.first == "create_if_missing") { diff --git a/util/options_helper.h b/util/options_helper.h new file mode 100644 index 000000000..c04d2a5d7 --- /dev/null +++ b/util/options_helper.h @@ -0,0 +1,18 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include "util/mutable_cf_options.h" + +namespace rocksdb { + +bool GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options); + +} // namespace rocksdb From 5600c8f6e55e677053b87b67b92e6b80aa1cabb4 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 17 Sep 2014 13:25:29 -0700 Subject: [PATCH 086/829] cuckoo table: return estimated size - 1 Summary: This is to avoid cutting file prematurely and resulting file size to be half of specified. Test Plan: db_bench Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23541 --- table/cuckoo_table_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index e107071f2..1cf19e3aa 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -323,7 +323,7 @@ uint64_t CuckooTableBuilder::FileSize() const { expected_hash_table_size *= 2; } return (kvs_[0].first.size() + kvs_[0].second.size()) * - expected_hash_table_size; + expected_hash_table_size - 1; } // This method is invoked when there is no place to insert the target key. From fb6456b00d2f8b22bf538fd14a37632c53289524 Mon Sep 17 00:00:00 2001 From: Torrie Fischer Date: Fri, 15 Aug 2014 15:05:09 -0700 Subject: [PATCH 087/829] Replace naked calls to operator new and delete (Fixes #222) This replaces a mishmash of pointers in the Block and BlockContents classes with std::unique_ptr. It also changes the semantics of BlockContents to be limited to use as a constructor parameter for Block objects, as it owns any block buffers handed to it. 
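For readers skimming the diff, a minimal sketch of the ownership pattern being introduced (simplified, hypothetical names — not the actual RocksDB types or members):

    #include <cstddef>
    #include <memory>
    #include <utility>

    // BlockContents-like holder: owns any heap buffer handed to it.
    struct Contents {
      std::unique_ptr<char[]> allocation;  // owning buffer (may stay null for non-owned data)
      const char* data = nullptr;          // view into the buffer
      size_t size = 0;
    };

    class BlockLike {
     public:
      // Construction moves the contents in; the buffer is released
      // automatically when the block is destroyed, with no naked delete.
      explicit BlockLike(Contents&& contents) : contents_(std::move(contents)) {}
      const char* data() const { return contents_.data; }
      size_t size() const { return contents_.size; }
     private:
      Contents contents_;
    };

    // Usage sketch: the producer allocates once, hands the buffer over,
    // and never frees it by hand.
    Contents MakeContents(size_t n) {
      Contents c;
      c.allocation.reset(new char[n]);
      c.data = c.allocation.get();
      c.size = n;
      return c;
    }

The design choice is that ownership travels with the move-constructed contents object, so callers can no longer leak or double-free the block buffer.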
--- table/block.cc | 11 +- table/block.h | 15 +-- table/block_based_filter_block.cc | 13 ++- table/block_based_filter_block.h | 8 +- table/block_based_table_builder.cc | 14 +-- table/block_based_table_reader.cc | 40 ++----- table/block_test.cc | 6 +- table/filter_block.h | 5 + table/format.cc | 167 +++++++++-------------------- table/format.h | 31 ++++-- table/full_filter_block.cc | 13 ++- table/full_filter_block.h | 8 +- table/meta_blocks.cc | 44 ++++---- table/plain_table_reader.cc | 1 + table/table_test.cc | 3 +- 15 files changed, 153 insertions(+), 226 deletions(-) diff --git a/table/block.cc b/table/block.cc index 0db23a1bd..1a1accb2f 100644 --- a/table/block.cc +++ b/table/block.cc @@ -299,10 +299,7 @@ uint32_t Block::NumRestarts() const { Block::Block(const BlockContents& contents) : data_(contents.data.data()), - size_(contents.data.size()), - owned_(contents.heap_allocated), - cachable_(contents.cachable), - compression_type_(contents.compression_type) { + size_(contents.data.size()) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -315,10 +312,8 @@ Block::Block(const BlockContents& contents) } } -Block::~Block() { - if (owned_) { - delete[] data_; - } +Block::Block(BlockContents&& contents) : Block(contents) { + contents_ = std::move(contents); } Iterator* Block::NewIterator( diff --git a/table/block.h b/table/block.h index 49bcf12cf..21dacc395 100644 --- a/table/block.h +++ b/table/block.h @@ -14,6 +14,10 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "table/block_prefix_index.h" +#include "table/block_hash_index.h" + +#include "format.h" namespace rocksdb { @@ -26,15 +30,16 @@ class BlockPrefixIndex; class Block { public: // Initialize the block with the specified contents. + explicit Block(BlockContents&& contents); explicit Block(const BlockContents& contents); - ~Block(); + ~Block() = default; size_t size() const { return size_; } const char* data() const { return data_; } - bool cachable() const { return cachable_; } + bool cachable() const { return contents_.cachable; } uint32_t NumRestarts() const; - CompressionType compression_type() const { return compression_type_; } + CompressionType compression_type() const { return contents_.compression_type; } // If hash index lookup is enabled and `use_hash_index` is true. This block // will do hash lookup for the key prefix. 
@@ -58,12 +63,10 @@ class Block { size_t ApproximateMemoryUsage() const; private: + BlockContents contents_; const char* data_; size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array - bool owned_; // Block owns data_[] - bool cachable_; - CompressionType compression_type_; std::unique_ptr hash_index_; std::unique_ptr prefix_index_; diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index c2c34c628..bed605a68 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -138,7 +138,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, - const Slice& contents, bool delete_contents_after_use) + const Slice& contents) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), @@ -155,9 +155,14 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( data_ = contents.data(); offset_ = data_ + last_word; num_ = (n - 5 - last_word) / 4; - if (delete_contents_after_use) { - filter_data.reset(contents.data()); - } +} + +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents &&contents) + : BlockBasedFilterBlockReader (prefix_extractor, table_opt, contents.data) { + contents_ = std::move(contents); } bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key, diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h index 9bbc93531..856b88910 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based_filter_block.h @@ -74,8 +74,10 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { // REQUIRES: "contents" and *policy must stay live while *this is live. 
BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, - const Slice& contents, - bool delete_contents_after_use = false); + const Slice& contents); + BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents); virtual bool IsBlockBased() override { return true; } virtual bool KeyMayMatch(const Slice& key, uint64_t block_offset = kNotValid) override; @@ -91,7 +93,7 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { const char* offset_; // Pointer to beginning of offset array (at block-end) size_t num_; // Number of entries in offset array size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - std::unique_ptr filter_data; + BlockContents contents_; bool MayMatch(const Slice& entry, uint64_t block_offset); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 7fb662d88..eb32e9942 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include "db/dbformat.h" @@ -634,18 +635,13 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); - char* ubuf = new char[size + 1]; // make a new copy - memcpy(ubuf, block_contents.data(), size); + std::unique_ptr ubuf(new char[size+1]); + memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; - BlockContents results; - Slice sl(ubuf, size); - results.data = sl; - results.cachable = true; // XXX - results.heap_allocated = true; - results.compression_type = type; + BlockContents results(std::move(ubuf), size, true, type); - Block* block = new Block(results); + Block* block = new Block(std::move(results)); // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index b38f88588..1b41085af 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -66,7 +66,7 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, Status s = ReadBlockContents(file, footer, options, handle, &contents, env, do_uncompress); if (s.ok()) { - *result = new Block(contents); + *result = new Block(std::move(contents)); } return s; @@ -252,9 +252,6 @@ class HashIndexReader : public IndexReader { &prefixes_meta_contents, env, true /* do decompression */); if (!s.ok()) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } // TODO: log error return Status::OK(); } @@ -269,7 +266,7 @@ class HashIndexReader : public IndexReader { // TODO: log error if (s.ok()) { new_index_reader->index_block_->SetBlockHashIndex(hash_index); - new_index_reader->OwnPrefixesContents(prefixes_contents); + new_index_reader->OwnPrefixesContents(std::move(prefixes_contents)); } } else { BlockPrefixIndex* prefix_index = nullptr; @@ -283,18 +280,6 @@ class HashIndexReader : public IndexReader { } } - // Always release prefix meta block - if (prefixes_meta_contents.heap_allocated) { - delete[] prefixes_meta_contents.data.data(); - } - - // Release prefix content block if we don't own it. 
- if (!new_index_reader->own_prefixes_contents_) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } - } - return Status::OK(); } @@ -314,24 +299,18 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, Block* index_block) : IndexReader(comparator), - index_block_(index_block), - own_prefixes_contents_(false) { + index_block_(index_block) { assert(index_block_ != nullptr); } ~HashIndexReader() { - if (own_prefixes_contents_ && prefixes_contents_.heap_allocated) { - delete[] prefixes_contents_.data.data(); - } } - void OwnPrefixesContents(const BlockContents& prefixes_contents) { - prefixes_contents_ = prefixes_contents; - own_prefixes_contents_ = true; + void OwnPrefixesContents(BlockContents&& prefixes_contents) { + prefixes_contents_ = std::move(prefixes_contents); } std::unique_ptr index_block_; - bool own_prefixes_contents_; BlockContents prefixes_contents_; }; @@ -677,7 +656,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Insert uncompressed block into block cache if (s.ok()) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block assert(block->value->compression_type() == kNoCompression); if (block_cache != nullptr && block->value->cachable() && read_options.fill_cache) { @@ -715,7 +694,7 @@ Status BlockBasedTable::PutDataBlockToCache( } if (raw_block->compression_type() != kNoCompression) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block } else { block->value = raw_block; raw_block = nullptr; @@ -768,15 +747,14 @@ FilterBlockReader* BlockBasedTable::ReadFilter( assert(rep->filter_policy); if (kFilterBlockPrefix == filter_block_prefix) { return new BlockBasedFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, block.data, block.heap_allocated); + rep->table_options, std::move(block)); } else if (kFullFilterBlockPrefix == filter_block_prefix) { auto filter_bits_reader = rep->filter_policy-> GetFilterBitsReader(block.data); if (filter_bits_reader != nullptr) { return new FullFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, block.data, filter_bits_reader, - block.heap_allocated); + rep->table_options, std::move(block), filter_bits_reader); } } return nullptr; diff --git a/table/block_test.cc b/table/block_test.cc index b36787f8f..c341617a7 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -92,8 +92,7 @@ TEST(BlockTest, SimpleTest) { BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; - Block reader(contents); + Block reader(std::move(contents)); // read contents of block sequentially int count = 0; @@ -143,12 +142,11 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; return contents; } -void CheckBlockContents(BlockContents contents, const int max_key, +void CheckBlockContents(const BlockContents &contents, const int max_key, const std::vector &keys, const std::vector &values) { const size_t prefix_size = 6; diff --git a/table/filter_block.h b/table/filter_block.h index adbb7c496..197676827 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -18,17 +18,22 @@ #pragma once +#include #include #include #include #include #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include 
"rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "util/hash.h" +#include "format.h" namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; +class FilterPolicy; // A FilterBlockBuilder is used to construct all of the filters for a // particular Table. It generates a single string which is stored as diff --git a/table/format.cc b/table/format.cc index 70cc6eb83..255e1e834 100644 --- a/table/format.cc +++ b/table/format.cc @@ -255,112 +255,53 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, return s; } -// Decompress a block according to params -// May need to malloc a space for cache usage -Status DecompressBlock(BlockContents* result, size_t block_size, - bool do_uncompress, const char* buf, - const Slice& contents, bool use_stack_buf) { - Status s; - size_t n = block_size; - const char* data = contents.data(); - - result->data = Slice(); - result->cachable = false; - result->heap_allocated = false; - - PERF_TIMER_GUARD(block_decompress_time); - rocksdb::CompressionType compression_type = - static_cast(data[n]); - // If the caller has requested that the block not be uncompressed - if (!do_uncompress || compression_type == kNoCompression) { - if (data != buf) { - // File implementation gave us pointer to some other data. - // Use it directly under the assumption that it will be live - // while the file is open. - result->data = Slice(data, n); - result->heap_allocated = false; - result->cachable = false; // Do not double-cache - } else { - if (use_stack_buf) { - // Need to allocate space in heap for cache usage - char* new_buf = new char[n]; - memcpy(new_buf, buf, n); - result->data = Slice(new_buf, n); - } else { - result->data = Slice(buf, n); - } - - result->heap_allocated = true; - result->cachable = true; - } - result->compression_type = compression_type; - s = Status::OK(); +Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents *contents, Env* env, + bool decompression_requested) { + Status status; + Slice slice; + size_t n = static_cast(handle.size()); + std::unique_ptr heap_buf; + char stack_buf[DefaultStackBufferSize]; + char *used_buf = nullptr; + rocksdb::CompressionType compression_type; + + if (decompression_requested && n + kBlockTrailerSize < DefaultStackBufferSize) { + //If we've got a small enough hunk of data, read it in to the + //trivially allocated stack buffer instead of needing a full malloc() + used_buf = &stack_buf[0]; } else { - s = UncompressBlockContents(data, n, result); + heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + used_buf = heap_buf.get(); } - return s; -} -// Read and Decompress block -// Use buf in stack as temp reading buffer -Status ReadAndDecompressFast(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, - const BlockHandle& handle, BlockContents* result, - Env* env, bool do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char buf[DefaultStackBufferSize]; + status = ReadBlock(file, footer, options, handle, &slice, used_buf); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - return s; - } - s = DecompressBlock(result, n, do_uncompress, buf, contents, true); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } - return s; -} -// Read and Decompress block -// Use buf in heap as temp reading buffer -Status ReadAndDecompress(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const 
BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; + PERF_TIMER_GUARD(block_decompress_time); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - s = DecompressBlock(result, n, do_uncompress, buf, contents, false); - if (!s.ok()) { - delete[] buf; - return s; + compression_type = static_cast(slice.data()[n]); + + if (decompression_requested && compression_type != kNoCompression) { + return UncompressBlockContents(slice.data(), n, contents); } - if (result->data.data() != buf) { - delete[] buf; + if (slice.data() != used_buf) { + *contents = BlockContents(Slice(slice.data(), n), false, compression_type); + return status; } - return s; -} -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - size_t n = static_cast(handle.size()); - if (do_uncompress && n + kBlockTrailerSize < DefaultStackBufferSize) { - return ReadAndDecompressFast(file, footer, options, handle, result, env, - do_uncompress); - } else { - return ReadAndDecompress(file, footer, options, handle, result, env, - do_uncompress); + if (used_buf == &stack_buf[0]) { + heap_buf = std::unique_ptr(new char[n]); + memcpy(heap_buf.get(), stack_buf, n); } + + *contents = BlockContents(std::move(heap_buf), n, true, compression_type); + return status; } // @@ -370,8 +311,8 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // buffer is returned via 'result' and it is upto the caller to // free this buffer. Status UncompressBlockContents(const char* data, size_t n, - BlockContents* result) { - char* ubuf = nullptr; + BlockContents* contents) { + std::unique_ptr ubuf; int decompress_size = 0; assert(data[n] != kNoCompression); switch (data[n]) { @@ -382,64 +323,52 @@ Status UncompressBlockContents(const char* data, size_t n, if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf = new char[ulength]; - if (!port::Snappy_Uncompress(data, n, ubuf)) { - delete[] ubuf; + ubuf = std::unique_ptr(new char[ulength]); + if (!port::Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } - result->data = Slice(ubuf, ulength); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); break; } case kZlibCompression: - ubuf = port::Zlib_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr(port::Zlib_Uncompress(data, n, &decompress_size)); static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; if (!ubuf) { return Status::Corruption(zlib_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = port::BZip2_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr(port::BZip2_Uncompress(data, n, &decompress_size)); static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; if (!ubuf) { return Status::Corruption(bzip2_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + 
*contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr(port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; if (!ubuf) { return Status::Corruption(lz4_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); + ubuf = std::unique_ptr(port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; if (!ubuf) { return Status::Corruption(lz4hc_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; default: return Status::Corruption("bad block type"); } - result->compression_type = kNoCompression; // not compressed any more return Status::OK(); } diff --git a/table/format.h b/table/format.h index a971c1a67..9f5d6ce89 100644 --- a/table/format.h +++ b/table/format.h @@ -160,28 +160,39 @@ static const size_t kBlockTrailerSize = 5; struct BlockContents { Slice data; // Actual contents of data bool cachable; // True iff data can be cached - bool heap_allocated; // True iff caller should delete[] data.data() CompressionType compression_type; + std::unique_ptr allocation; + + BlockContents() + : cachable(false), + compression_type(kNoCompression) {} + + BlockContents(const Slice &_data, bool _cachable, CompressionType _compression_type) + : data(_data), + cachable(_cachable), + compression_type(_compression_type) {} + + BlockContents(std::unique_ptr &&_data, size_t _size, bool _cachable, CompressionType _compression_type) + : data(_data.get(), _size), + cachable(_cachable), + compression_type(_compression_type), + allocation(std::move(_data)) {} }; // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents(RandomAccessFile* file, - const Footer& footer, +extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, - const BlockHandle& handle, - BlockContents* result, - Env* env, - bool do_uncompress); + const BlockHandle& handle, BlockContents* contents, + Env* env, bool do_uncompress); // The 'data' points to the raw block contents read in from file. // This method allocates a new heap buffer and the raw block // contents are uncompresed into this buffer. This buffer is // returned via 'result' and it is upto the caller to // free this buffer. -extern Status UncompressBlockContents(const char* data, - size_t n, - BlockContents* result); +extern Status UncompressBlockContents(const char* data, size_t n, + BlockContents* contents); // Implementation details follow. 
Clients should ignore, diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 4ccc2e2b4..b0efef39a 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -56,16 +56,21 @@ FullFilterBlockReader::FullFilterBlockReader( const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Slice& contents, - FilterBitsReader* filter_bits_reader, bool delete_contents_after_use) + FilterBitsReader* filter_bits_reader) : prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), contents_(contents) { assert(filter_bits_reader != nullptr); filter_bits_reader_.reset(filter_bits_reader); +} - if (delete_contents_after_use) { - filter_data_.reset(contents.data()); - } +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents, + FilterBitsReader* filter_bits_reader) + : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, filter_bits_reader) { + block_contents_ = std::move(contents); } bool FullFilterBlockReader::KeyMayMatch(const Slice& key, diff --git a/table/full_filter_block.h b/table/full_filter_block.h index 46ba5d1de..6d6294cf2 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -75,8 +75,11 @@ class FullFilterBlockReader : public FilterBlockReader { explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, const Slice& contents, - FilterBitsReader* filter_bits_reader, - bool delete_contents_after_use = false); + FilterBitsReader* filter_bits_reader); + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents, + FilterBitsReader* filter_bits_reader); // bits_reader is created in filter_policy, it should be passed in here // directly. 
and be deleted here @@ -95,6 +98,7 @@ class FullFilterBlockReader : public FilterBlockReader { std::unique_ptr filter_bits_reader_; Slice contents_; + BlockContents block_contents_; std::unique_ptr filter_data_; bool MayMatch(const Slice& entry); diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index d9d0ed6c9..ebbe0f5a5 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -141,14 +141,15 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, BlockContents block_contents; ReadOptions read_options; read_options.verify_checksums = false; - Status s = ReadBlockContents(file, footer, read_options, handle, - &block_contents, env, false); + Status s; + s = ReadBlockContents(file, footer, read_options, handle, &block_contents, + env, false); if (!s.ok()) { return s; } - Block properties_block(block_contents); + Block properties_block(std::move(block_contents)); std::unique_ptr iter( properties_block.NewIterator(BytewiseComparator())); @@ -228,12 +229,12 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); + s = ReadBlockContents(file, footer, read_options, metaindex_handle, &metaindex_contents, + env, false); if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter( metaindex_block.NewIterator(BytewiseComparator())); @@ -287,7 +288,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); @@ -299,41 +300,36 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { + Status status; Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); - if (!s.ok()) { - return s; - } + status = ReadFooterFromFile(file, file_size, &footer); + if (!status.ok()) return status; // Reading metaindex block auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); - if (!s.ok()) { - return s; - } + status = ReadBlockContents(file, footer, read_options, metaindex_handle, + &metaindex_contents, env, false); + if (!status.ok()) return status; // Finding metablock - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); BlockHandle block_handle; - s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); + status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } // Reading metablock - s = ReadBlockContents(file, footer, read_options, block_handle, contents, env, - false); - - return s; + return ReadBlockContents(file, footer, read_options, block_handle, contents, + env, false); } } // namespace rocksdb diff --git a/table/plain_table_reader.cc 
b/table/plain_table_reader.cc index 23e53bcf7..3a6d48be8 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -20,6 +20,7 @@ #include "table/block.h" #include "table/bloom_block.h" +#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" diff --git a/table/table_test.cc b/table/table_test.cc index 118291daa..0dd8fd7d4 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -265,8 +265,7 @@ class BlockConstructor: public Constructor { BlockContents contents; contents.data = data_; contents.cachable = false; - contents.heap_allocated = false; - block_ = new Block(contents); + block_ = new Block(std::move(contents)); return Status::OK(); } virtual Iterator* NewIterator() const { From 54cada92b1426749aa11a6d55f1628508d6f3454 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 17 Sep 2014 15:08:19 -0700 Subject: [PATCH 088/829] Run make format on PR #249 --- table/block.cc | 3 +-- table/block.h | 4 +++- table/block_based_filter_block.cc | 8 +++---- table/block_based_table_builder.cc | 2 +- table/block_based_table_reader.cc | 10 ++++----- table/format.cc | 35 +++++++++++++++++++----------- table/format.h | 31 +++++++++++++------------- table/full_filter_block.cc | 9 ++++---- table/meta_blocks.cc | 12 ++++++---- 9 files changed, 62 insertions(+), 52 deletions(-) diff --git a/table/block.cc b/table/block.cc index 1a1accb2f..c3066cf5b 100644 --- a/table/block.cc +++ b/table/block.cc @@ -298,8 +298,7 @@ uint32_t Block::NumRestarts() const { } Block::Block(const BlockContents& contents) - : data_(contents.data.data()), - size_(contents.data.size()) { + : data_(contents.data.data()), size_(contents.data.size()) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { diff --git a/table/block.h b/table/block.h index 21dacc395..b86b615bc 100644 --- a/table/block.h +++ b/table/block.h @@ -39,7 +39,9 @@ class Block { const char* data() const { return data_; } bool cachable() const { return contents_.cachable; } uint32_t NumRestarts() const; - CompressionType compression_type() const { return contents_.compression_type; } + CompressionType compression_type() const { + return contents_.compression_type; + } // If hash index lookup is enabled and `use_hash_index` is true. This block // will do hash lookup for the key prefix. 
diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index bed605a68..05d5beb88 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -137,8 +137,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Slice& contents) + const BlockBasedTableOptions& table_opt, const Slice& contents) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), @@ -159,9 +158,8 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - BlockContents &&contents) - : BlockBasedFilterBlockReader (prefix_extractor, table_opt, contents.data) { + const BlockBasedTableOptions& table_opt, BlockContents&& contents) + : BlockBasedFilterBlockReader(prefix_extractor, table_opt, contents.data) { contents_ = std::move(contents); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index eb32e9942..2f373fff1 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -635,7 +635,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); - std::unique_ptr ubuf(new char[size+1]); + std::unique_ptr ubuf(new char[size + 1]); memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1b41085af..2e883632f 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -298,8 +298,7 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, Block* index_block) - : IndexReader(comparator), - index_block_(index_block) { + : IndexReader(comparator), index_block_(index_block) { assert(index_block_ != nullptr); } @@ -746,15 +745,16 @@ FilterBlockReader* BlockBasedTable::ReadFilter( assert(rep->filter_policy); if (kFilterBlockPrefix == filter_block_prefix) { - return new BlockBasedFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, std::move(block)); + return new BlockBasedFilterBlockReader( + rep->ioptions.prefix_extractor, rep->table_options, std::move(block)); } else if (kFullFilterBlockPrefix == filter_block_prefix) { auto filter_bits_reader = rep->filter_policy-> GetFilterBitsReader(block.data); if (filter_bits_reader != nullptr) { return new FullFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, std::move(block), filter_bits_reader); + rep->table_options, std::move(block), + filter_bits_reader); } } return nullptr; diff --git a/table/format.cc b/table/format.cc index 255e1e834..db11f9d4a 100644 --- a/table/format.cc +++ b/table/format.cc @@ -257,19 +257,20 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents *contents, Env* env, + BlockContents* contents, Env* env, bool decompression_requested) { Status status; Slice slice; size_t n = static_cast(handle.size()); std::unique_ptr heap_buf; char stack_buf[DefaultStackBufferSize]; - char *used_buf = nullptr; + char* used_buf = nullptr; 
rocksdb::CompressionType compression_type; - if (decompression_requested && n + kBlockTrailerSize < DefaultStackBufferSize) { - //If we've got a small enough hunk of data, read it in to the - //trivially allocated stack buffer instead of needing a full malloc() + if (decompression_requested && + n + kBlockTrailerSize < DefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() used_buf = &stack_buf[0]; } else { heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); @@ -331,40 +332,48 @@ Status UncompressBlockContents(const char* data, size_t n, break; } case kZlibCompression: - ubuf = std::unique_ptr(port::Zlib_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr( + port::Zlib_Uncompress(data, n, &decompress_size)); static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; if (!ubuf) { return Status::Corruption(zlib_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = std::unique_ptr(port::BZip2_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr( + port::BZip2_Uncompress(data, n, &decompress_size)); static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; if (!ubuf) { return Status::Corruption(bzip2_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = std::unique_ptr(port::LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr( + port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; if (!ubuf) { return Status::Corruption(lz4_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = std::unique_ptr(port::LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr( + port::LZ4_Uncompress(data, n, &decompress_size)); static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; if (!ubuf) { return Status::Corruption(lz4hc_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; default: return Status::Corruption("bad block type"); diff --git a/table/format.h b/table/format.h index 9f5d6ce89..986164d81 100644 --- a/table/format.h +++ b/table/format.h @@ -163,28 +163,27 @@ struct BlockContents { CompressionType compression_type; std::unique_ptr allocation; - BlockContents() - : cachable(false), - compression_type(kNoCompression) {} - - BlockContents(const Slice &_data, bool _cachable, CompressionType _compression_type) - : data(_data), - cachable(_cachable), - compression_type(_compression_type) {} - - BlockContents(std::unique_ptr &&_data, size_t _size, bool _cachable, CompressionType _compression_type) - : data(_data.get(), _size), - cachable(_cachable), - compression_type(_compression_type), - allocation(std::move(_data)) {} + BlockContents() : cachable(false), 
compression_type(kNoCompression) {} + + BlockContents(const Slice& _data, bool _cachable, + CompressionType _compression_type) + : data(_data), cachable(_cachable), compression_type(_compression_type) {} + + BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, + CompressionType _compression_type) + : data(_data.get(), _size), + cachable(_cachable), + compression_type(_compression_type), + allocation(std::move(_data)) {} }; // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, - const BlockHandle& handle, BlockContents* contents, - Env* env, bool do_uncompress); + const BlockHandle& handle, + BlockContents* contents, Env* env, + bool do_uncompress); // The 'data' points to the raw block contents read in from file. // This method allocates a new heap buffer and the raw block diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index b0efef39a..4113ec57a 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -54,8 +54,7 @@ Slice FullFilterBlockBuilder::Finish() { FullFilterBlockReader::FullFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Slice& contents, + const BlockBasedTableOptions& table_opt, const Slice& contents, FilterBitsReader* filter_bits_reader) : prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), @@ -66,10 +65,10 @@ FullFilterBlockReader::FullFilterBlockReader( FullFilterBlockReader::FullFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - BlockContents&& contents, + const BlockBasedTableOptions& table_opt, BlockContents&& contents, FilterBitsReader* filter_bits_reader) - : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, filter_bits_reader) { + : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, + filter_bits_reader) { block_contents_ = std::move(contents); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index ebbe0f5a5..5aabffcb0 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -229,8 +229,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, &metaindex_contents, - env, false); + s = ReadBlockContents(file, footer, read_options, metaindex_handle, + &metaindex_contents, env, false); if (!s.ok()) { return s; } @@ -303,7 +303,9 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, Status status; Footer footer(table_magic_number); status = ReadFooterFromFile(file, file_size, &footer); - if (!status.ok()) return status; + if (!status.ok()) { + return status; + } // Reading metaindex block auto metaindex_handle = footer.metaindex_handle(); @@ -312,7 +314,9 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, read_options.verify_checksums = false; status = ReadBlockContents(file, footer, read_options, metaindex_handle, &metaindex_contents, env, false); - if (!status.ok()) return status; + if (!status.ok()) { + return status; + } // Finding metablock Block metaindex_block(std::move(metaindex_contents)); From 3c232e16470aa60a5e0c5ec5fe30e9073aa1f093 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 17 Sep 2014 15:40:25 -0700 Subject: 
[PATCH 089/829] Fix mac compile --- util/options_test.cc | 53 +++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/util/options_test.cc b/util/options_test.cc index c675cb87f..f640b991f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -158,11 +158,11 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { Options base_opt; Options new_opt; ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); - ASSERT_EQ(new_opt.write_buffer_size, 1); + ASSERT_EQ(new_opt.write_buffer_size, 1U); ASSERT_EQ(new_opt.max_write_buffer_number, 2); ASSERT_EQ(new_opt.min_write_buffer_number_to_merge, 3); ASSERT_EQ(new_opt.compression, kSnappyCompression); - ASSERT_EQ(new_opt.compression_per_level.size(), 6); + ASSERT_EQ(new_opt.compression_per_level.size(), 6U); ASSERT_EQ(new_opt.compression_per_level[0], kNoCompression); ASSERT_EQ(new_opt.compression_per_level[1], kSnappyCompression); ASSERT_EQ(new_opt.compression_per_level[2], kZlibCompression); @@ -179,9 +179,9 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { ASSERT_EQ(new_opt.max_mem_compaction_level, 11); ASSERT_EQ(new_opt.target_file_size_base, 12); ASSERT_EQ(new_opt.target_file_size_multiplier, 13); - ASSERT_EQ(new_opt.max_bytes_for_level_base, 14); + ASSERT_EQ(new_opt.max_bytes_for_level_base, 14U); ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3); + ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3U); ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[0], 16); ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[1], 17); ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[2], 18); @@ -190,54 +190,57 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { ASSERT_EQ(new_opt.max_grandparent_overlap_factor, 21); ASSERT_EQ(new_opt.soft_rate_limit, 1.1); ASSERT_EQ(new_opt.hard_rate_limit, 2.1); - ASSERT_EQ(new_opt.arena_block_size, 22); + ASSERT_EQ(new_opt.arena_block_size, 22U); ASSERT_EQ(new_opt.disable_auto_compactions, true); ASSERT_EQ(new_opt.purge_redundant_kvs_while_flush, true); ASSERT_EQ(new_opt.compaction_style, kCompactionStyleLevel); ASSERT_EQ(new_opt.verify_checksums_in_compaction, false); - ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, 23); + ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, + static_cast(23)); ASSERT_EQ(new_opt.filter_deletes, false); - ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, 24); + ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, + static_cast(24)); ASSERT_EQ(new_opt.inplace_update_support, true); - ASSERT_EQ(new_opt.inplace_update_num_locks, 25); - ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26); - ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27); - ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28); - ASSERT_EQ(new_opt.bloom_locality, 29); - ASSERT_EQ(new_opt.max_successive_merges, 30); - ASSERT_EQ(new_opt.min_partial_merge_operands, 31); + ASSERT_EQ(new_opt.inplace_update_num_locks, 25U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27U); + ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); + ASSERT_EQ(new_opt.bloom_locality, 29U); + ASSERT_EQ(new_opt.max_successive_merges, 30U); + ASSERT_EQ(new_opt.min_partial_merge_operands, 31U); ASSERT_EQ(new_opt.create_if_missing, false); ASSERT_EQ(new_opt.create_missing_column_families, true); ASSERT_EQ(new_opt.error_if_exists, false); 
ASSERT_EQ(new_opt.paranoid_checks, true); ASSERT_EQ(new_opt.max_open_files, 32); - ASSERT_EQ(new_opt.max_total_wal_size, 33); + ASSERT_EQ(new_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_opt.disableDataSync, false); ASSERT_EQ(new_opt.use_fsync, true); ASSERT_EQ(new_opt.db_log_dir, "/db_log_dir"); ASSERT_EQ(new_opt.wal_dir, "/wal_dir"); - ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, 34); + ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, + static_cast(34)); ASSERT_EQ(new_opt.max_background_compactions, 35); ASSERT_EQ(new_opt.max_background_flushes, 36); - ASSERT_EQ(new_opt.max_log_file_size, 37); - ASSERT_EQ(new_opt.log_file_time_to_roll, 38); - ASSERT_EQ(new_opt.keep_log_file_num, 39); - ASSERT_EQ(new_opt.max_manifest_file_size, 40); + ASSERT_EQ(new_opt.max_log_file_size, 37U); + ASSERT_EQ(new_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_opt.max_manifest_file_size, static_cast(40)); ASSERT_EQ(new_opt.table_cache_numshardbits, 41); ASSERT_EQ(new_opt.table_cache_remove_scan_count_limit, 42); - ASSERT_EQ(new_opt.WAL_ttl_seconds, 43); - ASSERT_EQ(new_opt.WAL_size_limit_MB, 44); - ASSERT_EQ(new_opt.manifest_preallocation_size, 45); + ASSERT_EQ(new_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_opt.manifest_preallocation_size, 45U); ASSERT_EQ(new_opt.allow_os_buffer, false); ASSERT_EQ(new_opt.allow_mmap_reads, true); ASSERT_EQ(new_opt.allow_mmap_writes, false); ASSERT_EQ(new_opt.is_fd_close_on_exec, true); ASSERT_EQ(new_opt.skip_log_error_on_recovery, false); - ASSERT_EQ(new_opt.stats_dump_period_sec, 46); + ASSERT_EQ(new_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_opt.advise_random_on_open, true); ASSERT_EQ(new_opt.use_adaptive_mutex, false); ASSERT_EQ(new_opt.allow_thread_local, true); - ASSERT_EQ(new_opt.bytes_per_sync, 47); + ASSERT_EQ(new_opt.bytes_per_sync, static_cast(47)); options_map["write_buffer_size"] = "hello"; ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); From feadb9df533963ffbbb52bf3267dbd43e9b502ee Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 17 Sep 2014 15:34:10 -0700 Subject: [PATCH 090/829] fix cuckoo table builder test Summary: as title Test Plan: ./cuckoo_table_builder_test Reviewers:igor CC:leveldb Task ID: # Blame Rev: --- table/cuckoo_table_builder_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index be13dc9a3..62183dd9c 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -170,7 +170,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -210,7 +210,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -251,7 +251,7 @@ 
TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { ASSERT_OK(builder.status()); } uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -296,7 +296,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -338,7 +338,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { ASSERT_OK(builder.status()); } uint32_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -374,7 +374,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -410,7 +410,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); @@ -448,7 +448,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { ASSERT_OK(builder.status()); } uint32_t bucket_size = user_keys[0].size() + values[0].size(); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); From ff768956146d7f1c38c465568d34bd048f48062d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 17 Sep 2014 16:45:58 -0700 Subject: [PATCH 091/829] Remove some unnecessary constructors Summary: This is continuing the work done by https://github.com/facebook/rocksdb/commit/27b22f13a300c00c72de7fc826fdae21a734eb49 It's just cleaning up some unnecessary constructors. The most important change is removing Block::Block(const BlockContents& contents) constructor. It was only used from the unit test. 
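As the block.cc hunk below shows, the remaining move constructor initializes data_ and size_ from the contents_ member it has just moved into; that is only well-defined because contents_ is declared before data_ and size_ (members are initialized in declaration order, not in initializer-list order). A minimal stand-alone sketch of that pattern, using hypothetical simplified types rather than the real classes:

#include <cstddef>
#include <string>
#include <utility>

struct DemoContents {
  std::string data;  // stands in for the Slice plus its owned allocation
};

class DemoBlock {
 public:
  explicit DemoBlock(DemoContents&& contents)
      : contents_(std::move(contents)),  // runs first because it is declared first
        data_(contents_.data.data()),    // reads the member we just moved into
        size_(contents_.data.size()) {}

  const char* data() const { return data_; }
  size_t size() const { return size_; }

 private:
  DemoContents contents_;  // must stay declared before data_ and size_
  const char* data_;
  size_t size_;
};

int main() {
  DemoContents c;
  c.data = "abc";
  DemoBlock b(std::move(c));
  return (b.size() == 3 && b.data()[0] == 'a') ? 0 : 1;
}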
Test Plan: compiles Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23547 --- table/block.cc | 10 ++++----- table/block.h | 5 ++--- table/block_based_filter_block.cc | 20 +++++++----------- table/block_based_filter_block.h | 3 --- table/block_based_filter_block_test.cc | 28 +++++++++++++------------- table/block_test.cc | 12 ++++++----- 6 files changed, 34 insertions(+), 44 deletions(-) diff --git a/table/block.cc b/table/block.cc index c3066cf5b..592d175b1 100644 --- a/table/block.cc +++ b/table/block.cc @@ -297,8 +297,10 @@ uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const BlockContents& contents) - : data_(contents.data.data()), size_(contents.data.size()) { +Block::Block(BlockContents&& contents) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { @@ -311,10 +313,6 @@ Block::Block(const BlockContents& contents) } } -Block::Block(BlockContents&& contents) : Block(contents) { - contents_ = std::move(contents); -} - Iterator* Block::NewIterator( const Comparator* cmp, BlockIter* iter, bool total_order_seek) { if (size_ < 2*sizeof(uint32_t)) { diff --git a/table/block.h b/table/block.h index b86b615bc..68b16ea1f 100644 --- a/table/block.h +++ b/table/block.h @@ -31,7 +31,6 @@ class Block { public: // Initialize the block with the specified contents. explicit Block(BlockContents&& contents); - explicit Block(const BlockContents& contents); ~Block() = default; @@ -66,8 +65,8 @@ class Block { private: BlockContents contents_; - const char* data_; - size_t size_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() uint32_t restart_offset_; // Offset in data_ of restart array std::unique_ptr hash_index_; std::unique_ptr prefix_index_; diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index 05d5beb88..fea37b67f 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -137,32 +137,26 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, const Slice& contents) + const BlockBasedTableOptions& table_opt, BlockContents&& contents) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), data_(nullptr), offset_(nullptr), num_(0), - base_lg_(0) { + base_lg_(0), + contents_(std::move(contents)) { assert(policy_); - size_t n = contents.size(); + size_t n = contents_.data.size(); if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents[n - 1]; - uint32_t last_word = DecodeFixed32(contents.data() + n - 5); + base_lg_ = contents_.data[n - 1]; + uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); if (last_word > n - 5) return; - data_ = contents.data(); + data_ = contents_.data.data(); offset_ = data_ + last_word; num_ = (n - 5 - last_word) / 4; } -BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, BlockContents&& contents) - : BlockBasedFilterBlockReader(prefix_extractor, table_opt, contents.data) { - contents_ = std::move(contents); -} - bool BlockBasedFilterBlockReader::KeyMayMatch(const 
Slice& key, uint64_t block_offset) { assert(block_offset != kNotValid); diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h index 856b88910..9621425e3 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based_filter_block.h @@ -72,9 +72,6 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { class BlockBasedFilterBlockReader : public FilterBlockReader { public: // REQUIRES: "contents" and *policy must stay live while *this is live. - BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - const Slice& contents); BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt, BlockContents&& contents); diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 4fd8c1cf5..28eea16ce 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -55,9 +55,9 @@ class FilterBlockTest { TEST(FilterBlockTest, EmptyBuilder) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - Slice block = builder.Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + BlockContents block(builder.Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); } @@ -72,8 +72,8 @@ TEST(FilterBlockTest, SingleChunk) { builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - Slice block = builder.Finish(); - BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + BlockContents block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); ASSERT_TRUE(reader.KeyMayMatch("box", 100)); @@ -103,8 +103,8 @@ TEST(FilterBlockTest, MultiChunk) { builder.Add("box"); builder.Add("hello"); - Slice block = builder.Finish(); - BlockBasedFilterBlockReader reader(nullptr, table_options_, block); + BlockContents block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); // Check first filter ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); @@ -147,10 +147,10 @@ class BlockBasedFilterBlockTest { TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( nullptr, table_options_); - Slice block = builder->Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); + BlockContents block(builder->Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, block); + nullptr, table_options_, std::move(block)); ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); @@ -169,9 +169,9 @@ TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { builder->Add("box"); builder->StartBlock(300); builder->Add("hello"); - Slice block = builder->Finish(); + BlockContents block(builder->Finish(), false, kNoCompression); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, block); + nullptr, 
table_options_, std::move(block)); ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); ASSERT_TRUE(reader->KeyMayMatch("box", 100)); @@ -205,9 +205,9 @@ TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - Slice block = builder->Finish(); + BlockContents block(builder->Finish(), false, kNoCompression); FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, block); + nullptr, table_options_, std::move(block)); // Check first filter ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); diff --git a/table/block_test.cc b/table/block_test.cc index c341617a7..6b82c4d93 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -146,13 +146,15 @@ BlockContents GetBlockContents(std::unique_ptr *builder, return contents; } -void CheckBlockContents(const BlockContents &contents, const int max_key, +void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &keys, const std::vector &values) { const size_t prefix_size = 6; // create block reader - Block reader1(contents); - Block reader2(contents); + BlockContents contents_ref(contents.data, contents.cachable, + contents.compression_type); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); @@ -210,7 +212,7 @@ TEST(BlockTest, SimpleIndexHash) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } TEST(BlockTest, IndexHashWithSharedPrefix) { @@ -229,7 +231,7 @@ TEST(BlockTest, IndexHashWithSharedPrefix) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } } // namespace rocksdb From 9ed1b49a24cba1a96287c25fa1203d9565b32bd3 Mon Sep 17 00:00:00 2001 From: Torrie Fischer Date: Thu, 4 Sep 2014 09:14:44 -0700 Subject: [PATCH 092/829] Build unity build on make check --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 260a51d1a..d1e60445a 100644 --- a/Makefile +++ b/Makefile @@ -220,7 +220,7 @@ coverage: # Delete intermediate files find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -check: $(TESTS) ldb +check: $(TESTS) ldb unity for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done python tools/ldb_test.py From 6f0964e37c90bbe81a197f48133bbb2a1b9ca973 Mon Sep 17 00:00:00 2001 From: Torrie Fischer Date: Thu, 18 Sep 2014 09:53:13 -0700 Subject: [PATCH 093/829] Only run make unity on travis instead of make check --- .travis.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index bcb852cf0..8f1bcb0ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make check -j8 +script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 notifications: email: false diff --git a/Makefile b/Makefile index d1e60445a..260a51d1a 100644 --- a/Makefile +++ b/Makefile @@ -220,7 +220,7 @@ coverage: # Delete intermediate files find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -check: $(TESTS) ldb unity +check: $(TESTS) ldb for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done python tools/ldb_test.py From 2fb1fea30fd027bbd824a26b682d04d91a8661dc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 18 Sep 2014 10:42:54 -0700 Subject: [PATCH 094/829] Fix synchronization issues --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index addce91c2..26df8fb2a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1559,6 +1559,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // threads could be concurrently producing compacted files for // that key range. if (base != nullptr && db_options_.max_background_compactions <= 1 && + db_options_.max_background_flushes == 0 && cfd->ioptions()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } @@ -1913,7 +1914,6 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, bg_cv_.Wait(); } else { manual_compaction_ = &manual; - assert(bg_compaction_scheduled_ == 0); bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } From 035043559d300d8fd001c48676964a6b91254328 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 18 Sep 2014 10:57:20 -0700 Subject: [PATCH 095/829] Fixed a signed-unsigned comparison in spatial_db.cc -- issue #293 Summary: Fixed a signed-unsigned comparison in spatial_db.cc utilities/spatialdb/spatial_db.cc:542:38: error: comparison between signed and unsigned integer expressions [-Werror=sign-compare] cc1plus: all warnings being treated as errors make: *** [utilities/spatialdb/spatial_db.o] Error 1 Test Plan: make spatial_db_test ./spatial_db_test Reviewers: ljin, sdong, reddragon, igor Reviewed By: reddragon Subscribers: reddragon, leveldb Differential Revision: https://reviews.facebook.net/D23565 --- utilities/spatialdb/spatial_db.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index cdddbd85b..9c44027c8 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -517,7 +517,7 @@ class SpatialDBImpl : public SpatialDB { return Status::InvalidArgument("Spatial indexes can't be empty"); } - const int kWriteOutEveryBytes = 1024 * 1024; // 1MB + const size_t kWriteOutEveryBytes = 1024 * 1024; // 1MB uint64_t id = next_id_.fetch_add(1); for (const auto& si : spatial_indexes) { From 51af7c326c7f0a4bbde2db182119432f59579ac4 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 18 Sep 2014 11:00:48 -0700 Subject: [PATCH 096/829] CuckooTable: add one option to allow identity function for the first hash function Summary: MurmurHash becomes expensive when we do millions of Get() calls per second in one thread. Add this option to allow the first hash function to use the identity function as the hash function. It results in a QPS increase from 3.7M/s to ~4.3M/s. I did not observe an improvement in end-to-end RocksDB performance. This may be caused by other bottlenecks that I will address in a separate diff.
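To make the optimization concrete, here is a minimal standalone sketch of the idea (illustrative names only, not the actual RocksDB signatures; the real change lands in CuckooHash in table/cuckoo_table_factory.h further down in this diff):

```cpp
#include <cstdint>
#include <cstring>

// Sketch: with identity_as_first_hash enabled, the first cuckoo probe reuses
// the fixed 8-byte key value directly, and only later probes pay for a real
// hash function (MurmurHash in RocksDB). Assumes a power-of-two hash table so
// masking with (size - 1) is a valid modulo.
inline uint64_t Probe(const char* key8, uint32_t hash_cnt,
                      uint64_t table_size_minus_one,
                      bool identity_as_first_hash,
                      uint64_t (*real_hash)(const char* key, uint32_t seed)) {
  if (hash_cnt == 0 && identity_as_first_hash) {
    uint64_t v;
    std::memcpy(&v, key8, sizeof(v));  // treat the key bytes as a uint64_t
    return v & table_size_minus_one;
  }
  return real_hash(key8, hash_cnt) & table_size_minus_one;
}
```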
Test Plan: ``` [ljin@dev1964 rocksdb] ./cuckoo_table_reader_test --enable_perf --file_dir=/dev/shm --write --identity_as_first_hash=0 ==== Test CuckooReaderTest.WhenKeyExists ==== Test CuckooReaderTest.WhenKeyExistsWithUint64Comparator ==== Test CuckooReaderTest.CheckIterator ==== Test CuckooReaderTest.CheckIteratorUint64 ==== Test CuckooReaderTest.WhenKeyNotFound ==== Test CuckooReaderTest.TestReadPerformance With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.272us (3.7 Mqps) with batch size of 0, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.138us (7.2 Mqps) with batch size of 10, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.142us (7.1 Mqps) with batch size of 25, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.142us (7.0 Mqps) with batch size of 50, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.144us (6.9 Mqps) with batch size of 100, # of found keys 125829120 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.201us (5.0 Mqps) with batch size of 0, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.121us (8.3 Mqps) with batch size of 10, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.123us (8.1 Mqps) with batch size of 25, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.121us (8.3 Mqps) with batch size of 50, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.112us (8.9 Mqps) with batch size of 100, # of found keys 104857600 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.251us (4.0 Mqps) with batch size of 0, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.107us (9.4 Mqps) with batch size of 10, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.099us (10.1 Mqps) with batch size of 25, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.100us (10.0 Mqps) with batch size of 50, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.116us (8.6 Mqps) with batch size of 100, # of found keys 83886080 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.189us (5.3 Mqps) with batch size of 0, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.095us (10.5 Mqps) with batch size of 10, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.096us (10.4 Mqps) with batch size of 25, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. 
Time taken per op is 0.098us (10.2 Mqps) with batch size of 50, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.105us (9.5 Mqps) with batch size of 100, # of found keys 73400320 [ljin@dev1964 rocksdb] ./cuckoo_table_reader_test --enable_perf --file_dir=/dev/shm --write --identity_as_first_hash=1 ==== Test CuckooReaderTest.WhenKeyExists ==== Test CuckooReaderTest.WhenKeyExistsWithUint64Comparator ==== Test CuckooReaderTest.CheckIterator ==== Test CuckooReaderTest.CheckIteratorUint64 ==== Test CuckooReaderTest.WhenKeyNotFound ==== Test CuckooReaderTest.TestReadPerformance With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.230us (4.3 Mqps) with batch size of 0, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.086us (11.7 Mqps) with batch size of 10, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.088us (11.3 Mqps) with batch size of 25, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.083us (12.1 Mqps) with batch size of 50, # of found keys 125829120 With 125829120 items, utilization is 93.75%, number of hash functions: 2. Time taken per op is 0.083us (12.1 Mqps) with batch size of 100, # of found keys 125829120 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.159us (6.3 Mqps) with batch size of 0, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.078us (12.8 Mqps) with batch size of 10, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.080us (12.6 Mqps) with batch size of 25, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.080us (12.5 Mqps) with batch size of 50, # of found keys 104857600 With 104857600 items, utilization is 78.12%, number of hash functions: 2. Time taken per op is 0.082us (12.2 Mqps) with batch size of 100, # of found keys 104857600 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.154us (6.5 Mqps) with batch size of 0, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.077us (13.0 Mqps) with batch size of 10, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.077us (12.9 Mqps) with batch size of 25, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.078us (12.8 Mqps) with batch size of 50, # of found keys 83886080 With 83886080 items, utilization is 62.50%, number of hash functions: 2. Time taken per op is 0.079us (12.6 Mqps) with batch size of 100, # of found keys 83886080 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.218us (4.6 Mqps) with batch size of 0, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.083us (12.0 Mqps) with batch size of 10, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. 
Time taken per op is 0.085us (11.7 Mqps) with batch size of 25, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.086us (11.6 Mqps) with batch size of 50, # of found keys 73400320 With 73400320 items, utilization is 54.69%, number of hash functions: 2. Time taken per op is 0.078us (12.8 Mqps) with batch size of 100, # of found keys 73400320 ``` Reviewers: sdong, igor, yhchiang Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23451 --- HISTORY.md | 3 +- db/db_bench.cc | 8 ++++- include/rocksdb/table.h | 41 +++++++++++++++++--------- table/cuckoo_table_builder.cc | 15 ++++++++-- table/cuckoo_table_builder.h | 2 ++ table/cuckoo_table_builder_test.cc | 23 ++++++++------- table/cuckoo_table_factory.cc | 22 +++++++------- table/cuckoo_table_factory.h | 16 +++++----- table/cuckoo_table_reader.cc | 30 +++++++++++++------ table/cuckoo_table_reader.h | 1 + table/cuckoo_table_reader_test.cc | 47 +++++++++++++++++++++--------- table/table_reader_bench.cc | 5 ++-- 12 files changed, 141 insertions(+), 72 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7a05c54e8..b64e12b42 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,11 +2,12 @@ ## Unreleased (will be released with 3.6) ### Disk format changes -* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy +* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy ### Behavior changes * We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. * When disableDataSync=true, we no longer sync the MANIFEST file. +* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. ----- Past Releases ----- diff --git a/db/db_bench.cc b/db/db_bench.cc index eada95b6b..08e61e46b 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -514,6 +514,9 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " "i.e. use the prefix comes with the generated random number."); DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " "threads' IO priority"); +DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo " + "table becomes an identity function. 
This is only valid when key " + "is 8 bytes"); enum RepFactory { kSkipList, @@ -1739,8 +1742,11 @@ class Benchmark { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); } + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; + table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr( - NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio)); + NewCuckooTableFactory(table_options)); } else { BlockBasedTableOptions block_based_options; if (FLAGS_use_hash_search) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2fb4f50dd..2b0255a97 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -251,23 +251,36 @@ struct CuckooTablePropertyNames { // Denotes if the key sorted in the file is Internal Key (if false) // or User Key only (if true). static const std::string kIsLastLevel; + // Indicate if using identity function for the first hash function. + static const std::string kIdentityAsFirstHash; +}; + +struct CuckooTableOptions { + // Determines the utilization of hash tables. Smaller values + // result in larger hash tables with fewer collisions. + double hash_table_ratio = 0.9; + // A property used by builder to determine the depth to go to + // to search for a path to displace elements in case of + // collision. See Builder.MakeSpaceForKey method. Higher + // values result in more efficient hash tables with fewer + // lookups but take more time to build. + uint32_t max_search_depth = 100; + // In case of collision while inserting, the builder + // attempts to insert in the next cuckoo_block_size + // locations before skipping over to the next Cuckoo hash + // function. This makes lookups more cache friendly in case + // of collisions. + uint32_t cuckoo_block_size = 5; + // If this options is enabled, user key is treated as uint64_t and its value + // is used as hash value directly. This option changes builder's behavior. + // Reader ignore this option and behave according to what specified in table + // property. + bool identity_as_first_hash = false; }; // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing -// @hash_table_ratio: Determines the utilization of hash tables. Smaller values -// result in larger hash tables with fewer collisions. -// @max_search_depth: A property used by builder to determine the depth to go to -// to search for a path to displace elements in case of -// collision. See Builder.MakeSpaceForKey method. Higher -// values result in more efficient hash tables with fewer -// lookups but take more time to build. -// @cuckoo_block_size: In case of collision while inserting, the builder -// attempts to insert in the next cuckoo_block_size -// locations before skipping over to the next Cuckoo hash -// function. This makes lookups more cache friendly in case -// of collisions. 
-extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, - uint32_t max_search_depth = 100, uint32_t cuckoo_block_size = 5); +extern TableFactory* NewCuckooTableFactory( + const CuckooTableOptions& table_options = CuckooTableOptions()); #endif // ROCKSDB_LITE diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 1cf19e3aa..51c80d9df 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -35,6 +35,8 @@ const std::string CuckooTablePropertyNames::kIsLastLevel = "rocksdb.cuckoo.file.islastlevel"; const std::string CuckooTablePropertyNames::kCuckooBlockSize = "rocksdb.cuckoo.hash.cuckooblocksize"; +const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = + "rocksdb.cuckoo.hash.identityfirst"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -43,6 +45,7 @@ CuckooTableBuilder::CuckooTableBuilder( WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : num_hash_func_(2), file_(file), @@ -54,6 +57,7 @@ CuckooTableBuilder::CuckooTableBuilder( is_last_level_file_(false), has_seen_first_key_(false), ucomp_(user_comparator), + identity_as_first_hash_(identity_as_first_hash), get_slice_hash_(get_slice_hash), closed_(false) { // Data is in a huge block. @@ -119,7 +123,7 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; ++hash_cnt) { uint64_t hash_val = CuckooHash(user_key, hash_cnt, - hash_table_size_minus_one, get_slice_hash_); + hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); // If there is a collision, check next cuckoo_block_size_ locations for // empty locations. While checking, if we reach end of the hash table, // stop searching and proceed for next hash function. @@ -149,7 +153,7 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { // We don't really need to rehash the entire table because old hashes are // still valid and we only increased the number of hash functions. uint64_t hash_val = CuckooHash(user_key, num_hash_func_, - hash_table_size_minus_one, get_slice_hash_); + hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { @@ -261,6 +265,10 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kCuckooBlockSize].assign( reinterpret_cast(&cuckoo_block_size_), sizeof(cuckoo_block_size_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kIdentityAsFirstHash].assign( + reinterpret_cast(&identity_as_first_hash_), + sizeof(identity_as_first_hash_)); // Write meta blocks. MetaIndexBuilder meta_index_builder; @@ -380,7 +388,8 @@ bool CuckooTableBuilder::MakeSpaceForKey( uint64_t child_bucket_id = CuckooHash( (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first : ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), - hash_cnt, hash_table_size_minus_one, get_slice_hash_); + hash_cnt, hash_table_size_minus_one, identity_as_first_hash_, + get_slice_hash_); // Iterate inside Cuckoo Block. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 2bf206102..45cf49315 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -24,6 +24,7 @@ class CuckooTableBuilder: public TableBuilder { WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_func, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. @@ -87,6 +88,7 @@ class CuckooTableBuilder: public TableBuilder { TableProperties properties_; bool has_seen_first_key_; const Comparator* ucomp_; + bool identity_as_first_hash_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); std::string largest_user_key_ = ""; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 62183dd9c..d25950728 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -133,7 +133,7 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, GetSliceHash); + 4, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); @@ -162,7 +162,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -202,7 +202,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -243,7 +243,8 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -288,7 +289,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, 
BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -330,7 +331,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 2, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -366,7 +367,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -402,7 +403,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -440,7 +441,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -478,7 +479,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -498,7 +499,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 5727a91c0..18db54ed7 100644 --- a/table/cuckoo_table_factory.cc +++ 
b/table/cuckoo_table_factory.cc @@ -30,9 +30,10 @@ TableBuilder* CuckooTableFactory::NewTableBuilder( const InternalKeyComparator& internal_comparator, WritableFile* file, const CompressionType, const CompressionOptions&) const { - return new CuckooTableBuilder(file, hash_table_ratio_, 64, - max_search_depth_, internal_comparator.user_comparator(), - cuckoo_block_size_, nullptr); + return new CuckooTableBuilder(file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.identity_as_first_hash, + nullptr); } std::string CuckooTableFactory::GetPrintableTableOptions() const { @@ -42,21 +43,22 @@ std::string CuckooTableFactory::GetPrintableTableOptions() const { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", - hash_table_ratio_); + table_options_.hash_table_ratio); ret.append(buffer); snprintf(buffer, kBufferSize, " max_search_depth: %u\n", - max_search_depth_); + table_options_.max_search_depth); ret.append(buffer); snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", - cuckoo_block_size_); + table_options_.cuckoo_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); ret.append(buffer); return ret; } -TableFactory* NewCuckooTableFactory(double hash_table_ratio, - uint32_t max_search_depth, uint32_t cuckoo_block_size) { - return new CuckooTableFactory( - hash_table_ratio, max_search_depth, cuckoo_block_size); +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new CuckooTableFactory(table_options); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 2b575dc45..7b2f32ce3 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -16,6 +16,7 @@ namespace rocksdb { const uint32_t kCuckooMurmurSeedMultiplier = 816922183; static inline uint64_t CuckooHash( const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one, + bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { #ifndef NDEBUG // This part is used only in unit tests. @@ -23,6 +24,10 @@ static inline uint64_t CuckooHash( return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1); } #endif + if (hash_cnt == 0 && identity_as_first_hash) { + return (*reinterpret_cast(user_key.data())) & + table_size_minus_one; + } return MurmurHash(user_key.data(), user_key.size(), kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one; } @@ -36,11 +41,8 @@ static inline uint64_t CuckooHash( // - Does not support Merge operations. 
class CuckooTableFactory : public TableFactory { public: - CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth, - uint32_t cuckoo_block_size) - : hash_table_ratio_(hash_table_ratio), - max_search_depth_(max_search_depth), - cuckoo_block_size_(cuckoo_block_size) {} + explicit CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) {} ~CuckooTableFactory() {} const char* Name() const override { return "CuckooTable"; } @@ -63,9 +65,7 @@ class CuckooTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; private: - const double hash_table_ratio_; - const uint32_t max_search_depth_; - const uint32_t cuckoo_block_size_; + const CuckooTableOptions table_options_; }; } // namespace rocksdb diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 1fdbc4475..63b8a2c8c 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -50,13 +50,13 @@ CuckooTableReader::CuckooTableReader( auto& user_props = props->user_collected_properties; auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { - status_ = Status::InvalidArgument("Number of hash functions not found"); + status_ = Status::Corruption("Number of hash functions not found"); return; } num_hash_func_ = *reinterpret_cast(hash_funs->second.data()); auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); if (unused_key == user_props.end()) { - status_ = Status::InvalidArgument("Empty bucket value not found"); + status_ = Status::Corruption("Empty bucket value not found"); return; } unused_key_ = unused_key->second; @@ -64,7 +64,7 @@ CuckooTableReader::CuckooTableReader( key_length_ = props->fixed_key_len; auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); if (value_length == user_props.end()) { - status_ = Status::InvalidArgument("Value length not found"); + status_ = Status::Corruption("Value length not found"); return; } value_length_ = *reinterpret_cast( @@ -74,21 +74,31 @@ CuckooTableReader::CuckooTableReader( auto hash_table_size = user_props.find( CuckooTablePropertyNames::kHashTableSize); if (hash_table_size == user_props.end()) { - status_ = Status::InvalidArgument("Hash table size not found"); + status_ = Status::Corruption("Hash table size not found"); return; } table_size_minus_one_ = *reinterpret_cast( hash_table_size->second.data()) - 1; auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { - status_ = Status::InvalidArgument("Is last level not found"); + status_ = Status::Corruption("Is last level not found"); return; } is_last_level_ = *reinterpret_cast(is_last_level->second.data()); + + auto identity_as_first_hash = user_props.find( + CuckooTablePropertyNames::kIdentityAsFirstHash); + if (identity_as_first_hash == user_props.end()) { + status_ = Status::Corruption("identity as first hash not found"); + return; + } + identity_as_first_hash_ = *reinterpret_cast( + identity_as_first_hash->second.data()); + auto cuckoo_block_size = user_props.find( CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { - status_ = Status::InvalidArgument("Cuckoo block size not found"); + status_ = Status::Corruption("Cuckoo block size not found"); return; } cuckoo_block_size_ = *reinterpret_cast( @@ -106,7 +116,8 @@ Status CuckooTableReader::Get( Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < 
num_hash_func_; ++hash_cnt) { uint64_t offset = bucket_length_ * CuckooHash( - user_key, hash_cnt, table_size_minus_one_, get_slice_hash_); + user_key, hash_cnt, table_size_minus_one_, identity_as_first_hash_, + get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, bucket += bucket_length_) { @@ -117,7 +128,7 @@ Status CuckooTableReader::Get( // Here, we compare only the user key part as we support only one entry // per user key and we don't support sanpshot. if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { - Slice value = Slice(&bucket[key_length_], value_length_); + Slice value(bucket + key_length_, value_length_); if (is_last_level_) { ParsedInternalKey found_ikey( Slice(bucket, key_length_), 0, kTypeValue); @@ -140,7 +151,8 @@ void CuckooTableReader::Prepare(const Slice& key) { // Prefetch the first Cuckoo Block. Slice user_key = ExtractUserKey(key); uint64_t addr = reinterpret_cast(file_data_.data()) + - bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, nullptr); + bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, + identity_as_first_hash_, nullptr); uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { PREFETCH(reinterpret_cast(addr), 0, 3); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 61e048eb6..f9e93abf4 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -64,6 +64,7 @@ class CuckooTableReader: public TableReader { std::unique_ptr file_; Slice file_data_; bool is_last_level_; + bool identity_as_first_hash_; std::shared_ptr table_props_; Status status_; uint32_t num_hash_func_; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 3138fb9ef..3b170b638 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -38,6 +38,7 @@ DEFINE_string(file_dir, "", "Directory where the files will be created" DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); DEFINE_bool(write, false, "Should write new values to file in performance tests?"); +DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); namespace rocksdb { @@ -109,7 +110,8 @@ class CuckooReaderTest { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -375,8 +377,15 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Performance tests namespace { +int64_t found_count = 0; +std::string value; bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { // Deliberately empty. 
+ if (*reinterpret_cast(k.user_key.data()) == + *reinterpret_cast(v.data())) { + ++found_count; + value.assign(v.data(), v.size()); + } return false; } @@ -389,12 +398,14 @@ bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { } void GetKeys(uint64_t num, std::vector* keys) { + keys->clear(); IterKey k; k.SetInternalKey("", 0, kTypeValue); std::string internal_key_suffix = k.GetKey().ToString(); ASSERT_EQ(static_cast(8), internal_key_suffix.size()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { - std::string new_key(reinterpret_cast(&key_idx), sizeof(key_idx)); + uint64_t value = 2 * key_idx; + std::string new_key(reinterpret_cast(&value), sizeof(value)); new_key += internal_key_suffix; keys->push_back(new_key); } @@ -422,7 +433,8 @@ void WriteFile(const std::vector& keys, ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( writable_file.get(), hash_ratio, - 64, 1000, test::Uint64Comparator(), 5, nullptr); + 64, 1000, test::Uint64Comparator(), 5, + FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. @@ -482,27 +494,36 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun); ReadOptions r_options; + std::vector keys; + keys.reserve(num); + for (uint64_t i = 0; i < num; ++i) { + keys.push_back(2 * i); + } + std::random_shuffle(keys.begin(), keys.end()); + + found_count = 0; uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Prepare(Slice(reinterpret_cast(&j), 16)); + reader.Prepare(Slice(reinterpret_cast(&keys[j]), 16)); } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Get(r_options, Slice(reinterpret_cast(&j), 16), - nullptr, DoNothing, nullptr); + reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), + nullptr, DoNothing, nullptr); } } } else { for (uint64_t i = 0; i < num; i++) { - reader.Get(r_options, Slice(reinterpret_cast(&i), 16), nullptr, - DoNothing, nullptr); + reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), + nullptr, DoNothing, nullptr); } } float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; fprintf(stderr, - "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", - time_per_op, 1.0 / time_per_op, batch_size); + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u, " + "# of found keys %ld\n", + time_per_op, 1.0 / time_per_op, batch_size, found_count); } } // namespace. @@ -514,16 +535,16 @@ TEST(CuckooReaderTest, TestReadPerformance) { // These numbers are chosen to have a hash utilizaiton % close to // 0.9, 0.75, 0.6 and 0.5 respectively. // They all create 128 M buckets. - std::vector nums = {120*1000*1000, 100*1000*1000, 80*1000*1000, - 70*1000*1000}; + std::vector nums = {120*1024*1024, 100*1024*1024, 80*1024*1024, + 70*1024*1024}; #ifndef NDEBUG fprintf(stdout, "WARNING: Not compiled with DNDEBUG. 
Performance tests may be slow.\n"); #endif std::vector keys; - GetKeys(*std::max_element(nums.begin(), nums.end()), &keys); for (uint64_t num : nums) { if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) { + GetKeys(num, &keys); WriteFile(keys, num, hash_ratio); } ReadKeys(num, 0); diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 584937587..aa791f4c4 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -260,8 +260,9 @@ int main(int argc, char** argv) { if (FLAGS_table_factory == "cuckoo_hash") { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - - tf.reset(rocksdb::NewCuckooTableFactory(0.75)); + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = 0.75; + tf.reset(rocksdb::NewCuckooTableFactory(table_options)); } else if (FLAGS_table_factory == "plain_table") { options.allow_mmap_reads = true; env_options.use_mmap_reads = true; From 90b8c07b48d63f28d082a410fd6a3d382710897e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 18 Sep 2014 13:32:44 -0700 Subject: [PATCH 097/829] Fix unit tests errors Summary: Those were introduced with https://github.com/facebook/rocksdb/commit/2fb1fea30fd027bbd824a26b682d04d91a8661dc because the flushing behavior changed when max_background_flushes is > 0. Test Plan: make check Reviewers: ljin, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23577 --- db/corruption_test.cc | 4 ++++ db/db_test.cc | 29 +++++++++++++++++++++++------ db/deletefile_test.cc | 1 + tools/reduce_levels_test.cc | 1 + 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 7a1a5221b..09d78f89f 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -332,6 +332,9 @@ TEST(CorruptionTest, CorruptedDescriptor) { } TEST(CorruptionTest, CompactionInputError) { + Options options; + options.max_background_flushes = 0; + Reopen(&options); Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); @@ -351,6 +354,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; + options.max_background_flushes = 0; Reopen(&options); DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/db_test.cc b/db/db_test.cc index 796792b22..7ad249d7f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1325,6 +1325,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TEST(DBTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); + options.max_background_flushes = 0; Reopen(&options); // Create 4 tables for (int table = 0; table < 4; ++table) { @@ -1520,7 +1521,10 @@ TEST(DBTest, GetPicksCorrectFile) { TEST(DBTest, GetEncountersEmptyLevel) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + options.disableDataSync = true; + CreateAndReopenWithCF({"pikachu"}, &options); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -5124,7 +5128,9 @@ TEST(DBTest, Snapshot) { TEST(DBTest, HiddenValuesAreRemoved) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); FillLevels("a", "z", 1); @@ -5215,7 +5221,9 @@ TEST(DBTest, CompactBetweenSnapshots) { } TEST(DBTest, DeletionMarkers1) { - 
CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5250,7 +5258,9 @@ TEST(DBTest, DeletionMarkers1) { } TEST(DBTest, DeletionMarkers2) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5279,7 +5289,9 @@ TEST(DBTest, DeletionMarkers2) { TEST(DBTest, OverlapInLevel0) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; @@ -5457,7 +5469,9 @@ TEST(DBTest, CustomComparator) { } TEST(DBTest, ManualCompaction) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, &options); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; @@ -5495,6 +5509,7 @@ TEST(DBTest, ManualCompaction) { if (iter == 0) { Options options = CurrentOptions(); + options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; DestroyAndReopen(&options); @@ -5594,6 +5609,7 @@ TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Change_NumLevels) { Options opts; opts.create_if_missing = true; + opts.max_background_flushes = 0; DestroyAndReopen(&opts); ASSERT_TRUE(db_ != nullptr); CreateAndReopenWithCF({"pikachu"}, &opts); @@ -5777,6 +5793,7 @@ TEST(DBTest, ManifestWriteError) { options.env = env_; options.create_if_missing = true; options.error_if_exists = false; + options.max_background_flushes = 0; DestroyAndReopen(&options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 14f0324c1..a5af31284 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -34,6 +34,7 @@ class DeleteFileTest { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.max_background_flushes = 0; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; options_.max_bytes_for_level_base = 1024*1024*1000; diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index b41f36d01..b1d58e10e 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -76,6 +76,7 @@ Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, opt.num_levels = num_levels; opt.create_if_missing = create_if_missing; opt.max_mem_compaction_level = mem_table_compact_level; + opt.max_background_flushes = 0; rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); if (!st.ok()) { fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); From adae3ca1fe3f090f3814763942450a0533b66395 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 18 Sep 2014 21:09:44 -0700 Subject: [PATCH 098/829] [Java] Fix JNI link error caused by the removal of options.db_stats_log_interval Summary: Fix JNI link error caused by the removal of options.db_stats_log_interval in https://reviews.facebook.net/D21915. 
Test Plan: make rocksdbjava make jtest Reviewers: ljin, ankgup87 Reviewed By: ankgup87 Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23505 --- java/org/rocksdb/Options.java | 34 -------------------------- java/org/rocksdb/test/OptionsTest.java | 6 ----- 2 files changed, 40 deletions(-) diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 125f06afd..922bdbdb0 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -305,40 +305,6 @@ public class Options extends RocksObject { } private native void setUseFsync(long handle, boolean useFsync); - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * - * @return the time interval in seconds between each two consecutive - * stats logs. - */ - public int dbStatsLogInterval() { - assert(isInitialized()); - return dbStatsLogInterval(nativeHandle_); - } - private native int dbStatsLogInterval(long handle); - - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * Default value is 1800 (half an hour). - * - * @param dbStatsLogInterval the time interval in seconds between each - * two consecutive stats logs. - * @return the reference to the current option. - */ - public Options setDbStatsLogInterval(int dbStatsLogInterval) { - assert(isInitialized()); - setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval); - return this; - } - private native void setDbStatsLogInterval( - long handle, int dbStatsLogInterval); - /** * Returns the directory of info log. 
* diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index b065c9023..d81ca1076 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -52,12 +52,6 @@ public class OptionsTest { assert(opt.useFsync() == boolValue); } - { // DbStatsLogInterval test - int intValue = rand.nextInt(); - opt.setDbStatsLogInterval(intValue); - assert(opt.dbStatsLogInterval() == intValue); - } - { // DbLogDir test String str = "path/to/DbLogDir"; opt.setDbLogDir(str); From b93797abc43c68652691e2de19d797c4264650e1 Mon Sep 17 00:00:00 2001 From: Ankit Gupta Date: Thu, 18 Sep 2014 22:13:52 -0700 Subject: [PATCH 099/829] Fix build --- java/rocksjni/write_batch.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index ff94309fe..0492ea1be 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -206,7 +206,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( options.memtable_factory = factory; rocksdb::MemTable* mem = new rocksdb::MemTable( cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MemTableOptions(options)); + rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options), options)); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); From f44594743f92a33121d45061a0be13bca09acbbd Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Thu, 18 Sep 2014 12:05:59 -0700 Subject: [PATCH 100/829] RocksDB: Format uint64 using PRIu64 in db_impl.cc Summary: Use PRIu64 to format uint64 in a portable manner Test Plan: Run "make all check" Reviewers: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23595 --- db/db_impl.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 26df8fb2a..6038c2ce5 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2356,10 +2356,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, Version::LevelSummaryStorage tmp; LogToBuffer( - log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", + log_buffer, + "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", c->column_family_data()->GetName().c_str(), - static_cast(f->fd.GetNumber()), c->level() + 1, - static_cast(f->fd.GetFileSize()), + f->fd.GetNumber(), c->level() + 1, + f->fd.GetFileSize(), status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; From 3b897cddd78518ea85b2e15f71ba94673e3cab59 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 19 Sep 2014 09:27:16 -0700 Subject: [PATCH 101/829] Enable no-fbcode RocksDB build Summary: I want to use open source build rather than fbcode one. This enables me to run `ROCKSDB_NO_FBCODE=1 make` and run it with my system g++. 
Test Plan: ROCKSDB_NO_FBCODE=1 make make Reviewers: sdong, ljin, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23613 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 3389d2851..8479e3127 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -46,7 +46,7 @@ PLATFORM_CXXFLAGS="-std=c++11" COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines -if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then +if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" if [ -z "$USE_CLANG" ]; then CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ From 976caca09bb0ce6cac15fbf2a3ddd4fba98d1bad Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 19 Sep 2014 10:37:42 -0700 Subject: [PATCH 102/829] Skip AllocateTest if fallocate() is not supported in the file system Summary: To avoid false positive test failures when the file system doesn't support fallocate. In EnvTest.AllocateTest, we first make a simple fallocate call and check the error codes to rule out the possibility that it is not supported. Skip the test if the error code indicates it is not supported. Test Plan: Run the test and make sure it passes on file systems supporting and not supporting fallocate Reviewers: yhchiang, ljin, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23667 --- util/env_test.cc | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/util/env_test.cc b/util/env_test.cc index 1c4d0bba0..3e811a98d 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -17,6 +17,11 @@ #include #endif +#ifdef ROCKSDB_FALLOCATE_PRESENT +#include +#include +#endif + #include "rocksdb/env.h" #include "port/port.h" #include "util/coding.h" @@ -478,6 +483,31 @@ TEST(EnvPosixTest, RandomAccessUniqueID) { #ifdef ROCKSDB_FALLOCATE_PRESENT TEST(EnvPosixTest, AllocateTest) { std::string fname = GetOnDiskTestDir() + "/preallocate_testfile"; + + // Try fallocate in a file to see whether the target file system supports it. + // Skip the test if fallocate is not supported. + std::string fname_test_fallocate = + GetOnDiskTestDir() + "/preallocate_testfile_2"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + ASSERT_GT(fd, 0); + + int alloc_status = fallocate(fd, 0, 0, 1); + + int err_number = 0; + if (alloc_status != 0) { + err_number = errno; + fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + } + close(fd); + ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); + if (alloc_status != 0 && err_number == EOPNOTSUPP) { + // The filesystem containing the file does not support fallocate + return; + } + EnvOptions soptions; soptions.use_mmap_writes = false; unique_ptr wfile; From 32f2532a0b18d7ef2bf6e9c1638519a1ab75dd75 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Fri, 19 Sep 2014 13:09:25 -0700 Subject: [PATCH 103/829] Print compression_size_percent as a signed int Summary: compression_size_percent is an int but was printed as an unsigned int. So the default of -1 is displayed as a big number. 
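To illustrate the symptom with a standalone sketch (not the RocksDB logging code): formatting the signed default of -1 with an unsigned conversion shows its two's-complement bit pattern instead of the intended value.

```cpp
#include <cstdio>

int main() {
  int compression_size_percent = -1;  // the default value
  // What the old "%u" conversion effectively displayed for a 32-bit int:
  std::printf("as %%u: %u\n",
              static_cast<unsigned int>(compression_size_percent));  // 4294967295
  // What the corrected "%d" conversion displays:
  std::printf("as %%d: %d\n", compression_size_percent);             // -1
  return 0;
}
```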
Test Plan: make check Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23679 --- util/options.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options.cc b/util/options.cc index a61d9d633..f0042cbda 100644 --- a/util/options.cc +++ b/util/options.cc @@ -419,7 +419,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { "max_size_amplification_percent: %u", compaction_options_universal.max_size_amplification_percent); Log(log, - "Options.compaction_options_universal.compression_size_percent: %u", + "Options.compaction_options_universal.compression_size_percent: %d", compaction_options_universal.compression_size_percent); Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, compaction_options_fifo.max_table_files_size); From ed9a2df8cc44e10bbee8ff7861aba8fcbc9ec500 Mon Sep 17 00:00:00 2001 From: Torrie Fischer Date: Fri, 19 Sep 2014 15:43:51 -0700 Subject: [PATCH 104/829] fix unity build --- utilities/write_batch_with_index/write_batch_with_index.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 68b3d3970..917460694 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -10,7 +10,6 @@ #include "util/arena.h" namespace rocksdb { -namespace { class ReadableWriteBatch : public WriteBatch { public: explicit ReadableWriteBatch(size_t reserved_bytes = 0) @@ -20,7 +19,6 @@ class ReadableWriteBatch : public WriteBatch { Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Slice* value, Slice* blob) const; }; -} // namespace // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. 
struct WriteBatchIndexEntry { From bfeef94d31293c57690d63ef040e111cea8f0dec Mon Sep 17 00:00:00 2001 From: Ankit Gupta Date: Fri, 19 Sep 2014 16:11:59 -0700 Subject: [PATCH 105/829] Add rate limiter --- java/Makefile | 2 +- java/RocksDBSample.java | 4 +++ .../org/rocksdb/GenericRateLimiterConfig.java | 36 +++++++++++++++++++ java/org/rocksdb/Options.java | 15 ++++++++ java/org/rocksdb/RateLimiterConfig.java | 20 +++++++++++ java/rocksjni/options.cc | 12 +++++++ java/rocksjni/ratelimiterjni.cc | 24 +++++++++++++ 7 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 java/org/rocksdb/GenericRateLimiterConfig.java create mode 100644 java/org/rocksdb/RateLimiterConfig.java create mode 100644 java/rocksjni/ratelimiterjni.cc diff --git a/java/Makefile b/java/Makefile index 47b2afb9e..b2f3674f0 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig NATIVE_INCLUDE = ./include ROCKSDB_JAR = rocksdbjni.jar diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index 9ec3d8345..d78a070df 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -75,6 +75,10 @@ public class RocksDBSample { // Plain-Table requires mmap read options.setAllowMmapReads(true); assert(options.tableFactoryName().equals("PlainTable")); + + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, + 10000, 10)); + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/org/rocksdb/GenericRateLimiterConfig.java new file mode 100644 index 000000000..78b8b37ec --- /dev/null +++ b/java/org/rocksdb/GenericRateLimiterConfig.java @@ -0,0 +1,36 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. 
+ */ +public class GenericRateLimiterConfig extends RateLimiterConfig { + private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); + private static final int DEFAULT_FAIRNESS = 10; + + public GenericRateLimiterConfig(long rateBytesPerSecond, + long refillPeriodMicros, int fairness) { + rateBytesPerSecond_ = rateBytesPerSecond; + refillPeriodMicros_ = refillPeriodMicros; + fairness_ = fairness; + } + + public GenericRateLimiterConfig(long rateBytesPerSecond) { + this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); + } + + @Override protected long newRateLimiterHandle() { + return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_, + fairness_); + } + + private native long newRateLimiterHandle(long rateBytesPerSecond, + long refillPeriodMicros, int fairness); + private final long rateBytesPerSecond_; + private final long refillPeriodMicros_; + private final int fairness_; +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 922bdbdb0..876a06285 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -1104,6 +1104,19 @@ public class Options extends RocksObject { setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } + + /** + * Use to control write rate of flush and compaction. Flush has higher + * priority than compaction. Rate limiting is disabled if nullptr. + * Default: nullptr + * + * @param config rate limiter config. + * @return the instance of the current Options. + */ + public Options setRateLimiterConfig(RateLimiterConfig config) { + setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); + return this; + } /** * Returns the name of the current mem table representation. @@ -2192,6 +2205,8 @@ public class Options extends RocksObject { private native long statisticsPtr(long optHandle); private native void setMemTableFactory(long handle, long factoryHandle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); private native String memTableFactoryName(long handle); private native void setTableFactory(long handle, long factoryHandle); diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java new file mode 100644 index 000000000..22de65921 --- /dev/null +++ b/java/org/rocksdb/RateLimiterConfig.java @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. + */ +public abstract class RateLimiterConfig { + /** + * This function should only be called by Options.setRateLimiter(), + * which will create a c++ shared-pointer to the c++ RateLimiter + * that is associated with the Java RateLimtierConifg. 
+ * + * @see Options.setRateLimiter() + */ + abstract protected long newRateLimiterHandle(); +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index a72eecd28..705e9ff8c 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -21,6 +21,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/rate_limiter.h" /* * Class: org_rocksdb_Options @@ -459,6 +460,17 @@ void Java_org_rocksdb_Options_setMemTableFactory( reinterpret_cast(jfactory_handle)); } +/* + * Class: org_rocksdb_Options + * Method: setRateLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setRateLimiter( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + reinterpret_cast(jhandle)->rate_limiter.reset( + reinterpret_cast(jrate_limiter_handle)); +} + /* * Class: org_rocksdb_Options * Method: tableCacheNumshardbits diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc new file mode 100644 index 000000000..5413978a0 --- /dev/null +++ b/java/rocksjni/ratelimiterjni.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for RateLimiter. + +#include "rocksjni/portal.h" +#include "include/org_rocksdb_GenericRateLimiterConfig.h" +#include "rocksdb/rate_limiter.h" + +/* + * Class: org_rocksdb_GenericRateLimiterConfig + * Method: newRateLimiterHandle + * Signature: (JJI)J + */ +jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle( + JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second, + jlong jrefill_period_micros, jint jfairness) { + return reinterpret_cast(rocksdb::NewGenericRateLimiter( + rocksdb::jlong_to_size_t(jrate_bytes_per_second), + rocksdb::jlong_to_size_t(jrefill_period_micros), + static_cast(jfairness))); +} From 4436f17bd80fd3e953787821a0e46b7d8369e4f3 Mon Sep 17 00:00:00 2001 From: liuchang0812 Date: Sun, 21 Sep 2014 22:09:48 +0800 Subject: [PATCH 106/829] fixed #303: replace %ld with % PRId64 --- table/cuckoo_table_reader_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 3b170b638..6dd5e5525 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -522,7 +522,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; fprintf(stderr, "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u, " - "# of found keys %ld\n", + "# of found keys %" PRId64 "\n", time_per_op, 1.0 / time_per_op, batch_size, found_count); } } // namespace. 
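For illustration only (not taken from the RocksDB tree): a standalone example of the <cinttypes> format macros this fix switches to. "%ld" assumes the 64-bit type maps to long, which is not true on 32-bit or LLP64 platforms, while PRId64/PRIu64 always expand to the correct conversion specifier.

    #include <cinttypes>
    #include <cstdio>

    int main() {
      uint64_t num_keys = 1234567890123ULL;
      int64_t delta = -42;
      // The macros expand to the right specifier for the exact-width types,
      // so the same format string prints correctly on 32-bit and 64-bit builds.
      std::printf("keys=%" PRIu64 " delta=%" PRId64 "\n", num_keys, delta);
      return 0;
    }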
From 6a031b6a81d9b3592d5cba41d18069999f499938 Mon Sep 17 00:00:00 2001 From: liuchang0812 Date: Sun, 21 Sep 2014 22:20:00 +0800 Subject: [PATCH 107/829] remove unused variable --- util/logging.h | 1 - 1 file changed, 1 deletion(-) diff --git a/util/logging.h b/util/logging.h index ce0269726..7ca8ae0a3 100644 --- a/util/logging.h +++ b/util/logging.h @@ -19,7 +19,6 @@ namespace rocksdb { class Slice; -class WritableFile; // Append a human-readable size in bytes int AppendHumanBytes(uint64_t bytes, char* output, int len); From 7e0dcb953f44e1567762a0a77c26cdde0d4b9e9c Mon Sep 17 00:00:00 2001 From: whu_liuchang Date: Mon, 22 Sep 2014 23:24:53 +0800 Subject: [PATCH 108/829] Update logging.cc fix cast style to cpp static_cast --- util/logging.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/logging.cc b/util/logging.cc index 4dfb9a449..1f5b8e2a7 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -45,7 +45,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + snprintf(buf, sizeof(buf), "%llu", static_cast(unsigned long long) num); str->append(buf); } From a7574d4fa17c6110b878bd91c674fb012982b7ec Mon Sep 17 00:00:00 2001 From: whu_liuchang Date: Mon, 22 Sep 2014 23:37:00 +0800 Subject: [PATCH 109/829] Update logging.cc --- util/logging.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/logging.cc b/util/logging.cc index 1f5b8e2a7..c54ea1cdc 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -45,7 +45,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%llu", static_cast(unsigned long long) num); + snprintf(buf, sizeof(buf), "%llu", static_cast(num)); str->append(buf); } From 787cb4db29b995297935819cf7f2f07f6ffd9977 Mon Sep 17 00:00:00 2001 From: liuchang0812 Date: Tue, 23 Sep 2014 01:10:46 +0800 Subject: [PATCH 110/829] remove cast, replace %llu with % PRIu64 --- util/logging.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/logging.cc b/util/logging.cc index c54ea1cdc..98d96b82b 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -45,7 +45,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%llu", static_cast(num)); + snprintf(buf, sizeof(buf), "%" PRIu64, num); str->append(buf); } From 5e6aee4325ae9dabe38905828f5b63944a40c06a Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 22 Sep 2014 10:36:53 -0700 Subject: [PATCH 111/829] dont create backup_input if compaction filter v2 is not used Summary: Compaction creates backup_input iterator even though it only needed when compaction filter v2 is enabled Test Plan: make all check Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23769 --- db/db_impl.cc | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 6038c2ce5..260939810 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3117,9 +3117,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, const uint64_t start_micros = env_->NowMicros(); unique_ptr input(versions_->MakeInputIterator(compact->compaction)); input->SeekToFirst(); - shared_ptr backup_input( - 
versions_->MakeInputIterator(compact->compaction)); - backup_input->SeekToFirst(); Status status; ParsedInternalKey ikey; @@ -3132,14 +3129,30 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); - // temp_backup_input always point to the start of the current buffer - // temp_backup_input = backup_input; - // iterate through input, - // 1) buffer ineligible keys and value keys into 2 separate buffers; - // 2) send value_buffer to compaction filter and alternate the values; - // 3) merge value_buffer with ineligible_value_buffer; - // 4) run the modified "compaction" using the old for loop. - if (compaction_filter_v2) { + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction( + is_snapshot_supported, + visible_at_tip, + earliest_snapshot, + latest_snapshot, + deletion_state, + bottommost_level, + imm_micros, + input.get(), + compact, + false, + log_buffer); + } else { + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. + shared_ptr backup_input( + versions_->MakeInputIterator(compact->compaction)); + backup_input->SeekToFirst(); while (backup_input->Valid() && !shutting_down_.Acquire_Load() && !cfd->IsDropped()) { // FLUSH preempts compaction @@ -3267,21 +3280,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, log_buffer); } // checking for compaction filter v2 - if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - false, - log_buffer); - } - if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { status = Status::ShutdownInProgress( "Database shutdown or Column family drop during compaction"); From 57a32f147f08b9ddccaf40c53f082e695355915a Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 22 Sep 2014 11:15:03 -0700 Subject: [PATCH 112/829] change target_file_size_base to uint64_t Summary: It contrains the file size to be 4G max with int Test Plan: tried to grep instance and made sure other related variables are also uint64 Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23697 --- HISTORY.md | 1 + db/db_bench.cc | 2 +- include/rocksdb/options.h | 2 +- util/options.cc | 30 +++++++++++++++--------------- util/options_helper.cc | 2 +- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b64e12b42..a8b89f54f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ * We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. * When disableDataSync=true, we no longer sync the MANIFEST file. * Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. +* Change target_file_size_base type to uint64_t from int. 
----- Past Releases ----- diff --git a/db/db_bench.cc b/db/db_bench.cc index 08e61e46b..d90c628a9 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -307,7 +307,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_int32(num_levels, 7, "The total number of levels"); -DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); +DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_multiplier, 1, "A multiplier to compute target level-N file size (N >= 2)"); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 2c9734d24..84a0422c1 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -287,7 +287,7 @@ struct ColumnFamilyOptions { // and each file on level-3 will be 200MB. // by default target_file_size_base is 2MB. - int target_file_size_base; + uint64_t target_file_size_base; // by default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. int target_file_size_multiplier; diff --git a/util/options.cc b/util/options.cc index f0042cbda..32612d6a7 100644 --- a/util/options.cc +++ b/util/options.cc @@ -273,8 +273,8 @@ void DBOptions::Dump(Logger* log) const { Log(log, " Options.disableDataSync: %d", disableDataSync); Log(log, " Options.use_fsync: %d", use_fsync); Log(log, " Options.max_log_file_size: %zu", max_log_file_size); - Log(log, "Options.max_manifest_file_size: %lu", - (unsigned long)max_manifest_file_size); + Log(log, "Options.max_manifest_file_size: %" PRIu64, + max_manifest_file_size); Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); @@ -290,16 +290,16 @@ void DBOptions::Dump(Logger* log) const { table_cache_numshardbits); Log(log, " Options.table_cache_remove_scan_count_limit: %d", table_cache_remove_scan_count_limit); - Log(log, " Options.delete_obsolete_files_period_micros: %lu", - (unsigned long)delete_obsolete_files_period_micros); + Log(log, " Options.delete_obsolete_files_period_micros: %" PRIu64, + delete_obsolete_files_period_micros); Log(log, " Options.max_background_compactions: %d", max_background_compactions); Log(log, " Options.max_background_flushes: %d", max_background_flushes); - Log(log, " Options.WAL_ttl_seconds: %lu", - (unsigned long)WAL_ttl_seconds); - Log(log, " Options.WAL_size_limit_MB: %lu", - (unsigned long)WAL_size_limit_MB); + Log(log, " Options.WAL_ttl_seconds: %" PRIu64, + WAL_ttl_seconds); + Log(log, " Options.WAL_size_limit_MB: %" PRIu64, + WAL_size_limit_MB); Log(log, " Options.manifest_preallocation_size: %zu", manifest_preallocation_size); Log(log, " Options.allow_os_buffer: %d", @@ -322,8 +322,8 @@ void DBOptions::Dump(Logger* log) const { use_adaptive_mutex); Log(log, " Options.rate_limiter: %p", rate_limiter.get()); - Log(log, " Options.bytes_per_sync: %lu", - (unsigned long)bytes_per_sync); + Log(log, " Options.bytes_per_sync: %" PRIu64, + bytes_per_sync); } // DBOptions::Dump void ColumnFamilyOptions::Dump(Logger* log) const { @@ -371,20 +371,20 @@ void ColumnFamilyOptions::Dump(Logger* log) const { level0_stop_writes_trigger); Log(log," Options.max_mem_compaction_level: %d", max_mem_compaction_level); - Log(log," Options.target_file_size_base: %d", + Log(log," Options.target_file_size_base: %" PRIu64, target_file_size_base); Log(log," Options.target_file_size_multiplier: %d", 
target_file_size_multiplier); - Log(log," Options.max_bytes_for_level_base: %lu", - (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); Log(log," Options.max_bytes_for_level_multiplier: %d", max_bytes_for_level_multiplier); for (int i = 0; i < num_levels; i++) { Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", i, max_bytes_for_level_multiplier_additional[i]); } - Log(log," Options.max_sequential_skip_in_iterations: %lu", - (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); Log(log," Options.expanded_compaction_factor: %d", expanded_compaction_factor); Log(log," Options.source_compaction_factor: %d", diff --git a/util/options_helper.cc b/util/options_helper.cc index db066f747..d552a2b9e 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -177,7 +177,7 @@ bool GetOptionsFromStrings( } else if (o.first == "max_mem_compaction_level") { new_options->max_mem_compaction_level = ParseInt(o.second); } else if (o.first == "target_file_size_base") { - new_options->target_file_size_base = ParseInt(o.second); + new_options->target_file_size_base = ParseUint64(o.second); } else if (o.first == "target_file_size_multiplier") { new_options->target_file_size_multiplier = ParseInt(o.second); } else if (o.first == "max_bytes_for_level_base") { From d0de413f4dc94e539401608e2007540a5ea01098 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 22 Sep 2014 11:37:35 -0700 Subject: [PATCH 113/829] WriteBatchWithIndex to allow different Comparators for different column families Summary: Previously, one single column family is given to WriteBatchWithIndex to index keys for all column families. An extra map from column family ID to comparator is maintained which can override the default comparator given in the constructor. A WriteBatchWithIndex::SetComparatorForCF() is added for user to add comparators per column family. Also move more codes into anonymous namespace. 
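A rough usage sketch (not part of the patch), distilled from the unit test added below; it assumes reverse_cf is a ColumnFamilyHandle* whose column family was opened with a comparator that orders keys in reverse byte order.

    #include <memory>
    #include "rocksdb/comparator.h"
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/write_batch_with_index.h"

    void SketchPerCfIndexing(rocksdb::ColumnFamilyHandle* reverse_cf) {
      // The comparator passed here is only the backup comparator; entries are
      // indexed per column family using that family's own comparator.
      rocksdb::WriteBatchWithIndex batch(rocksdb::BytewiseComparator(), 20);
      batch.Put(reverse_cf, "a11", "");
      batch.Put(reverse_cf, "a33", "");
      batch.Put(reverse_cf, "a22", "");

      // Under the reverse comparator the indexed order is a33, a22, a11, and
      // "z" sorts before all of them, so seeking to "z" visits them in order.
      std::unique_ptr<rocksdb::WBWIIterator> iter(batch.NewIterator(reverse_cf));
      for (iter->Seek("z"); iter->Valid(); iter->Next()) {
        // iter->Entry().key and iter->Entry().value expose each indexed entry.
      }
    }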
Test Plan: Add a unit test Reviewers: ljin, igor Reviewed By: igor Subscribers: dhruba, leveldb, yhchiang Differential Revision: https://reviews.facebook.net/D23355 --- db/column_family.cc | 13 ++ db/column_family.h | 4 + db/write_batch_test.cc | 5 +- .../utilities/write_batch_with_index.h | 15 ++- .../write_batch_with_index.cc | 79 +++++++----- .../write_batch_with_index_test.cc | 114 +++++++++++++++++- 6 files changed, 189 insertions(+), 41 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index ff6b8fe6c..8b4e007ed 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -86,6 +86,10 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } +const Comparator* ColumnFamilyHandleImpl::user_comparator() const { + return cfd()->user_comparator(); +} + ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -726,4 +730,13 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { return column_family_id; } +const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family) { + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + return cfh->user_comparator(); + } + return nullptr; +} + } // namespace rocksdb diff --git a/db/column_family.h b/db/column_family.h index f1ef13cf1..65b4b53ba 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -49,6 +49,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } + virtual const Comparator* user_comparator() const; virtual uint32_t GetID() const; @@ -448,4 +449,7 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); +extern const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family); + } // namespace rocksdb diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index d8fa52d40..ba7451078 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -289,6 +289,9 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { + return BytewiseComparator(); + } private: uint32_t id_; @@ -320,7 +323,7 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) { } TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { - WriteBatchWithIndex batch(BytewiseComparator(), 20); + WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); batch.Put(&zero, Slice("foo"), Slice("bar")); batch.Put(&two, Slice("twofoo"), Slice("bar2")); diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index c09f53d11..85c80850f 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -11,8 +11,9 @@ #pragma once -#include "rocksdb/status.h" +#include "rocksdb/comparator.h" #include "rocksdb/slice.h" +#include "rocksdb/status.h" #include "rocksdb/write_batch.h" namespace rocksdb { @@ -56,12 +57,14 @@ class WBWIIterator { // A user can call NewIterator() to create an iterator. 
class WriteBatchWithIndex { public: - // index_comparator indicates the order when iterating data in the write - // batch. Technically, it doesn't have to be the same as the one used in - // the DB. + // backup_index_comparator: the backup comparator used to compare keys + // within the same column family, if column family is not given in the + // interface, or we can't find a column family from the column family handle + // passed in, backup_index_comparator will be used for the column family. // reserved_bytes: reserved bytes in underlying WriteBatch - explicit WriteBatchWithIndex(const Comparator* index_comparator, - size_t reserved_bytes = 0); + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0); virtual ~WriteBatchWithIndex(); WriteBatch* GetWriteBatch(); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 68b3d3970..2caa2e4cc 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -20,7 +20,6 @@ class ReadableWriteBatch : public WriteBatch { Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Slice* value, Slice* blob) const; }; -} // namespace // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. struct WriteBatchIndexEntry { @@ -38,44 +37,28 @@ struct WriteBatchIndexEntry { class WriteBatchEntryComparator { public: - WriteBatchEntryComparator(const Comparator* comparator, + WriteBatchEntryComparator(const Comparator* default_comparator, const ReadableWriteBatch* write_batch) - : comparator_(comparator), write_batch_(write_batch) {} + : default_comparator_(default_comparator), write_batch_(write_batch) {} // Compare a and b. 
Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b int operator()(const WriteBatchIndexEntry* entry1, const WriteBatchIndexEntry* entry2) const; + void SetComparatorForCF(uint32_t column_family_id, + const Comparator* comparator) { + cf_comparator_map_[column_family_id] = comparator; + } + private: - const Comparator* comparator_; + const Comparator* default_comparator_; + std::unordered_map cf_comparator_map_; const ReadableWriteBatch* write_batch_; }; typedef SkipList WriteBatchEntrySkipList; -struct WriteBatchWithIndex::Rep { - Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) - : write_batch(reserved_bytes), - comparator(index_comparator, &write_batch), - skip_list(comparator, &arena) {} - ReadableWriteBatch write_batch; - WriteBatchEntryComparator comparator; - Arena arena; - WriteBatchEntrySkipList skip_list; - - WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { - return GetEntryWithCfId(GetColumnFamilyID(column_family)); - } - - WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { - auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); - auto* index_entry = new (mem) - WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); - return index_entry; - } -}; - class WBWIIteratorImpl : public WBWIIterator { public: WBWIIteratorImpl(uint32_t column_family_id, @@ -138,6 +121,35 @@ class WBWIIteratorImpl : public WBWIIterator { } } }; +} // namespace + +struct WriteBatchWithIndex::Rep { + Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) + : write_batch(reserved_bytes), + comparator(index_comparator, &write_batch), + skip_list(comparator, &arena) {} + ReadableWriteBatch write_batch; + WriteBatchEntryComparator comparator; + Arena arena; + WriteBatchEntrySkipList skip_list; + + WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { + uint32_t cf_id = GetColumnFamilyID(column_family); + const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); + if (cf_cmp != nullptr) { + comparator.SetComparatorForCF(cf_id, cf_cmp); + } + + return GetEntryWithCfId(cf_id); + } + + WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = new (mem) + WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); + return index_entry; + } +}; Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, @@ -179,9 +191,9 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, return Status::OK(); } -WriteBatchWithIndex::WriteBatchWithIndex(const Comparator* index_comparator, - size_t reserved_bytes) - : rep(new Rep(index_comparator, reserved_bytes)) {} +WriteBatchWithIndex::WriteBatchWithIndex( + const Comparator* default_index_comparator, size_t reserved_bytes) + : rep(new Rep(default_index_comparator, reserved_bytes)) {} WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; } @@ -287,7 +299,14 @@ int WriteBatchEntryComparator::operator()( key2 = *(entry2->search_key); } - int cmp = comparator_->Compare(key1, key2); + int cmp; + auto comparator_for_cf = cf_comparator_map_.find(entry1->column_family); + if (comparator_for_cf != cf_comparator_map_.end()) { + cmp = comparator_for_cf->second->Compare(key1, key2); + } else { + cmp = default_comparator_->Compare(key1, key2); + } + if (cmp != 0) { return cmp; } else if (entry1->offset > entry2->offset) { diff --git 
a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index fdceed4c4..ad8c110c1 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -19,12 +19,16 @@ namespace rocksdb { namespace { class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { public: - explicit ColumnFamilyHandleImplDummy(int id) - : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + comparator_(comparator) {} uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { return comparator_; } private: uint32_t id_; + const Comparator* comparator_; }; struct Entry { @@ -90,8 +94,9 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { index_map[e.value].push_back(&e); } - WriteBatchWithIndex batch(BytewiseComparator(), 20); - ColumnFamilyHandleImplDummy data(6), index(8); + WriteBatchWithIndex batch(nullptr, 20); + ColumnFamilyHandleImplDummy data(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy index(8, BytewiseComparator()); for (auto& e : entries) { if (e.type == kPutRecord) { batch.Put(&data, e.key, e.value); @@ -230,6 +235,107 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { } } +class ReverseComparator : public Comparator { + public: + ReverseComparator() {} + + virtual const char* Name() const override { + return "rocksdb.ReverseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const override { + return 0 - BytewiseComparator()->Compare(a, b); + } + + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + virtual void FindShortSuccessor(std::string* key) const {} +}; + +TEST(WriteBatchWithIndexTest, TestComparatorForCF) { + ReverseComparator reverse_cmp; + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, &reverse_cmp); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20); + + batch.Put(&cf1, "ddd", ""); + batch.Put(&cf2, "aaa", ""); + batch.Put(&cf2, "eee", ""); + batch.Put(&cf1, "ccc", ""); + batch.Put(&reverse_cf, "a11", ""); + batch.Put(&cf1, "bbb", ""); + batch.Put(&reverse_cf, "a33", ""); + batch.Put(&reverse_cf, "a22", ""); + + { + std::unique_ptr iter(batch.NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbb", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&cf2)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + 
iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("a22"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + + iter->Seek("a13"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + } +} + } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From 53b00399548eb8eb1cab396e17890aca6c5f497e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 22 Sep 2014 15:00:03 -0700 Subject: [PATCH 114/829] Fix release compile --- util/options_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options_test.cc b/util/options_test.cc index f640b991f..eee285e2a 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -177,7 +177,7 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { ASSERT_EQ(new_opt.level0_slowdown_writes_trigger, 9); ASSERT_EQ(new_opt.level0_stop_writes_trigger, 10); ASSERT_EQ(new_opt.max_mem_compaction_level, 11); - ASSERT_EQ(new_opt.target_file_size_base, 12); + ASSERT_EQ(new_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_opt.max_bytes_for_level_base, 14U); ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); From 3d74f09979a2eadc1711a13ca4e221b53a6c44b3 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 22 Sep 2014 15:19:20 -0700 Subject: [PATCH 115/829] Fix compile --- table/full_filter_block_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 12e783b4a..7bf61f238 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -30,7 +30,8 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { for (size_t i = 0; i < hash_entries_.size(); i++) { EncodeFixed32(data + i * 4, hash_entries_[i]); } - buf->reset(data); + const char* const_data = data; + buf->reset(const_data); return Slice(data, len); } From 55af370756af6f11edd79e431a4f9cc0a04e784b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 23 Sep 2014 13:02:23 -0700 Subject: [PATCH 116/829] Remove TODO for checking index checksums --- table/block_based_table_reader.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 2e883632f..eb3de7a3b 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -498,7 +498,6 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // pre-load these blocks, which will kept in member variables in Rep // and with a same life-time as this table object. IndexReader* index_reader = nullptr; - // TODO: we never really verify check sum for index block s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); if (s.ok()) { From 0a29ce53938a7c5db3484c196fb98e5e61a952df Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 23 Sep 2014 14:18:57 -0700 Subject: [PATCH 117/829] re-enable BlockBasedTable::SetupForCompaction() Summary: It was commented out in D22545 by accident. 
Keep the option in ImmutableOptions for now. I can make it dynamic in https://reviews.facebook.net/D23349 Test Plan: make release Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23865 --- include/rocksdb/immutable_options.h | 2 ++ include/rocksdb/options.h | 14 ++++++++------ table/block_based_table_reader.cc | 4 +--- util/options.cc | 3 ++- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index de4480cff..54b676626 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -77,6 +77,8 @@ struct ImmutableCFOptions { std::vector compression_per_level; CompressionOptions compression_opts; + + Options::AccessHint access_hint_on_compaction_start; }; } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 84a0422c1..a60f94268 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -58,6 +58,7 @@ enum CompactionStyle : char { kCompactionStyleFIFO = 0x2, // FIFO compaction style }; + struct CompactionOptionsFIFO { // once the total sum of table files reaches this, we will delete the oldest // table file @@ -783,12 +784,13 @@ struct DBOptions { // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL - enum { - NONE, - NORMAL, - SEQUENTIAL, - WILLNEED - } access_hint_on_compaction_start; + enum AccessHint { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + }; + AccessHint access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index eb3de7a3b..09328dc3b 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -532,8 +532,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } void BlockBasedTable::SetupForCompaction() { - /* - switch (.access_hint_on_compaction_start) { + switch (rep_->ioptions.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -549,7 +548,6 @@ void BlockBasedTable::SetupForCompaction() { assert(false); } compaction_optimized_ = true; - */ } std::shared_ptr BlockBasedTable::GetTableProperties() diff --git a/util/options.cc b/util/options.cc index 32612d6a7..28120659b 100644 --- a/util/options.cc +++ b/util/options.cc @@ -59,7 +59,8 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) use_fsync(options.use_fsync), compression(options.compression), compression_per_level(options.compression_per_level), - compression_opts(options.compression_opts) {} + compression_opts(options.compression_opts), + access_hint_on_compaction_start(options.access_hint_on_compaction_start) {} ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), From cf7ace886573f387194d158e09497c05a30bd24a Mon Sep 17 00:00:00 2001 From: Naveen Date: Wed, 10 Sep 2014 12:12:31 -0700 Subject: [PATCH 118/829] Addressing review comments --- java/org/rocksdb/NativeLibraryLoader.java | 19 ++++++++++++------- java/org/rocksdb/RocksDB.java | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index f6b8520f5..880e90acc 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ 
b/java/org/rocksdb/NativeLibraryLoader.java @@ -2,16 +2,16 @@ package org.rocksdb; import java.io.*; + +/** + * This class is used to load the RocksDB shared library from within the jar. + * The shared library is extracted to a temp folder and loaded from there. + */ public class NativeLibraryLoader { private static String sharedLibraryName = "librocksdbjni.so"; private static String tempFilePrefix = "librocksdbjni"; private static String tempFileSuffix = ".so"; - /** - * Private constructor - this class will never be instanced - */ - private NativeLibraryLoader() { - } public static void loadLibraryFromJar() throws IOException { @@ -23,7 +23,7 @@ public class NativeLibraryLoader { throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); } - byte[] buffer = new byte[1024]; + byte[] buffer = new byte[102400]; int readBytes; InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName); @@ -31,8 +31,8 @@ public class NativeLibraryLoader { throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); } - OutputStream os = new FileOutputStream(temp); try { + OutputStream os = new FileOutputStream(temp); while ((readBytes = is.read(buffer)) != -1) { os.write(buffer, 0, readBytes); } @@ -43,4 +43,9 @@ public class NativeLibraryLoader { System.load(temp.getAbsolutePath()); } + /** + * Private constructor to disallow instantiation + */ + private NativeLibraryLoader() { + } } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 132b9ac39..f45a608e2 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -26,7 +26,7 @@ public class RocksDB extends RocksObject { "snappy", "z", "bzip2", "lz4", "lz4hc"}; static { - loadLibrary(); + RocksDB.loadLibrary(); } From fd7d3fe604191d9e076c9a76485111a0109acadf Mon Sep 17 00:00:00 2001 From: Naveen Date: Mon, 22 Sep 2014 18:20:02 -0700 Subject: [PATCH 119/829] Addressing review comments (adding a env variable to override temp directory) --- java/org/rocksdb/NativeLibraryLoader.java | 19 +++++++++++++------ java/org/rocksdb/RocksDB.java | 7 +++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 880e90acc..ad4315dd4 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -8,15 +8,18 @@ import java.io.*; * The shared library is extracted to a temp folder and loaded from there. 
*/ public class NativeLibraryLoader { - private static String sharedLibraryName = "librocksdbjni.so"; private static String tempFilePrefix = "librocksdbjni"; private static String tempFileSuffix = ".so"; - public static void loadLibraryFromJar() + public static void loadLibraryFromJar(String tmpDir) throws IOException { + File temp; + if(tmpDir == null || tmpDir.equals("")) + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + else + temp = new File(tmpDir+"/"+sharedLibraryName); - File temp = File.createTempFile(tempFilePrefix, tempFileSuffix); temp.deleteOnExit(); if (!temp.exists()) { @@ -31,14 +34,18 @@ public class NativeLibraryLoader { throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); } + OutputStream os = null; try { - OutputStream os = new FileOutputStream(temp); + os = new FileOutputStream(temp); while ((readBytes = is.read(buffer)) != -1) { os.write(buffer, 0, readBytes); } } finally { - os.close(); - is.close(); + if(os != null) + os.close(); + + if(is != null) + is.close(); } System.load(temp.getAbsolutePath()); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index f45a608e2..387e34282 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -20,21 +20,20 @@ import org.rocksdb.NativeLibraryLoader; * indicates sth wrong at the rocksdb library side and the call failed. */ public class RocksDB extends RocksObject { - public static final int NOT_FOUND = -1; private static final String[] compressionLibs_ = { "snappy", "z", "bzip2", "lz4", "lz4hc"}; static { - RocksDB.loadLibrary(); + RocksDB.loadLibrary(); } - /** * Loads the necessary library files. * Calling this method twice will have no effect. */ public static synchronized void loadLibrary() { + String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); // loading possibly necessary libraries. for (String lib : compressionLibs_) { try { @@ -45,7 +44,7 @@ public class RocksDB extends RocksObject { } try { - NativeLibraryLoader.loadLibraryFromJar(); + NativeLibraryLoader.loadLibraryFromJar(tmpDir); } catch (IOException e) { From 51eeaf65e26bab134d7ebef1a69bc7356c815d60 Mon Sep 17 00:00:00 2001 From: Naveen Date: Tue, 23 Sep 2014 10:31:55 -0700 Subject: [PATCH 120/829] Addressing review comments --- java/org/rocksdb/NativeLibraryLoader.java | 2 +- java/org/rocksdb/RocksDB.java | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index ad4315dd4..440056582 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -18,7 +18,7 @@ public class NativeLibraryLoader { if(tmpDir == null || tmpDir.equals("")) temp = File.createTempFile(tempFilePrefix, tempFileSuffix); else - temp = new File(tmpDir+"/"+sharedLibraryName); + temp = new File(tmpDir + "/" + sharedLibraryName); temp.deleteOnExit(); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 387e34282..ec1cb8a28 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -31,6 +31,9 @@ public class RocksDB extends RocksObject { /** * Loads the necessary library files. * Calling this method twice will have no effect. + * By default the method extracts the shared library for loading at + * java.io.tmpdir, however, you can override this temporary location by + * setting the environment variable ROCKSDB_SHAREDLIB_DIR. 
*/ public static synchronized void loadLibrary() { String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); From cdaf44f9aeebb38281fc742c76cc0602516a55d8 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 23 Sep 2014 13:43:03 -0700 Subject: [PATCH 121/829] Enlarge log size cap when printing file summary Summary: Now the file summary is too small for printing. Enlarge it. To enable it, allow to pass a size to log buffer. Test Plan: Add a unit test. make all check Reviewers: ljin, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D21723 --- db/compaction_picker.cc | 2 +- db/version_set.h | 4 ++-- util/env_test.cc | 35 +++++++++++++++++++++++++++++++++++ util/log_buffer.cc | 21 ++++++++++++++++----- util/log_buffer.h | 9 +++++++-- 5 files changed, 61 insertions(+), 10 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 04d5c6f47..7cd965c20 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -575,7 +575,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, return nullptr; } Version::FileSummaryStorage tmp; - LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n", + LogToBuffer(log_buffer, 3072, "[%s] Universal: candidate files(%zu): %s\n", version->cfd_->GetName().c_str(), version->files_[level].size(), version->LevelFileSummary(&tmp, 0)); diff --git a/db/version_set.h b/db/version_set.h index 353adbfec..211fca179 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -183,10 +183,10 @@ class Version { // Return a human-readable short (single-line) summary of the number // of files per level. Uses *scratch as backing store. struct LevelSummaryStorage { - char buffer[100]; + char buffer[1000]; }; struct FileSummaryStorage { - char buffer[1000]; + char buffer[3000]; }; const char* LevelSummary(LevelSummaryStorage* scratch) const; // Return a human-readable short (single-line) summary of files diff --git a/util/env_test.cc b/util/env_test.cc index 3e811a98d..1779f1aa0 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -768,6 +768,41 @@ TEST(EnvPosixTest, LogBufferTest) { ASSERT_EQ(10, test_logger.char_x_count); } +class TestLogger2 : public Logger { + public: + explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} + virtual void Logv(const char* format, va_list ap) override { + char new_format[2000]; + std::fill_n(new_format, sizeof(new_format), '2'); + { + va_list backup_ap; + va_copy(backup_ap, ap); + int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); + // 48 bytes for extra information + bytes allocated + ASSERT_TRUE( + n <= 48 + static_cast(max_log_size_ - sizeof(struct timeval))); + ASSERT_TRUE(n > static_cast(max_log_size_ - sizeof(struct timeval))); + va_end(backup_ap); + } + } + size_t max_log_size_; +}; + +TEST(EnvPosixTest, LogBufferMaxSizeTest) { + char bytes9000[9000]; + std::fill_n(bytes9000, sizeof(bytes9000), '1'); + bytes9000[sizeof(bytes9000) - 1] = '\0'; + + for (size_t max_log_size = 256; max_log_size <= 1024; + max_log_size += 1024 - 256) { + TestLogger2 test_logger(max_log_size); + test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger); + LogToBuffer(&log_buffer, max_log_size, "%s", bytes9000); + log_buffer.FlushBufferToLog(); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/log_buffer.cc b/util/log_buffer.cc index 726c01442..ddddaec9f 100644 --- a/util/log_buffer.cc +++ b/util/log_buffer.cc @@ 
-13,17 +13,17 @@ LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger*info_log) : log_level_(log_level), info_log_(info_log) {} -void LogBuffer::AddLogToBuffer(const char* format, va_list ap) { +void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format, + va_list ap) { if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { // Skip the level because of its level. return; } - const size_t kLogSizeLimit = 512; - char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit); + char* alloc_mem = arena_.AllocateAligned(max_log_size); BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); char* p = buffered_log->message; - char* limit = alloc_mem + kLogSizeLimit - 1; + char* limit = alloc_mem + max_log_size - 1; // store the time gettimeofday(&(buffered_log->now_tv), nullptr); @@ -61,11 +61,22 @@ void LogBuffer::FlushBufferToLog() { logs_.clear(); } +void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format, + ...) { + if (log_buffer != nullptr) { + va_list ap; + va_start(ap, format); + log_buffer->AddLogToBuffer(max_log_size, format, ap); + va_end(ap); + } +} + void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) { + const size_t kDefaultMaxLogSize = 512; if (log_buffer != nullptr) { va_list ap; va_start(ap, format); - log_buffer->AddLogToBuffer(format, ap); + log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap); va_end(ap); } } diff --git a/util/log_buffer.h b/util/log_buffer.h index 2a24bf854..2d790086e 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -21,8 +21,9 @@ class LogBuffer { // info_log: logger to write the logs to LogBuffer(const InfoLogLevel log_level, Logger* info_log); - // Add a log entry to the buffer. - void AddLogToBuffer(const char* format, va_list ap); + // Add a log entry to the buffer. Use default max_log_size. + // max_log_size indicates maximize log size, including some metadata. + void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap); size_t IsEmpty() const { return logs_.empty(); } @@ -44,6 +45,10 @@ class LogBuffer { // Add log to the LogBuffer for a delayed info logging. It can be used when // we want to add some logs inside a mutex. +// max_log_size indicates maximize log size, including some metadata. +extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, + const char* format, ...); +// Same as previous function, but with default max log size. 
extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); } // namespace rocksdb From 1dfb7bb980175e0025af09cabc771aadbe7f01c9 Mon Sep 17 00:00:00 2001 From: Ankit Gupta Date: Wed, 24 Sep 2014 11:43:35 -0700 Subject: [PATCH 122/829] Add block based table config options --- java/RocksDBSample.java | 18 ++- java/org/rocksdb/BlockBasedTableConfig.java | 139 ++++++++++++++++++-- java/org/rocksdb/RocksDB.java | 22 ++++ java/rocksjni/rocksjni.cc | 24 ++++ java/rocksjni/table.cc | 24 +++- 5 files changed, 208 insertions(+), 19 deletions(-) diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index 9ec3d8345..54e5c2086 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -79,9 +79,23 @@ public class RocksDBSample { BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) .setFilterBitsPerKey(10) - .setCacheNumShardBits(6); + .setCacheNumShardBits(6) + .setBlockSizeDeviation(5) + .setBlockRestartInterval(10) + .setCacheIndexAndFilterBlocks(true) + .setHashIndexAllowCollision(false) + .setBlockCacheCompressedSize(64 * SizeUnit.KB) + .setBlockCacheCompressedNumShardBits(10); + assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); assert(table_options.cacheNumShardBits() == 6); + assert(table_options.blockSizeDeviation() == 5); + assert(table_options.blockRestartInterval() == 10); + assert(table_options.cacheIndexAndFilterBlocks() == true); + assert(table_options.hashIndexAllowCollision() == false); + assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); + assert(table_options.blockCacheCompressedNumShardBits() == 10); + options.setTableFormatConfig(table_options); assert(options.tableFactoryName().equals("BlockBasedTable")); @@ -90,6 +104,8 @@ public class RocksDBSample { db.put("hello".getBytes(), "world".getBytes()); byte[] value = db.get("hello".getBytes()); assert("world".equals(new String(value))); + String str = db.getProperty("rocksdb.stats"); + assert(str != null && str != ""); } catch (RocksDBException e) { System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); assert(db == null); diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index 523a57691..bdb27d6c2 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -14,11 +14,14 @@ public class BlockBasedTableConfig extends TableFormatConfig { public BlockBasedTableConfig() { noBlockCache_ = false; blockCacheSize_ = 8 * 1024 * 1024; - blockSize_ = 4 * 1024; - blockSizeDeviation_ =10; - blockRestartInterval_ =16; + blockSize_ = 4 * 1024; + blockSizeDeviation_ = 10; + blockRestartInterval_ = 16; wholeKeyFiltering_ = true; - bitsPerKey_ = 0; + bitsPerKey_ = 10; + cacheIndexAndFilterBlocks_ = false; + hashIndexAllowCollision_ = true; + blockCacheCompressedSize_ = 0; } /** @@ -71,8 +74,8 @@ public class BlockBasedTableConfig extends TableFormatConfig { * number means use default settings." * @return the reference to the current option. */ - public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) { - numShardBits_ = numShardBits; + public BlockBasedTableConfig setCacheNumShardBits(int blockCacheNumShardBits) { + blockCacheNumShardBits_ = blockCacheNumShardBits; return this; } @@ -84,7 +87,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * @return the number of shard bits used in the block cache. 
*/ public int cacheNumShardBits() { - return numShardBits_; + return blockCacheNumShardBits_; } /** @@ -186,25 +189,135 @@ public class BlockBasedTableConfig extends TableFormatConfig { bitsPerKey_ = bitsPerKey; return this; } + + /** + * Indicating if we'd put index/filter blocks to the block cache. + If not specified, each "table reader" object will pre-load index/filter + block during table initialization. + * + * @return if index and filter blocks should be put in block cache. + */ + public boolean cacheIndexAndFilterBlocks() { + return cacheIndexAndFilterBlocks_; + } + + /** + * Indicating if we'd put index/filter blocks to the block cache. + If not specified, each "table reader" object will pre-load index/filter + block during table initialization. + * + * @param index and filter blocks should be put in block cache. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setCacheIndexAndFilterBlocks( + boolean cacheIndexAndFilterBlocks) { + cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks; + return this; + } + + /** + * Influence the behavior when kHashSearch is used. + if false, stores a precise prefix to block range mapping + if true, does not store prefix and allows prefix hash collision + (less memory consumption) + * + * @return if hash collisions should be allowed. + */ + public boolean hashIndexAllowCollision() { + return hashIndexAllowCollision_; + } + + /** + * Influence the behavior when kHashSearch is used. + if false, stores a precise prefix to block range mapping + if true, does not store prefix and allows prefix hash collision + (less memory consumption) + * + * @param if hash collisions should be allowed. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setHashIndexAllowCollision( + boolean hashIndexAllowCollision) { + hashIndexAllowCollision_ = hashIndexAllowCollision; + return this; + } + + /** + * Size of compressed block cache. If 0, then block_cache_compressed is set + * to null. + * + * @return size of compressed block cache. + */ + public long blockCacheCompressedSize() { + return blockCacheCompressedSize_; + } + + /** + * Size of compressed block cache. If 0, then block_cache_compressed is set + * to null. + * + * @param size of compressed block cache. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockCacheCompressedSize( + long blockCacheCompressedSize) { + blockCacheCompressedSize_ = blockCacheCompressedSize; + return this; + } + + /** + * Controls the number of shards for the block compressed cache. + * This is applied only if blockCompressedCacheSize is set to non-negative. + * + * @return numShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings. + */ + public int blockCacheCompressedNumShardBits() { + return blockCacheCompressedNumShardBits_; + } + + /** + * Controls the number of shards for the block compressed cache. + * This is applied only if blockCompressedCacheSize is set to non-negative. + * + * @param numShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings." + * @return the reference to the current option. 
+ */ + public BlockBasedTableConfig setBlockCacheCompressedNumShardBits( + int blockCacheCompressedNumShardBits) { + blockCacheCompressedNumShardBits_ = blockCacheCompressedNumShardBits; + return this; + } @Override protected long newTableFactoryHandle() { - return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_, - blockSize_, blockSizeDeviation_, blockRestartInterval_, - wholeKeyFiltering_, bitsPerKey_); + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, + blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, + blockRestartInterval_, wholeKeyFiltering_, bitsPerKey_, + cacheIndexAndFilterBlocks_, hashIndexAllowCollision_, + blockCacheCompressedSize_, blockCacheCompressedNumShardBits_); } private native long newTableFactoryHandle( - boolean noBlockCache, long blockCacheSize, int numShardbits, + boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, int bitsPerKey); + boolean wholeKeyFiltering, int bitsPerKey, + boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, + long blockCacheCompressedSize, int blockCacheCompressedNumShardBits); private boolean noBlockCache_; private long blockCacheSize_; - private int numShardBits_; + private int blockCacheNumShardBits_; private long shard; private long blockSize_; private int blockSizeDeviation_; private int blockRestartInterval_; private boolean wholeKeyFiltering_; private int bitsPerKey_; + private boolean cacheIndexAndFilterBlocks_; + private boolean hashIndexAllowCollision_; + private long blockCacheCompressedSize_; + private int blockCacheCompressedNumShardBits_; } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index f8968d14d..829ac48df 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -309,6 +309,26 @@ public class RocksDB extends RocksObject { throws RocksDBException { remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } + + /** + * DB implementations can export properties about their state + via this method. If "property" is a valid property understood by this + DB implementation, fills "*value" with its current value and returns + true. Otherwise returns false. + + + Valid property names include: + + "rocksdb.num-files-at-level" - return the number of files at level , + where is an ASCII representation of a level number (e.g. "0"). + "rocksdb.stats" - returns a multi-line string that describes statistics + about the internal operation of the DB. + "rocksdb.sstables" - returns a multi-line string that describes all + of the sstables that make up the db contents. + */ + public String getProperty(String property) throws RocksDBException { + return getProperty0(nativeHandle_, property, property.length()); + } /** * Return a heap-allocated iterator over the contents of the database. 
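On the native side, newTableFactoryHandle() copies these Java settings into a rocksdb::BlockBasedTableOptions before building the table factory, and getProperty() forwards to DB::GetProperty(). A rough C++ sketch of the equivalent configuration follows, mirroring the values used in RocksDBSample.java; note that the JNI hunk later in this patch assigns options.block_cache when sizing the compressed cache, so the sketch's use of block_cache_compressed is an assumption about intent, not a copy of the patch:

    #include <cstdio>
    #include <string>
    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeSampleOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = rocksdb::NewLRUCache(64 * 1024, 6);
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
      table_options.block_size_deviation = 5;
      table_options.block_restart_interval = 10;
      table_options.cache_index_and_filter_blocks = true;
      table_options.hash_index_allow_collision = false;
      // Assumed target field for the compressed block cache settings.
      table_options.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024, 10);

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }

    // The new Java getProperty() maps onto DB::GetProperty() on the native side.
    void DumpStats(rocksdb::DB* db) {
      std::string stats;
      if (db->GetProperty("rocksdb.stats", &stats)) {
        fprintf(stderr, "%s\n", stats.c_str());
      }
    }
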
@@ -363,6 +383,8 @@ public class RocksDB extends RocksObject { protected native void remove( long handle, long writeOptHandle, byte[] key, int keyLen) throws RocksDBException; + protected native String getProperty0(long nativeHandle, + String property, int propertyLength) throws RocksDBException; protected native long iterator0(long optHandle); private native void disposeInternal(long handle); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index f55290f64..f1b9cc758 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -425,3 +425,27 @@ jlong Java_org_rocksdb_RocksDB_iterator0( rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions()); return reinterpret_cast(iterator); } + +/* + * Class: org_rocksdb_RocksDB + * Method: getProperty0 + * Signature: (JLjava/lang/String;I)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getProperty0( + JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, + jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + std::string property_value; + bool retCode = db->GetProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + + return env->NewStringUTF(property_value.data()); +} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index ffda1a2ba..500cb255e 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -31,20 +31,22 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZI)J + * Signature: (ZJIJIIZIZZJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, - jint num_shardbits, jlong block_size, jint block_size_deviation, + jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, - jint bits_per_key) { + jint bits_per_key, jboolean cache_index_and_filter_blocks, + jboolean hash_index_allow_collision, jlong block_cache_compressed_size, + jint block_cache_compressd_num_shard_bits) { rocksdb::BlockBasedTableOptions options; options.no_block_cache = no_block_cache; if (!no_block_cache && block_cache_size > 0) { - if (num_shardbits > 0) { + if (block_cache_num_shardbits > 0) { options.block_cache = - rocksdb::NewLRUCache(block_cache_size, num_shardbits); + rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); } else { options.block_cache = rocksdb::NewLRUCache(block_cache_size); } @@ -56,5 +58,17 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( if (bits_per_key > 0) { options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key)); } + options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; + options.hash_index_allow_collision = hash_index_allow_collision; + if (block_cache_compressed_size > 0) { + if (block_cache_compressd_num_shard_bits > 0) { + options.block_cache = + rocksdb::NewLRUCache(block_cache_compressed_size, + block_cache_compressd_num_shard_bits); + } else { + options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); + } + } + return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } From 
21ddcf6e4f4f3e87a344e5a1c68e1756d2ac25a7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 24 Sep 2014 13:12:16 -0700 Subject: [PATCH 123/829] Remove allow_thread_local Summary: See https://reviews.facebook.net/D19365 Test Plan: compiles Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23907 --- HISTORY.md | 3 ++ db/c.cc | 9 ++--- db/column_family.cc | 18 +++------- db/db_impl.cc | 46 ++++++++++---------------- include/rocksdb/c.h | 2 -- include/rocksdb/options.h | 4 --- java/org/rocksdb/Options.java | 27 --------------- java/org/rocksdb/test/OptionsTest.java | 6 ---- java/rocksjni/options.cc | 21 ------------ util/options.cc | 2 -- util/options_helper.cc | 2 -- util/options_test.cc | 2 -- 12 files changed, 28 insertions(+), 114 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index a8b89f54f..41c49cc1a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,7 +8,10 @@ * We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. * When disableDataSync=true, we no longer sync the MANIFEST file. * Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. + +### Public API changes * Change target_file_size_base type to uint64_t from int. +* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. ----- Past Releases ----- diff --git a/db/c.cc b/db/c.cc index d9dee46fb..b3077aaad 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1355,8 +1355,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush( opt->rep.purge_redundant_kvs_while_flush = v; } -void rocksdb_options_set_allow_os_buffer( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt, + unsigned char v) { opt->rep.allow_os_buffer = v; } @@ -1581,11 +1581,6 @@ void rocksdb_options_set_bloom_locality( opt->rep.bloom_locality = v; } -void rocksdb_options_set_allow_thread_local( - rocksdb_options_t* opt, unsigned char v) { - opt->rep.allow_thread_local = v; -} - void rocksdb_options_set_inplace_update_support( rocksdb_options_t* opt, unsigned char v) { opt->rep.inplace_update_support = v; diff --git a/db/column_family.cc b/db/column_family.cc index 8b4e007ed..f95090225 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -411,16 +411,10 @@ Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level, SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( port::Mutex* db_mutex) { SuperVersion* sv = nullptr; - if (LIKELY(column_family_set_->db_options_->allow_thread_local)) { - sv = GetThreadLocalSuperVersion(db_mutex); - sv->Ref(); - if (!ReturnThreadLocalSuperVersion(sv)) { - sv->Unref(); - } - } else { - db_mutex->Lock(); - sv = super_version_->Ref(); - db_mutex->Unlock(); + sv = GetThreadLocalSuperVersion(db_mutex); + sv->Ref(); + if (!ReturnThreadLocalSuperVersion(sv)) { + sv->Unref(); } return sv; } @@ -506,9 +500,7 @@ SuperVersion* ColumnFamilyData::InstallSuperVersion( ++super_version_number_; super_version_->version_number = super_version_number_; // Reset SuperVersions cached in thread local storage - if (column_family_set_->db_options_->allow_thread_local) { - ResetThreadLocalSuperVersions(); - } + ResetThreadLocalSuperVersions(); 
RecalculateWriteStallConditions(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 260939810..cb03d7ea6 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -401,24 +401,22 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - if (db_options_.allow_thread_local) { - // Clean up obsolete files due to SuperVersion release. - // (1) Need to delete to obsolete files before closing because RepairDB() - // scans all existing files in the file system and builds manifest file. - // Keeping obsolete files confuses the repair process. - // (2) Need to check if we Open()/Recover() the DB successfully before - // deleting because if VersionSet recover fails (may be due to corrupted - // manifest file), it is not able to identify live files correctly. As a - // result, all "live" files can get deleted by accident. However, corrupted - // manifest is recoverable by RepairDB(). - if (opened_successfully_) { - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); - // manifest number starting from 2 - deletion_state.manifest_file_number = 1; - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); - } + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. As a + // result, all "live" files can get deleted by accident. However, corrupted + // manifest is recoverable by RepairDB(). + if (opened_successfully_) { + DeletionState deletion_state; + FindObsoleteFiles(deletion_state, true); + // manifest number starting from 2 + deletion_state.manifest_file_number = 1; + if (deletion_state.HaveSomethingToDelete()) { + PurgeObsoleteFiles(deletion_state); } } @@ -4315,20 +4313,12 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - if (LIKELY(db_options_.allow_thread_local)) { - return cfd->GetThreadLocalSuperVersion(&mutex_); - } else { - MutexLock l(&mutex_); - return cfd->GetSuperVersion()->Ref(); - } + return cfd->GetThreadLocalSuperVersion(&mutex_); } void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { - bool unref_sv = true; - if (LIKELY(db_options_.allow_thread_local)) { - unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); - } + bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); if (unref_sv) { // Release SuperVersion diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index e4b1bb753..726a1edc3 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -537,8 +537,6 @@ extern void rocksdb_options_set_min_partial_merge_operands( rocksdb_options_t*, uint32_t); extern void rocksdb_options_set_bloom_locality( rocksdb_options_t*, uint32_t); -extern void rocksdb_options_set_allow_thread_local( - rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_num_locks( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index a60f94268..467c7bb1e 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -799,10 +799,6 
@@ struct DBOptions { // Default: false bool use_adaptive_mutex; - // Allow RocksDB to use thread local storage to optimize performance. - // Default: true - bool allow_thread_local; - // Create DBOptions with default values for all fields DBOptions(); // Create DBOptions from Options diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 33ca19d9d..8446136f8 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -1070,33 +1070,6 @@ public class Options extends RocksObject { private native void setBytesPerSync( long handle, long bytesPerSync); - /** - * Allow RocksDB to use thread local storage to optimize performance. - * Default: true - * - * @return true if thread-local storage is allowed - */ - public boolean allowThreadLocal() { - assert(isInitialized()); - return allowThreadLocal(nativeHandle_); - } - private native boolean allowThreadLocal(long handle); - - /** - * Allow RocksDB to use thread local storage to optimize performance. - * Default: true - * - * @param allowThreadLocal true if thread-local storage is allowed. - * @return the reference to the current option. - */ - public Options setAllowThreadLocal(boolean allowThreadLocal) { - assert(isInitialized()); - setAllowThreadLocal(nativeHandle_, allowThreadLocal); - return this; - } - private native void setAllowThreadLocal( - long handle, boolean allowThreadLocal); - /** * Set the config for mem-table. * diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index d81ca1076..d3abb48cd 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -184,12 +184,6 @@ public class OptionsTest { assert(opt.bytesPerSync() == longValue); } - { // AllowThreadLocal test - boolean boolValue = rand.nextBoolean(); - opt.setAllowThreadLocal(boolValue); - assert(opt.allowThreadLocal() == boolValue); - } - { // WriteBufferSize test long longValue = rand.nextLong(); opt.setWriteBufferSize(longValue); diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 705e9ff8c..2dc2ffdc8 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -789,27 +789,6 @@ void Java_org_rocksdb_Options_setBytesPerSync( static_cast(bytes_per_sync); } -/* - * Class: org_rocksdb_Options - * Method: allowThreadLocal - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_allowThreadLocal( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->allow_thread_local; -} - -/* - * Class: org_rocksdb_Options - * Method: setAllowThreadLocal - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setAllowThreadLocal( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) { - reinterpret_cast(jhandle)->allow_thread_local = - static_cast(allow_thread_local); -} - /* * Method: tableFactoryName * Signature: (J)Ljava/lang/String diff --git a/util/options.cc b/util/options.cc index 28120659b..8716b465d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -213,7 +213,6 @@ DBOptions::DBOptions() advise_random_on_open(true), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), - allow_thread_local(true), bytes_per_sync(0) {} DBOptions::DBOptions(const Options& options) @@ -256,7 +255,6 @@ DBOptions::DBOptions(const Options& options) advise_random_on_open(options.advise_random_on_open), access_hint_on_compaction_start(options.access_hint_on_compaction_start), use_adaptive_mutex(options.use_adaptive_mutex), - allow_thread_local(options.allow_thread_local), 
bytes_per_sync(options.bytes_per_sync) {} static const char* const access_hints[] = { diff --git a/util/options_helper.cc b/util/options_helper.cc index d552a2b9e..35c3f63df 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -301,8 +301,6 @@ bool GetOptionsFromStrings( new_options->advise_random_on_open = ParseBoolean(o.first, o.second); } else if (o.first == "use_adaptive_mutex") { new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); - } else if (o.first == "allow_thread_local") { - new_options->allow_thread_local = ParseBoolean(o.first, o.second); } else if (o.first == "bytes_per_sync") { new_options->bytes_per_sync = ParseUint64(o.second); } else { diff --git a/util/options_test.cc b/util/options_test.cc index eee285e2a..1e26c343d 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -151,7 +151,6 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { {"stats_dump_period_sec", "46"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, - {"allow_thread_local", "true"}, {"bytes_per_sync", "47"}, }; @@ -239,7 +238,6 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { ASSERT_EQ(new_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_opt.advise_random_on_open, true); ASSERT_EQ(new_opt.use_adaptive_mutex, false); - ASSERT_EQ(new_opt.allow_thread_local, true); ASSERT_EQ(new_opt.bytes_per_sync, static_cast(47)); options_map["write_buffer_size"] = "hello"; From f7375f39fd5db29ecdc3d6821a15efedadaa4a66 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 25 Sep 2014 11:08:16 -0700 Subject: [PATCH 124/829] Fix double deletes Summary: While debugging clients compaction issues, I noticed bunch of delete bugs: P16329995. MakeTableName returns sst file with "/" prefix. We also need "/" prefix when we get the files though GetChildren(), so that we can properly dedup the files. Test Plan: none Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23457 --- db/db_impl.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index cb03d7ea6..bd9b222b3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -590,7 +590,8 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, env_->GetChildren(db_options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { - deletion_state.candidate_files.emplace_back(file, path_id); + // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + deletion_state.candidate_files.emplace_back("/" + file, path_id); } } From 3c6800610995c5eee3c04254e9f1d4cbef9e96a0 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 11:14:01 -0700 Subject: [PATCH 125/829] CompactedDBImpl Summary: Add a CompactedDBImpl that will enabled when calling OpenForReadOnly() and the DB only has one level (>0) of files. As a performan comparison, CuckooTable performs 2.1M/s with CompactedDBImpl vs. 1.78M/s with ReadOnlyDBImpl. 
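Before the diff, a minimal sketch of how a reader would land on the compacted-DB path; the database name and key are placeholders. Per the Open() checks added below, the fast path requires max_open_files = -1 and no merge operator, and otherwise OpenForReadOnly() falls back to the plain read-only implementation:

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::Status ReadFromCompactedDb(const std::string& dbname,
                                        std::string* value) {
      rocksdb::Options options;
      options.max_open_files = -1;  // required for the CompactedDBImpl fast path
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, dbname, &db);
      if (!s.ok()) {
        return s;
      }
      // Reads behave as usual; writes return Status::NotSupported().
      s = db->Get(rocksdb::ReadOptions(), "some-key", value);
      delete db;
      return s;
    }
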
Test Plan: db_bench Reviewers: yhchiang, igor, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23553 --- db/db_bench.cc | 40 ++++++ db/db_impl.h | 1 + db/db_impl_readonly.cc | 46 ++----- db/db_impl_readonly.h | 13 -- db/db_test.cc | 74 +++++++++++ db/version_set.cc | 61 +++------ db/version_set.h | 23 ++++ table/cuckoo_table_reader.cc | 4 +- utilities/compacted_db/compacted_db_impl.cc | 132 ++++++++++++++++++++ utilities/compacted_db/compacted_db_impl.h | 92 ++++++++++++++ 10 files changed, 395 insertions(+), 91 deletions(-) create mode 100644 utilities/compacted_db/compacted_db_impl.cc create mode 100644 utilities/compacted_db/compacted_db_impl.h diff --git a/db/db_bench.cc b/db/db_bench.cc index d90c628a9..926d8de69 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1262,6 +1262,8 @@ class Benchmark { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { method = &Benchmark::ReadRandom; + } else if (name == Slice("readrandomfast")) { + method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { @@ -2071,6 +2073,44 @@ class Benchmark { thread->stats.AddBytes(bytes); } + void ReadRandomFast(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + ReadOptions options(FLAGS_verify_checksum, true); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + std::string value; + DB* db = SelectDBWithCfh(thread)->db; + + int64_t pot = 1; + while (pot < FLAGS_num) { + pot <<= 1; + } + + Duration duration(FLAGS_duration, reads_); + do { + for (int i = 0; i < 100; ++i) { + int64_t key_rand = thread->rand.Next() & (pot - 1); + GenerateKeyFromInt(key_rand, FLAGS_num, &key); + ++read; + if (db->Get(options, key, &value).ok()) { + ++found; + } + } + thread->stats.FinishedOps(db, 100); + } while (!duration.Done(100)); + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", + found, read); + + thread->stats.AddMessage(msg); + + if (FLAGS_perf_level > 0) { + thread->stats.AddMessage(perf_context.ToString()); + } + } + void ReadRandom(ThreadState* thread) { int64_t read = 0; int64_t found = 0; diff --git a/db/db_impl.h b/db/db_impl.h index 0bc2018b4..c6baf9c95 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -325,6 +325,7 @@ class DBImpl : public DB { friend class ForwardIterator; #endif friend struct SuperVersion; + friend class CompactedDBImpl; struct CompactionState; struct WriteContext; diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 8cea58736..98e2bfeb0 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -2,42 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 Facebook. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
#include "db/db_impl_readonly.h" +#include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" - -#include -#include -#include -#include -#include -#include -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" #include "db/merge_context.h" -#include "db/table_cache.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" -#include "rocksdb/merge_operator.h" -#include "port/port.h" -#include "table/block.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/build_version.h" +#include "db/db_iter.h" namespace rocksdb { @@ -120,6 +90,15 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DB** dbptr, bool error_if_log_file_exist) { *dbptr = nullptr; + // Try to first open DB as fully compacted DB + Status s; +#ifndef ROCKSDB_LITE + s = CompactedDBImpl::Open(options, dbname, dbptr); + if (s.ok()) { + return s; + } +#endif + DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; @@ -127,8 +106,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - Status s = - DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 1dfdf422e..9b10b83fb 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -2,24 +2,11 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 Facebook. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
#pragma once #include "db/db_impl.h" - -#include -#include #include #include -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "port/port.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 7ad249d7f..09e59f46c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1270,6 +1270,80 @@ TEST(DBTest, ReadOnlyDB) { ASSERT_EQ("v2", Get("bar")); } +TEST(DBTest, CompactedDB) { + const uint64_t kFileSize = 1 << 20; + Options options; + options.disable_auto_compactions = true; + options.max_mem_compaction_level = 0; + options.write_buffer_size = kFileSize; + options.target_file_size_base = kFileSize; + options.max_bytes_for_level_base = 1 << 30; + options.compression = kNoCompression; + Reopen(&options); + // 1 L0 file, use CompactedDB if max_open_files = -1 + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); + Flush(); + Close(); + ASSERT_OK(ReadOnlyReopen(&options)); + Status s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + options.max_open_files = -1; + ASSERT_OK(ReadOnlyReopen(&options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + Reopen(&options); + // Add more L0 files + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); + Flush(); + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); + Flush(); + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); + Flush(); + Close(); + + ASSERT_OK(ReadOnlyReopen(&options)); + // Fallback to read-only DB + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + Close(); + + // Full compaction + Reopen(&options); + // Add more keys + ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); + ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); + ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); + ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); + ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + Close(); + + // CompactedDB + ASSERT_OK(ReadOnlyReopen(&options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ("NOT_FOUND", Get("abc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa")); + ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb")); + ASSERT_EQ("NOT_FOUND", Get("ccc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee")); + ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff")); + ASSERT_EQ("NOT_FOUND", Get("ggg")); + ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh")); + ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii")); + ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); + ASSERT_EQ("NOT_FOUND", Get("kkk")); +} + // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. 
TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { diff --git a/db/version_set.cc b/db/version_set.cc index 7edfaa788..0a46d7edc 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -626,46 +626,23 @@ void Version::AddIterators(const ReadOptions& read_options, } } -// Callback from TableCache::Get() -enum SaverState { - kNotFound, - kFound, - kDeleted, - kCorrupt, - kMerge // saver contains the current merge result (the operands) -}; - -namespace version_set { -struct Saver { - SaverState state; - const Comparator* ucmp; - Slice user_key; - bool* value_found; // Is value set correctly? Used by KeyMayExist - std::string* value; - const MergeOperator* merge_operator; - // the merge operations encountered; - MergeContext* merge_context; - Logger* logger; - Statistics* statistics; -}; -} // namespace version_set // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. In this // case we can't guarantee that key does not exist and are not permitted to do // IO to be certain.Set the status=kFound and value_found=false to let the // caller know that key may exist but is not there in memory -static void MarkKeyMayExist(void* arg) { - version_set::Saver* s = reinterpret_cast(arg); - s->state = kFound; +void MarkKeyMayExist(void* arg) { + Version::Saver* s = reinterpret_cast(arg); + s->state = Version::kFound; if (s->value_found != nullptr) { *(s->value_found) = false; } } -static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v) { - version_set::Saver* s = reinterpret_cast(arg); +bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v) { + Version::Saver* s = reinterpret_cast(arg); MergeContext* merge_contex = s->merge_context; std::string merge_result; // temporary area for merge results later @@ -676,17 +653,17 @@ static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, // Key matches. 
Process it switch (parsed_key.type) { case kTypeValue: - if (kNotFound == s->state) { - s->state = kFound; + if (Version::kNotFound == s->state) { + s->state = Version::kFound; s->value->assign(v.data(), v.size()); - } else if (kMerge == s->state) { + } else if (Version::kMerge == s->state) { assert(s->merge_operator != nullptr); - s->state = kFound; + s->state = Version::kFound; if (!s->merge_operator->FullMerge(s->user_key, &v, merge_contex->GetOperands(), s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; + s->state = Version::kCorrupt; } } else { assert(false); @@ -694,15 +671,15 @@ static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, return false; case kTypeDeletion: - if (kNotFound == s->state) { - s->state = kDeleted; - } else if (kMerge == s->state) { - s->state = kFound; + if (Version::kNotFound == s->state) { + s->state = Version::kDeleted; + } else if (Version::kMerge == s->state) { + s->state = Version::kFound; if (!s->merge_operator->FullMerge(s->user_key, nullptr, merge_contex->GetOperands(), s->value, s->logger)) { RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; + s->state = Version::kCorrupt; } } else { assert(false); @@ -710,8 +687,8 @@ static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, return false; case kTypeMerge: - assert(s->state == kNotFound || s->state == kMerge); - s->state = kMerge; + assert(s->state == Version::kNotFound || s->state == Version::kMerge); + s->state = Version::kMerge; merge_contex->PushOperand(v); return true; @@ -779,7 +756,7 @@ void Version::Get(const ReadOptions& options, Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); - version_set::Saver saver; + Saver saver; saver.state = status->ok()? kNotFound : kMerge; saver.ucmp = user_comparator_; saver.user_key = user_key; diff --git a/db/version_set.h b/db/version_set.h index 211fca179..9e6cc1e34 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -241,10 +241,33 @@ class Version { FileMetaData* file; }; + enum SaverState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) + }; + + // Callback from TableCache::Get() + struct Saver { + SaverState state; + const Comparator* ucmp; + Slice user_key; + bool* value_found; // Is value set correctly? 
Used by KeyMayExist + std::string* value; + const MergeOperator* merge_operator; + // the merge operations encountered; + MergeContext* merge_context; + Logger* logger; + Statistics* statistics; + }; + private: friend class Compaction; friend class VersionSet; friend class DBImpl; + friend class CompactedDBImpl; friend class ColumnFamilyData; friend class CompactionPicker; friend class LevelCompactionPicker; diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 63b8a2c8c..b8ac5a47e 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -120,9 +120,9 @@ Status CuckooTableReader::Get( get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; - ++block_idx, bucket += bucket_length_) { + ++block_idx, bucket += bucket_length_) { if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()), - Slice(bucket, user_key.size())) == 0) { + Slice(bucket, user_key.size())) == 0) { return Status::OK(); } // Here, we compare only the user key part as we support only one entry diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc new file mode 100644 index 000000000..07dc71ea9 --- /dev/null +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include "utilities/compacted_db/compacted_db_impl.h" +#include "db/db_impl.h" +#include "db/version_set.h" +#include "db/merge_context.h" + +namespace rocksdb { + +extern void MarkKeyMayExist(void* arg); +extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v); + +CompactedDBImpl::CompactedDBImpl( + const DBOptions& options, const std::string& dbname) + : DBImpl(options, dbname) { +} + +CompactedDBImpl::~CompactedDBImpl() { +} + +Status CompactedDBImpl::Get(const ReadOptions& options, + ColumnFamilyHandle*, const Slice& key, std::string* value) { + size_t left = 0; + size_t right = files_.num_files - 1; + while (left < right) { + size_t mid = (left + right) >> 1; + const FdWithKeyRange& f = files_.files[mid]; + if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { + // Key at "mid.largest" is < "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. 
+ right = mid; + } + } + const FdWithKeyRange& f = files_.files[right]; + + bool value_found; + MergeContext merge_context; + Version::Saver saver; + saver.state = Version::kNotFound; + saver.ucmp = user_comparator_; + saver.user_key = key; + saver.value_found = &value_found; + saver.value = value; + saver.merge_operator = nullptr; + saver.merge_context = &merge_context; + saver.logger = info_log_; + saver.statistics = statistics_; + LookupKey lkey(key, kMaxSequenceNumber); + f.fd.table_reader->Get(options, lkey.internal_key(), + reinterpret_cast(&saver), SaveValue, + MarkKeyMayExist); + if (saver.state == Version::kFound) { + return Status::OK(); + } + return Status::NotFound(); +} + +Status CompactedDBImpl::Init(const Options& options) { + mutex_.Lock(); + ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)); + Status s = Recover({ cf }, true /* read only */, false); + if (s.ok()) { + cfd_ = reinterpret_cast( + DefaultColumnFamily())->cfd(); + delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_); + } + mutex_.Unlock(); + if (!s.ok()) { + return s; + } + version_ = cfd_->GetSuperVersion()->current; + user_comparator_ = cfd_->user_comparator(); + statistics_ = cfd_->ioptions()->statistics; + info_log_ = cfd_->ioptions()->info_log; + // L0 should not have files + if (version_->file_levels_[0].num_files > 1) { + return Status::NotSupported("L0 contain more than 1 file"); + } + if (version_->file_levels_[0].num_files == 1) { + if (version_->num_non_empty_levels_ > 1) { + return Status::NotSupported("Both L0 and other level contain files"); + } + files_ = version_->file_levels_[0]; + return Status::OK(); + } + + for (int i = 1; i < version_->num_non_empty_levels_ - 1; ++i) { + if (version_->file_levels_[i].num_files > 0) { + return Status::NotSupported("Other levels also contain files"); + } + } + + int level = version_->num_non_empty_levels_ - 1; + if (version_->file_levels_[level].num_files > 0) { + files_ = version_->file_levels_[version_->num_non_empty_levels_ - 1]; + return Status::OK(); + } + return Status::NotSupported("no file exists"); +} + +Status CompactedDBImpl::Open(const Options& options, + const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + + if (options.max_open_files != -1) { + return Status::InvalidArgument("require max_open_files = -1"); + } + if (options.merge_operator.get() != nullptr) { + return Status::InvalidArgument("merge operator is not supported"); + } + DBOptions db_options(options); + std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); + Status s = db->Init(options); + if (s.ok()) { + *dbptr = db.release(); + Log(options.info_log, "Opened the db as fully compacted mode"); + } + return s; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h new file mode 100644 index 000000000..8237a2cdd --- /dev/null +++ b/utilities/compacted_db/compacted_db_impl.h @@ -0,0 +1,92 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once +#ifndef ROCKSDB_LITE +#include "db/db_impl.h" +#include +#include + +namespace rocksdb { + +class CompactedDBImpl : public DBImpl { + public: + CompactedDBImpl(const DBOptions& options, const std::string& dbname); + virtual ~CompactedDBImpl(); + + static Status Open(const Options& options, const std::string& dbname, + DB** dbptr); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status Write(const WriteOptions& options, + WriteBatch* updates) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1, + uint32_t target_path_id = 0) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status EnableFileDeletions(bool force) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + private: + friend class DB; + Status Init(const Options& options); + + ColumnFamilyData* cfd_; + Version* version_; + const Comparator* user_comparator_; + FileLevel files_; + + Statistics* statistics_; + Logger* info_log_; + + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&); + void operator=(const CompactedDBImpl&); +}; +} +#endif // ROCKSDB_LITE From fbd2dafc9f22484ace3bb330e57fa50a3261a4ba Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 13:34:51 -0700 Subject: [PATCH 126/829] CompactedDBImpl::MultiGet() for better CuckooTable performance Summary: Add the MultiGet API to allow prefetching. With file size of 1.5G, I configured it to have 0.9 hash ratio that can fill With 115M keys and result in 2 hash functions, the lookup QPS is ~4.9M/s vs. 3M/s for Get(). It is tricky to set the parameters right. Since files size is determined by power-of-two factor, that means # of keys is fixed in each file. With big file size (thus smaller # of files), we will have more chance to waste lot of space in the last file - lower space utilization as a result. Using smaller file size can improve the situation, but that harms lookup speed. 
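Before the diff, a short sketch of the batched lookup this change is aimed at; the keys are placeholders. Batching lets CompactedDBImpl locate the target file for every key and call TableReader::Prepare() on it first, so the subsequent probes benefit from prefetching:

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    void BatchedLookup(rocksdb::DB* db) {
      std::vector<rocksdb::Slice> keys = {"aaa", "eee", "iii"};
      std::vector<std::string> values;
      std::vector<rocksdb::Status> statuses =
          db->MultiGet(rocksdb::ReadOptions(), keys, &values);
      for (size_t i = 0; i < keys.size(); ++i) {
        if (statuses[i].ok()) {
          fprintf(stderr, "%s -> %zu bytes\n",
                  keys[i].ToString().c_str(), values[i].size());
        }
      }
    }
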
Test Plan: db_bench Reviewers: yhchiang, sdong, igor Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23673 --- db/db_bench.cc | 13 ++++- db/db_test.cc | 18 +++++++ utilities/compacted_db/compacted_db_impl.cc | 54 +++++++++++++++++++-- utilities/compacted_db/compacted_db_impl.h | 7 +++ 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 926d8de69..85e840a7f 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -167,6 +167,8 @@ DEFINE_int32(value_size, 100, "Size of each value"); DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); +DEFINE_int64(batch_size, 1, "Batch size"); + static bool ValidateKeySize(const char* flagname, int32_t value) { return true; } @@ -1265,6 +1267,8 @@ class Benchmark { } else if (name == Slice("readrandomfast")) { method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { + entries_per_batch_ = FLAGS_batch_size; + fprintf(stderr, "entries_per_batch_ = %ld\n", entries_per_batch_); method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { ++key_size_; @@ -2076,6 +2080,7 @@ class Benchmark { void ReadRandomFast(ThreadState* thread) { int64_t read = 0; int64_t found = 0; + int64_t nonexist = 0; ReadOptions options(FLAGS_verify_checksum, true); Slice key = AllocateKey(); std::unique_ptr key_guard(key.data()); @@ -2096,13 +2101,17 @@ class Benchmark { if (db->Get(options, key, &value).ok()) { ++found; } + if (key_rand >= FLAGS_num) { + ++nonexist; + } } thread->stats.FinishedOps(db, 100); } while (!duration.Done(100)); char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", - found, read); + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, " + "issued %" PRIu64 " non-exist keys)\n", + found, read, nonexist); thread->stats.AddMessage(msg); diff --git a/db/db_test.cc b/db/db_test.cc index 09e59f46c..ab290d108 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1342,6 +1342,24 @@ TEST(DBTest, CompactedDB) { ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii")); ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); ASSERT_EQ("NOT_FOUND", Get("kkk")); + + // MultiGet + std::vector values; + std::vector status_list = dbfull()->MultiGet(ReadOptions(), + std::vector({Slice("aaa"), Slice("ccc"), Slice("eee"), + Slice("ggg"), Slice("iii"), Slice("kkk")}), + &values); + ASSERT_EQ(status_list.size(), 6); + ASSERT_EQ(values.size(), 6); + ASSERT_OK(status_list[0]); + ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); + ASSERT_TRUE(status_list[1].IsNotFound()); + ASSERT_OK(status_list[2]); + ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]); + ASSERT_TRUE(status_list[3].IsNotFound()); + ASSERT_OK(status_list[4]); + ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]); + ASSERT_TRUE(status_list[5].IsNotFound()); } // Make sure that when options.block_cache is set, after a new table is diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 07dc71ea9..431eb3ba7 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -23,8 +23,7 @@ CompactedDBImpl::CompactedDBImpl( CompactedDBImpl::~CompactedDBImpl() { } -Status CompactedDBImpl::Get(const ReadOptions& options, - ColumnFamilyHandle*, const Slice& key, std::string* value) { +size_t CompactedDBImpl::FindFile(const Slice& key) { size_t left = 0; size_t right = files_.num_files - 1; while (left < right) { @@ -40,7 +39,12 @@ 
Status CompactedDBImpl::Get(const ReadOptions& options, right = mid; } } - const FdWithKeyRange& f = files_.files[right]; + return right; +} + +Status CompactedDBImpl::Get(const ReadOptions& options, + ColumnFamilyHandle*, const Slice& key, std::string* value) { + const FdWithKeyRange& f = files_.files[FindFile(key)]; bool value_found; MergeContext merge_context; @@ -64,6 +68,50 @@ Status CompactedDBImpl::Get(const ReadOptions& options, return Status::NotFound(); } +std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) { + autovector reader_list; + for (const auto& key : keys) { + const FdWithKeyRange& f = files_.files[FindFile(key)]; + if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { + reader_list.push_back(nullptr); + } else { + LookupKey lkey(key, kMaxSequenceNumber); + f.fd.table_reader->Prepare(lkey.internal_key()); + reader_list.push_back(f.fd.table_reader); + } + } + std::vector statuses(keys.size(), Status::NotFound()); + values->resize(keys.size()); + bool value_found; + MergeContext merge_context; + Version::Saver saver; + saver.ucmp = user_comparator_; + saver.value_found = &value_found; + saver.merge_operator = nullptr; + saver.merge_context = &merge_context; + saver.logger = info_log_; + saver.statistics = statistics_; + int idx = 0; + for (auto* r : reader_list) { + if (r != nullptr) { + saver.state = Version::kNotFound; + saver.user_key = keys[idx]; + saver.value = &(*values)[idx]; + LookupKey lkey(keys[idx], kMaxSequenceNumber); + r->Get(options, lkey.internal_key(), + reinterpret_cast(&saver), SaveValue, + MarkKeyMayExist); + if (saver.state == Version::kFound) { + statuses[idx] = Status::OK(); + } + } + ++idx; + } + return statuses; +} + Status CompactedDBImpl::Init(const Options& options) { mutex_.Lock(); ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h index 8237a2cdd..ef3effced 100644 --- a/utilities/compacted_db/compacted_db_impl.h +++ b/utilities/compacted_db/compacted_db_impl.h @@ -24,6 +24,12 @@ class CompactedDBImpl : public DBImpl { virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) override; + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) + override; using DBImpl::Put; virtual Status Put(const WriteOptions& options, @@ -74,6 +80,7 @@ class CompactedDBImpl : public DBImpl { private: friend class DB; + inline size_t FindFile(const Slice& key); Status Init(const Options& options); ColumnFamilyData* cfd_; From 581442d44609876705a7ec399ab5b9b7843954fe Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 13:53:27 -0700 Subject: [PATCH 127/829] option to choose module when calculating CuckooTable hash Summary: Using module to calculate hash makes lookup ~8% slower. 
But it has its benefit: file size is more predictable, more space enffient Test Plan: db_bench Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23691 --- include/rocksdb/table.h | 10 ++++- table/cuckoo_table_builder.cc | 64 +++++++++++++++++++----------- table/cuckoo_table_builder.h | 3 +- table/cuckoo_table_builder_test.cc | 38 +++++++++--------- table/cuckoo_table_factory.cc | 5 ++- table/cuckoo_table_factory.h | 20 ++++++---- table/cuckoo_table_reader.cc | 23 +++++++---- table/cuckoo_table_reader.h | 3 +- table/cuckoo_table_reader_test.cc | 6 +-- 9 files changed, 107 insertions(+), 65 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2b0255a97..e8ac6bd62 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -253,6 +253,8 @@ struct CuckooTablePropertyNames { static const std::string kIsLastLevel; // Indicate if using identity function for the first hash function. static const std::string kIdentityAsFirstHash; + // Indicate if using module or bit and to calculate hash value + static const std::string kUseModuleHash; }; struct CuckooTableOptions { @@ -271,11 +273,17 @@ struct CuckooTableOptions { // function. This makes lookups more cache friendly in case // of collisions. uint32_t cuckoo_block_size = 5; - // If this options is enabled, user key is treated as uint64_t and its value + // If this option is enabled, user key is treated as uint64_t and its value // is used as hash value directly. This option changes builder's behavior. // Reader ignore this option and behave according to what specified in table // property. bool identity_as_first_hash = false; + // If this option is set to true, module is used during hash calculation. + // This often yields better space efficiency at the cost of performance. + // If this optino is set to false, # of entries in table is constrained to be + // power of two, and bit and is used to calculate hash, which is faster in + // general. + bool use_module_hash = true; }; // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 51c80d9df..17184ae2c 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -37,6 +37,8 @@ const std::string CuckooTablePropertyNames::kCuckooBlockSize = "rocksdb.cuckoo.hash.cuckooblocksize"; const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = "rocksdb.cuckoo.hash.identityfirst"; +const std::string CuckooTablePropertyNames::kUseModuleHash = + "rocksdb.cuckoo.hash.usemodule"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -45,7 +47,7 @@ CuckooTableBuilder::CuckooTableBuilder( WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool identity_as_first_hash, + bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : num_hash_func_(2), file_(file), @@ -53,10 +55,11 @@ CuckooTableBuilder::CuckooTableBuilder( max_num_hash_func_(max_num_hash_table), max_search_depth_(max_search_depth), cuckoo_block_size_(std::max(1U, cuckoo_block_size)), - hash_table_size_(2), + hash_table_size_(use_module_hash ? 
0 : 2), is_last_level_file_(false), has_seen_first_key_(false), ucomp_(user_comparator), + use_module_hash_(use_module_hash), identity_as_first_hash_(identity_as_first_hash), get_slice_hash_(get_slice_hash), closed_(false) { @@ -105,14 +108,15 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } else if (ikey.user_key.compare(largest_user_key_) > 0) { largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); } - if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { - hash_table_size_ *= 2; + if (!use_module_hash_) { + if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { + hash_table_size_ *= 2; + } } } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - uint64_t hash_table_size_minus_one = hash_table_size_ - 1; - buckets->resize(hash_table_size_minus_one + cuckoo_block_size_); + buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); uint64_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { uint64_t bucket_id; @@ -122,8 +126,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { ExtractUserKey(kvs_[vector_idx].first); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; ++hash_cnt) { - uint64_t hash_val = CuckooHash(user_key, hash_cnt, - hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); // If there is a collision, check next cuckoo_block_size_ locations for // empty locations. While checking, if we reach end of the hash table, // stop searching and proceed for next hash function. @@ -152,8 +156,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { } // We don't really need to rehash the entire table because old hashes are // still valid and we only increased the number of hash functions. - uint64_t hash_val = CuckooHash(user_key, num_hash_func_, - hash_table_size_minus_one, identity_as_first_hash_, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { @@ -178,6 +182,10 @@ Status CuckooTableBuilder::Finish() { Status s; std::string unused_bucket; if (!kvs_.empty()) { + // Calculate the real hash size if module hash is enabled. 
+ if (use_module_hash_) { + hash_table_size_ = kvs_.size() / max_hash_table_ratio_; + } s = MakeHashTable(&buckets); if (!s.ok()) { return s; @@ -252,11 +260,10 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kNumHashFunc].assign( reinterpret_cast(&num_hash_func_), sizeof(num_hash_func_)); - uint64_t hash_table_size = buckets.size() - cuckoo_block_size_ + 1; properties_.user_collected_properties[ CuckooTablePropertyNames::kHashTableSize].assign( - reinterpret_cast(&hash_table_size), - sizeof(hash_table_size)); + reinterpret_cast(&hash_table_size_), + sizeof(hash_table_size_)); properties_.user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].assign( reinterpret_cast(&is_last_level_file_), @@ -269,6 +276,10 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kIdentityAsFirstHash].assign( reinterpret_cast(&identity_as_first_hash_), sizeof(identity_as_first_hash_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUseModuleHash].assign( + reinterpret_cast(&use_module_hash_), + sizeof(use_module_hash_)); // Write meta blocks. MetaIndexBuilder meta_index_builder; @@ -322,16 +333,22 @@ uint64_t CuckooTableBuilder::FileSize() const { return 0; } - // Account for buckets being a power of two. - // As elements are added, file size remains constant for a while and doubles - // its size. Since compaction algorithm stops adding elements only after it - // exceeds the file limit, we account for the extra element being added here. - uint64_t expected_hash_table_size = hash_table_size_; - if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { - expected_hash_table_size *= 2; + if (use_module_hash_) { + return (kvs_[0].first.size() + kvs_[0].second.size()) * kvs_.size() / + max_hash_table_ratio_; + } else { + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and + // doubles its size. Since compaction algorithm stops adding elements + // only after it exceeds the file limit, we account for the extra element + // being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; + } + return (kvs_[0].first.size() + kvs_[0].second.size()) * + expected_hash_table_size - 1; } - return (kvs_[0].first.size() + kvs_[0].second.size()) * - expected_hash_table_size - 1; } // This method is invoked when there is no place to insert the target key. @@ -373,7 +390,6 @@ bool CuckooTableBuilder::MakeSpaceForKey( make_space_for_key_call_id; tree.push_back(CuckooNode(bucket_id, 0, 0)); } - uint64_t hash_table_size_minus_one = hash_table_size_ - 1; bool null_found = false; uint32_t curr_pos = 0; while (!null_found && curr_pos < tree.size()) { @@ -388,7 +404,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( uint64_t child_bucket_id = CuckooHash( (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first : ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), - hash_cnt, hash_table_size_minus_one, identity_as_first_hash_, + hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_, get_slice_hash_); // Iterate inside Cuckoo Block. 
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 45cf49315..d5fe3f5dc 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -24,7 +24,7 @@ class CuckooTableBuilder: public TableBuilder { WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_func, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool identity_as_first_hash, + bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. @@ -88,6 +88,7 @@ class CuckooTableBuilder: public TableBuilder { TableProperties properties_; bool has_seen_first_key_; const Comparator* ucomp_; + bool use_module_hash_; bool identity_as_first_hash_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index d25950728..d3b3a713e 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -50,12 +50,6 @@ class CuckooBuilderTest { TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, kCuckooTableMagicNumber, env_, nullptr, &props)); - ASSERT_EQ(props->num_entries, keys.size()); - ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); - ASSERT_EQ(props->data_size, expected_unused_bucket.size() * - (expected_table_size + expected_cuckoo_block_size - 1)); - ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); - // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -83,6 +77,12 @@ class CuckooBuilderTest { *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].data()); ASSERT_EQ(expected_is_last_level, is_last_level_found); + + ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); + ASSERT_EQ(props->data_size, expected_unused_bucket.size() * + (expected_table_size + expected_cuckoo_block_size - 1)); + ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); delete props; // Check contents of the bucket. 
@@ -133,12 +133,12 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, false, GetSliceHash); + 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - CheckFileContents({}, {}, {}, "", 0, 2, false); + CheckFileContents({}, {}, {}, "", 2, 2, false); } TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { @@ -162,7 +162,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -202,7 +202,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -243,8 +243,8 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, - GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -289,7 +289,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -331,7 +331,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -367,7 +367,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, 
&writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -403,7 +403,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -441,7 +441,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -479,7 +479,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -499,7 +499,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 18db54ed7..4afc9fc2e 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -30,10 +30,11 @@ TableBuilder* CuckooTableFactory::NewTableBuilder( const InternalKeyComparator& internal_comparator, WritableFile* file, const CompressionType, const CompressionOptions&) const { + // TODO: change builder to take the option struct return new CuckooTableBuilder(file, table_options_.hash_table_ratio, 64, table_options_.max_search_depth, internal_comparator.user_comparator(), - table_options_.cuckoo_block_size, table_options_.identity_as_first_hash, - nullptr); + table_options_.cuckoo_block_size, table_options_.use_module_hash, + table_options_.identity_as_first_hash, nullptr); } std::string CuckooTableFactory::GetPrintableTableOptions() const { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 7b2f32ce3..599908678 100644 --- 
a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -15,21 +15,27 @@ namespace rocksdb { const uint32_t kCuckooMurmurSeedMultiplier = 816922183; static inline uint64_t CuckooHash( - const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one, - bool identity_as_first_hash, + const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, + uint64_t table_size_, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { #ifndef NDEBUG // This part is used only in unit tests. if (get_slice_hash != nullptr) { - return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1); + return get_slice_hash(user_key, hash_cnt, table_size_); } #endif + uint64_t value = 0; if (hash_cnt == 0 && identity_as_first_hash) { - return (*reinterpret_cast(user_key.data())) & - table_size_minus_one; + value = (*reinterpret_cast(user_key.data())); + } else { + value = MurmurHash(user_key.data(), user_key.size(), + kCuckooMurmurSeedMultiplier * hash_cnt); + } + if (use_module_hash) { + return value % table_size_; + } else { + return value & (table_size_ - 1); } - return MurmurHash(user_key.data(), user_key.size(), - kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one; } // Cuckoo Table is designed for applications that require fast point lookups diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index b8ac5a47e..30a8d8079 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -77,8 +77,9 @@ CuckooTableReader::CuckooTableReader( status_ = Status::Corruption("Hash table size not found"); return; } - table_size_minus_one_ = *reinterpret_cast( - hash_table_size->second.data()) - 1; + table_size_ = *reinterpret_cast( + hash_table_size->second.data()); + auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { status_ = Status::Corruption("Is last level not found"); @@ -95,6 +96,15 @@ CuckooTableReader::CuckooTableReader( identity_as_first_hash_ = *reinterpret_cast( identity_as_first_hash->second.data()); + auto use_module_hash = user_props.find( + CuckooTablePropertyNames::kUseModuleHash); + if (use_module_hash == user_props.end()) { + status_ = Status::Corruption("hash type is not found"); + return; + } + use_module_hash_ = *reinterpret_cast( + use_module_hash->second.data()); + fprintf(stderr, "use_module_hash %d\n", use_module_hash_); auto cuckoo_block_size = user_props.find( CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { @@ -116,8 +126,8 @@ Status CuckooTableReader::Get( Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t offset = bucket_length_ * CuckooHash( - user_key, hash_cnt, table_size_minus_one_, identity_as_first_hash_, - get_slice_hash_); + user_key, hash_cnt, use_module_hash_, table_size_, + identity_as_first_hash_, get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, bucket += bucket_length_) { @@ -151,7 +161,7 @@ void CuckooTableReader::Prepare(const Slice& key) { // Prefetch the first Cuckoo Block. 
Slice user_key = ExtractUserKey(key); uint64_t addr = reinterpret_cast(file_data_.data()) + - bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, + bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, identity_as_first_hash_, nullptr); uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { @@ -219,8 +229,7 @@ CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) void CuckooTableIterator::LoadKeysFromReader() { key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); - uint64_t num_buckets = reader_->table_size_minus_one_ + - reader_->cuckoo_block_size_; + uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) { Slice read_key; status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index f9e93abf4..8b3ad4b91 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -65,6 +65,7 @@ class CuckooTableReader: public TableReader { Slice file_data_; bool is_last_level_; bool identity_as_first_hash_; + bool use_module_hash_; std::shared_ptr table_props_; Status status_; uint32_t num_hash_func_; @@ -74,7 +75,7 @@ class CuckooTableReader: public TableReader { uint32_t bucket_length_; uint32_t cuckoo_block_size_; uint32_t cuckoo_block_bytes_minus_one_; - uint64_t table_size_minus_one_; + uint64_t table_size_; const Comparator* ucomp_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 6dd5e5525..6566b7a29 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -110,8 +110,8 @@ class CuckooReaderTest { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, - GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -434,7 +434,7 @@ void WriteFile(const std::vector& keys, CuckooTableBuilder builder( writable_file.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, - FLAGS_identity_as_first_hash, nullptr); + false, FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. From c6275956e2d73b6aa5023d0b1298a495620289d8 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 16:15:23 -0700 Subject: [PATCH 128/829] improve memory efficiency of cuckoo reader Summary: When creating a new iterator, instead of storing mapping from key to bucket id for sorting, store only bucket id and read key from mmap file based on the id. This reduces from 20 bytes per entry to only 4 bytes. 
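A rough standalone sketch of the approach, assuming an mmapped file region laid out as fixed-size buckets; the names (file_data, bucket_len, key_len, unused_key) are illustrative rather than the actual reader members, and plain bytewise comparison stands in for the configured user comparator:

    // Sort 4-byte bucket ids; the comparator re-reads key bytes from the
    // mmapped region on demand instead of storing a (key, bucket_id) pair.
    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    std::vector<uint32_t> SortedBucketIds(const char* file_data,
                                          uint32_t bucket_len, uint32_t key_len,
                                          uint32_t num_buckets,
                                          const std::string& unused_key) {
      std::vector<uint32_t> ids;  // 4 bytes per entry instead of ~20
      ids.reserve(num_buckets);
      for (uint32_t id = 0; id < num_buckets; ++id) {
        const char* bucket = file_data + id * bucket_len;
        if (std::memcmp(bucket, unused_key.data(), key_len) != 0) {
          ids.push_back(id);  // keep only occupied buckets
        }
      }
      std::sort(ids.begin(), ids.end(), [&](uint32_t a, uint32_t b) {
        return std::memcmp(file_data + a * bucket_len,
                           file_data + b * bucket_len, key_len) < 0;
      });
      return ids;
    }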
Test Plan: db_bench Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23757 --- include/rocksdb/table.h | 2 + table/cuckoo_table_builder.cc | 7 ++ table/cuckoo_table_reader.cc | 126 +++++++++++++++++++--------------- table/cuckoo_table_reader.h | 1 + 4 files changed, 82 insertions(+), 54 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index e8ac6bd62..4c06c23f7 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -255,6 +255,8 @@ struct CuckooTablePropertyNames { static const std::string kIdentityAsFirstHash; // Indicate if using module or bit and to calculate hash value static const std::string kUseModuleHash; + // Fixed user key length + static const std::string kUserKeyLength; }; struct CuckooTableOptions { diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 17184ae2c..56eb377fa 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -39,6 +39,8 @@ const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = "rocksdb.cuckoo.hash.identityfirst"; const std::string CuckooTablePropertyNames::kUseModuleHash = "rocksdb.cuckoo.hash.usemodule"; +const std::string CuckooTablePropertyNames::kUserKeyLength = + "rocksdb.cuckoo.hash.userkeylength"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -280,6 +282,11 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kUseModuleHash].assign( reinterpret_cast(&use_module_hash_), sizeof(use_module_hash_)); + uint32_t user_key_len = static_cast(smallest_user_key_.size()); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUserKeyLength].assign( + reinterpret_cast(&user_key_len), + sizeof(user_key_len)); // Write meta blocks. 
MetaIndexBuilder meta_index_builder; diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 30a8d8079..8c3f58eac 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -16,6 +16,7 @@ #include #include #include "rocksdb/iterator.h" +#include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" #include "util/arena.h" @@ -23,7 +24,8 @@ namespace rocksdb { namespace { - static const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint32_t kInvalidIndex = std::numeric_limits::max(); } extern const uint64_t kCuckooTableMagicNumber; @@ -62,6 +64,14 @@ CuckooTableReader::CuckooTableReader( unused_key_ = unused_key->second; key_length_ = props->fixed_key_len; + auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); + if (user_key_len == user_props.end()) { + status_ = Status::Corruption("User key length not found"); + return; + } + user_key_length_ = *reinterpret_cast( + user_key_len->second.data()); + auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); if (value_length == user_props.end()) { status_ = Status::Corruption("Value length not found"); @@ -104,7 +114,6 @@ CuckooTableReader::CuckooTableReader( } use_module_hash_ = *reinterpret_cast( use_module_hash->second.data()); - fprintf(stderr, "use_module_hash %d\n", use_module_hash_); auto cuckoo_block_size = user_props.find( CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { @@ -185,30 +194,39 @@ class CuckooTableIterator : public Iterator { void LoadKeysFromReader(); private: - struct CompareKeys { - CompareKeys(const Comparator* ucomp, const bool last_level) - : ucomp_(ucomp), - is_last_level_(last_level) {} - bool operator()(const std::pair& first, - const std::pair& second) const { - if (is_last_level_) { - return ucomp_->Compare(first.first, second.first) < 0; - } else { - return ucomp_->Compare(ExtractUserKey(first.first), - ExtractUserKey(second.first)) < 0; - } + struct BucketComparator { + BucketComparator(const Slice file_data, const Comparator* ucomp, + uint32_t bucket_len, uint32_t user_key_len, + const Slice target = Slice()) + : file_data_(file_data), + ucomp_(ucomp), + bucket_len_(bucket_len), + user_key_len_(user_key_len), + target_(target) {} + bool operator()(const uint32_t first, const uint32_t second) const { + const char* first_bucket = + (first == kInvalidIndex) ? target_.data() : + &file_data_.data()[first * bucket_len_]; + const char* second_bucket = + (second == kInvalidIndex) ? target_.data() : + &file_data_.data()[second * bucket_len_]; + return ucomp_->Compare(Slice(first_bucket, user_key_len_), + Slice(second_bucket, user_key_len_)) < 0; } - private: + const Slice file_data_; const Comparator* ucomp_; - const bool is_last_level_; + const uint32_t bucket_len_; + const uint32_t user_key_len_; + const Slice target_; }; - const CompareKeys comparator_; + + const BucketComparator bucket_comparator_; void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; Status status_; // Contains a map of keys to bucket_id sorted in key order. - std::vector> key_to_bucket_id_; + std::vector sorted_bucket_ids_; // We assume that the number of items can be stored in uint32 (4 Billion). 
uint32_t curr_key_idx_; Slice curr_value_; @@ -219,29 +237,31 @@ class CuckooTableIterator : public Iterator { }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) - : comparator_(reader->ucomp_, reader->is_last_level_), + : bucket_comparator_(reader->file_data_, reader->ucomp_, + reader->bucket_length_, reader->user_key_length_), reader_(reader), - curr_key_idx_(std::numeric_limits::max()) { - key_to_bucket_id_.clear(); + curr_key_idx_(kInvalidIndex) { + sorted_bucket_ids_.clear(); curr_value_.clear(); curr_key_.Clear(); } void CuckooTableIterator::LoadKeysFromReader() { - key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); + sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; - for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) { - Slice read_key; - status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, - reader_->key_length_, &read_key, nullptr); - if (read_key != Slice(reader_->unused_key_)) { - key_to_bucket_id_.push_back(std::make_pair(read_key, bucket_id)); + assert(num_buckets < kInvalidIndex); + const char* bucket = reader_->file_data_.data(); + for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { + if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { + sorted_bucket_ids_.push_back(bucket_id); } + bucket += reader_->bucket_length_; } - assert(key_to_bucket_id_.size() == + assert(sorted_bucket_ids_.size() == reader_->GetTableProperties()->num_entries); - std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), comparator_); - curr_key_idx_ = key_to_bucket_id_.size(); + std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), + bucket_comparator_); + curr_key_idx_ = kInvalidIndex; } void CuckooTableIterator::SeekToFirst() { @@ -250,25 +270,25 @@ void CuckooTableIterator::SeekToFirst() { } void CuckooTableIterator::SeekToLast() { - curr_key_idx_ = key_to_bucket_id_.size() - 1; + curr_key_idx_ = sorted_bucket_ids_.size() - 1; PrepareKVAtCurrIdx(); } void CuckooTableIterator::Seek(const Slice& target) { - // We assume that the target is an internal key. If this is last level file, - // we need to take only the user key part to seek. - Slice target_to_search = reader_->is_last_level_ ? 
- ExtractUserKey(target) : target; - auto seek_it = std::lower_bound(key_to_bucket_id_.begin(), - key_to_bucket_id_.end(), - std::make_pair(target_to_search, 0), - comparator_); - curr_key_idx_ = std::distance(key_to_bucket_id_.begin(), seek_it); + const BucketComparator seek_comparator( + reader_->file_data_, reader_->ucomp_, + reader_->bucket_length_, reader_->user_key_length_, + ExtractUserKey(target)); + auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), + sorted_bucket_ids_.end(), + kInvalidIndex, + seek_comparator); + curr_key_idx_ = std::distance(sorted_bucket_ids_.begin(), seek_it); PrepareKVAtCurrIdx(); } bool CuckooTableIterator::Valid() const { - return curr_key_idx_ < key_to_bucket_id_.size(); + return curr_key_idx_ < sorted_bucket_ids_.size(); } void CuckooTableIterator::PrepareKVAtCurrIdx() { @@ -277,15 +297,17 @@ void CuckooTableIterator::PrepareKVAtCurrIdx() { curr_key_.Clear(); return; } - uint64_t offset = ((uint64_t) key_to_bucket_id_[curr_key_idx_].second - * reader_->bucket_length_) + reader_->key_length_; - status_ = reader_->file_->Read(offset, reader_->value_length_, - &curr_value_, nullptr); + uint32_t id = sorted_bucket_ids_[curr_key_idx_]; + const char* offset = reader_->file_data_.data() + + id * reader_->bucket_length_; if (reader_->is_last_level_) { // Always return internal key. - curr_key_.SetInternalKey( - key_to_bucket_id_[curr_key_idx_].first, 0, kTypeValue); + curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), + 0, kTypeValue); + } else { + curr_key_.SetKey(Slice(offset, reader_->key_length_)); } + curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); } void CuckooTableIterator::Next() { @@ -300,7 +322,7 @@ void CuckooTableIterator::Next() { void CuckooTableIterator::Prev() { if (curr_key_idx_ == 0) { - curr_key_idx_ = key_to_bucket_id_.size(); + curr_key_idx_ = sorted_bucket_ids_.size(); } if (!Valid()) { curr_value_.clear(); @@ -313,11 +335,7 @@ void CuckooTableIterator::Prev() { Slice CuckooTableIterator::key() const { assert(Valid()); - if (reader_->is_last_level_) { - return curr_key_.GetKey(); - } else { - return key_to_bucket_id_[curr_key_idx_].first; - } + return curr_key_.GetKey(); } Slice CuckooTableIterator::value() const { diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 8b3ad4b91..8f7635cfa 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -71,6 +71,7 @@ class CuckooTableReader: public TableReader { uint32_t num_hash_func_; std::string unused_key_; uint32_t key_length_; + uint32_t user_key_length_; uint32_t value_length_; uint32_t bucket_length_; uint32_t cuckoo_block_size_; From 94997eab5e91d77850f91a03217241739d938edc Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 16:34:24 -0700 Subject: [PATCH 129/829] reduce memory usage of cuckoo table builder Summary: builder currently buffers all key value pairs as a vector of pair. That is too much due to std::string overhead. It wasn't able to fit 1B key/values (12bytes total) in 100GB of ram. Switch to use a plain string to store the key/value sequence and use only 12GB of ram as a result. 
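A condensed sketch of the new layout, assuming fixed key and value sizes known up front (the real builder derives them from the first Add() call); the class and member names here are illustrative only:

    // A single flat string of fixed-size records replaces
    // std::vector<std::pair<std::string, std::string>>, avoiding two heap
    // allocations and two string headers per entry.
    #include <cassert>
    #include <cstdint>
    #include <string>

    class FlatKVBuffer {
     public:
      FlatKVBuffer(size_t key_size, size_t value_size)
          : key_size_(key_size), value_size_(value_size) {}

      void Add(const std::string& key, const std::string& value) {
        assert(key.size() == key_size_ && value.size() == value_size_);
        kvs_.append(key);
        kvs_.append(value);
        ++num_entries_;
      }
      // Record i starts at offset i * (key_size_ + value_size_).
      const char* Key(uint64_t idx) const {
        assert(idx < num_entries_);
        return &kvs_[idx * (key_size_ + value_size_)];
      }
      const char* Value(uint64_t idx) const {
        assert(idx < num_entries_);
        return &kvs_[idx * (key_size_ + value_size_) + key_size_];
      }
      uint64_t num_entries() const { return num_entries_; }

     private:
      size_t key_size_;
      size_t value_size_;
      std::string kvs_;
      uint64_t num_entries_ = 0;
    };

At roughly 12 bytes of payload per entry this is about 12 GB for a billion entries, versus the 100+ GB observed with the vector-of-pairs representation, where each entry also pays for two std::string objects and allocator overhead.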
Test Plan: db_bench Reviewers: igor, sdong, yhchiang Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23763 --- table/cuckoo_table_builder.cc | 76 ++++++++++++++++++++--------------- table/cuckoo_table_builder.h | 15 ++++++- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 56eb377fa..a66e9899e 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -60,6 +60,9 @@ CuckooTableBuilder::CuckooTableBuilder( hash_table_size_(use_module_hash ? 0 : 2), is_last_level_file_(false), has_seen_first_key_(false), + key_size_(0), + value_size_(0), + num_entries_(0), ucomp_(user_comparator), use_module_hash_(use_module_hash), identity_as_first_hash_(identity_as_first_hash), @@ -72,7 +75,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (kvs_.size() >= kMaxVectorIdx - 1) { + if (num_entries_ >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -90,15 +93,18 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { has_seen_first_key_ = true; smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); + value_size_ = value.size(); } // Even if one sequence number is non-zero, then it is not last level. assert(!is_last_level_file_ || ikey.sequence == 0); if (is_last_level_file_) { - kvs_.emplace_back(std::make_pair( - ikey.user_key.ToString(), value.ToString())); + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); } else { - kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); + kvs_.append(key.data(), key.size()); } + kvs_.append(value.data(), value.size()); + ++num_entries_; // In order to fill the empty buckets in the hash table, we identify a // key which is not used so far (unused_user_key). We determine this by @@ -111,21 +117,32 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); } if (!use_module_hash_) { - if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { + if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) { hash_table_size_ *= 2; } } } +Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); +} + +Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx)); +} + +Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); +} + Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); uint64_t make_space_for_key_call_id = 0; - for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { + for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id; bool bucket_found = false; autovector hash_vals; - Slice user_key = is_last_level_file_ ? 
kvs_[vector_idx].first : - ExtractUserKey(kvs_[vector_idx].first); + Slice user_key = GetUserKey(vector_idx); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; ++hash_cnt) { uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, @@ -140,10 +157,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { bucket_found = true; break; } else { - if (ucomp_->Compare(user_key, is_last_level_file_ - ? Slice(kvs_[(*buckets)[hash_val].vector_idx].first) - : ExtractUserKey( - kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { + if (ucomp_->Compare(user_key, + GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -183,10 +198,10 @@ Status CuckooTableBuilder::Finish() { std::vector buckets; Status s; std::string unused_bucket; - if (!kvs_.empty()) { + if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. if (use_module_hash_) { - hash_table_size_ = kvs_.size() / max_hash_table_ratio_; + hash_table_size_ = num_entries_ / max_hash_table_ratio_; } s = MakeHashTable(&buckets); if (!s.ok()) { @@ -224,14 +239,13 @@ Status CuckooTableBuilder::Finish() { AppendInternalKey(&unused_bucket, ikey); } } - properties_.num_entries = kvs_.size(); - properties_.fixed_key_len = unused_bucket.size(); - uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); - uint32_t bucket_size = value_length + properties_.fixed_key_len; + properties_.num_entries = num_entries_; + properties_.fixed_key_len = key_size_; properties_.user_collected_properties[ CuckooTablePropertyNames::kValueLength].assign( - reinterpret_cast(&value_length), sizeof(value_length)); + reinterpret_cast(&value_size_), sizeof(value_size_)); + uint64_t bucket_size = key_size_ + value_size_; unused_bucket.resize(bucket_size, 'a'); // Write the table. uint32_t num_added = 0; @@ -240,9 +254,9 @@ Status CuckooTableBuilder::Finish() { s = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(kvs_[bucket.vector_idx].first); + s = file_->Append(GetKey(bucket.vector_idx)); if (s.ok()) { - s = file_->Append(kvs_[bucket.vector_idx].second); + s = file_->Append(GetValue(bucket.vector_idx)); } } if (!s.ok()) { @@ -251,7 +265,7 @@ Status CuckooTableBuilder::Finish() { } assert(num_added == NumEntries()); properties_.raw_key_size = num_added * properties_.fixed_key_len; - properties_.raw_value_size = num_added * value_length; + properties_.raw_value_size = num_added * value_size_; uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; @@ -330,19 +344,18 @@ void CuckooTableBuilder::Abandon() { } uint64_t CuckooTableBuilder::NumEntries() const { - return kvs_.size(); + return num_entries_; } uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (kvs_.size() == 0) { + } else if (num_entries_ == 0) { return 0; } if (use_module_hash_) { - return (kvs_[0].first.size() + kvs_[0].second.size()) * kvs_.size() / - max_hash_table_ratio_; + return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_; } else { // Account for buckets being a power of two. // As elements are added, file size remains constant for a while and @@ -350,11 +363,10 @@ uint64_t CuckooTableBuilder::FileSize() const { // only after it exceeds the file limit, we account for the extra element // being added here. 
uint64_t expected_hash_table_size = hash_table_size_; - if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { + if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) { expected_hash_table_size *= 2; } - return (kvs_[0].first.size() + kvs_[0].second.size()) * - expected_hash_table_size - 1; + return (key_size_ + value_size_) * expected_hash_table_size - 1; } } @@ -390,7 +402,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( // of the method. We store this number into the nodes that we explore in // current method call. // It is unlikely for the increment operation to overflow because the maximum - // no. of times this will be called is <= max_num_hash_func_ + kvs_.size(). + // no. of times this will be called is <= max_num_hash_func_ + num_entries_. for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bucket_id = hash_vals[hash_cnt]; (*buckets)[bucket_id].make_space_for_key_call_id = @@ -408,9 +420,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { - uint64_t child_bucket_id = CuckooHash( - (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first : - ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), + uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_, get_slice_hash_); // Iterate inside Cuckoo Block. diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index d5fe3f5dc..b1d7e649c 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -75,6 +75,10 @@ class CuckooTableBuilder: public TableBuilder { uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); + inline Slice GetKey(uint64_t idx) const; + inline Slice GetUserKey(uint64_t idx) const; + inline Slice GetValue(uint64_t idx) const; + uint32_t num_hash_func_; WritableFile* file_; const double max_hash_table_ratio_; @@ -83,10 +87,17 @@ class CuckooTableBuilder: public TableBuilder { const uint32_t cuckoo_block_size_; uint64_t hash_table_size_; bool is_last_level_file_; + bool has_seen_first_key_; + uint64_t key_size_; + uint64_t value_size_; + // A list of fixed-size key-value pairs concatenating into a string. + // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific + // key / value given an index + std::string kvs_; + // Number of key-value pairs stored in kvs_ + uint64_t num_entries_; Status status_; - std::vector> kvs_; TableProperties properties_; - bool has_seen_first_key_; const Comparator* ucomp_; bool use_module_hash_; bool identity_as_first_hash_; From d439451fab490ce0cabc90f2880443c93d9eab12 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 25 Sep 2014 16:45:37 -0700 Subject: [PATCH 130/829] delay initialization of cuckoo table iterator Summary: cuckoo table iterator creation is quite expensive since it needs to load all data and sort them. After compaction, RocksDB creates a new iterator of the new file to make sure it is in good state. That makes the DB creation quite slow. Delay the iterator db sort to the seek time to speed it up. 
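A condensed sketch of the deferred initialization, with member names abbreviated rather than matching the actual iterator:

    // Construction stays cheap; the bucket scan and sort run only when the
    // iterator is first positioned.
    #include <cstdint>
    #include <vector>

    class LazySortedIterator {
     public:
      void SeekToFirst() {
        InitIfNeeded();  // pay the O(n log n) cost on first use, not in ctor
        curr_ = 0;
      }
      // Seek() and SeekToLast() call InitIfNeeded() the same way, so an
      // iterator that is created but never positioned (such as the
      // post-compaction validity check described above) never sorts at all.

     private:
      void InitIfNeeded() {
        if (initialized_) {
          return;
        }
        LoadAndSortBucketIds();
        initialized_ = true;
      }
      void LoadAndSortBucketIds() {
        // scan the file for occupied buckets, then sort sorted_ids_ by key
      }

      bool initialized_ = false;
      uint64_t curr_ = 0;
      std::vector<uint32_t> sorted_ids_;
    };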
Test Plan: db_bench Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23775 --- table/cuckoo_table_reader.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 8c3f58eac..af6fe6e88 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -191,7 +191,7 @@ class CuckooTableIterator : public Iterator { Slice key() const override; Slice value() const override; Status status() const override { return status_; } - void LoadKeysFromReader(); + void InitIfNeeded(); private: struct BucketComparator { @@ -224,6 +224,7 @@ class CuckooTableIterator : public Iterator { const BucketComparator bucket_comparator_; void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; + bool initialized_; Status status_; // Contains a map of keys to bucket_id sorted in key order. std::vector sorted_bucket_ids_; @@ -240,13 +241,17 @@ CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) : bucket_comparator_(reader->file_data_, reader->ucomp_, reader->bucket_length_, reader->user_key_length_), reader_(reader), + initialized_(false), curr_key_idx_(kInvalidIndex) { sorted_bucket_ids_.clear(); curr_value_.clear(); curr_key_.Clear(); } -void CuckooTableIterator::LoadKeysFromReader() { +void CuckooTableIterator::InitIfNeeded() { + if (initialized_) { + return; + } sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; assert(num_buckets < kInvalidIndex); @@ -262,19 +267,23 @@ void CuckooTableIterator::LoadKeysFromReader() { std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), bucket_comparator_); curr_key_idx_ = kInvalidIndex; + initialized_ = true; } void CuckooTableIterator::SeekToFirst() { + InitIfNeeded(); curr_key_idx_ = 0; PrepareKVAtCurrIdx(); } void CuckooTableIterator::SeekToLast() { + InitIfNeeded(); curr_key_idx_ = sorted_bucket_ids_.size() - 1; PrepareKVAtCurrIdx(); } void CuckooTableIterator::Seek(const Slice& target) { + InitIfNeeded(); const BucketComparator seek_comparator( reader_->file_data_, reader_->ucomp_, reader_->bucket_length_, reader_->user_key_length_, @@ -362,9 +371,6 @@ Iterator* CuckooTableReader::NewIterator( auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); iter = new (iter_mem) CuckooTableIterator(this); } - if (iter->status().ok()) { - iter->LoadKeysFromReader(); - } return iter; } From 5340484266df3e07544b6f4aa82f30dbbdc39af2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 26 Sep 2014 10:35:12 +0200 Subject: [PATCH 131/829] Built-in comparator(s) in RocksJava Extended Built-in comparators with ReverseBytewiseComparator. Reverse key handling is under certain conditions essential. E.g. while using timestamp versioned data. As native-comparators were not available using JAVA-API. Both built-in comparators were exposed via JNI to be set upon database creation time. --- include/rocksdb/comparator.h | 4 ++++ java/org/rocksdb/Options.java | 23 +++++++++++++++++++++++ java/rocksjni/options.cc | 18 ++++++++++++++++++ util/comparator.cc | 23 ++++++++++++++++++++++- 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index f3a8499a8..8e7366752 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -62,6 +62,10 @@ class Comparator { // must not be deleted. 
extern const Comparator* BytewiseComparator(); +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 8446136f8..7ccc74834 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -18,6 +18,14 @@ public class Options extends RocksObject { } static final long DEFAULT_CACHE_SIZE = 8 << 20; static final int DEFAULT_NUM_SHARD_BITS = -1; + + /** + * Builtin RocksDB comparators + */ + public enum BuiltinComparator { + BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR; + } + /** * Construct options for opening a RocksDB. * @@ -78,6 +86,21 @@ public class Options extends RocksObject { return createIfMissing(nativeHandle_); } + /** + * Set BuiltinComparator to be used with RocksDB. + * + * Note: Comparator can be set once upon database creation. + * + * Default: BytewiseComparator. + * @param builtinComparator a BuiltinComparator type. + */ + public void setBuiltinComparator(BuiltinComparator builtinComparator) { + assert(isInitialized()); + setBuiltinComparator(nativeHandle_, builtinComparator.ordinal()); + } + + private native void setBuiltinComparator(long handle, int builtinComparator); + /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file. diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 2dc2ffdc8..50416ef81 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -22,6 +22,7 @@ #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" #include "rocksdb/rate_limiter.h" +#include "rocksdb/comparator.h" /* * Class: org_rocksdb_Options @@ -63,6 +64,23 @@ jboolean Java_org_rocksdb_Options_createIfMissing( return reinterpret_cast(jhandle)->create_if_missing; } +/* + * Class: org_rocksdb_Options + * Method: useReverseBytewiseComparator + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBuiltinComparator( + JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { + switch (builtinComparator){ + case 1: + reinterpret_cast(jhandle)->comparator = rocksdb::ReverseBytewiseComparator(); + break; + default: + reinterpret_cast(jhandle)->comparator = rocksdb::BytewiseComparator(); + break; + } +} + /* * Class: org_rocksdb_Options * Method: setWriteBufferSize diff --git a/util/comparator.cc b/util/comparator.cc index adeacac0a..d77d43117 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -69,13 +69,29 @@ class BytewiseComparatorImpl : public Comparator { // *key is a run of 0xffs. Leave it alone. 
} }; -} // namespace + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return -a.compare(b); + } +}; + +}// namespace static port::OnceType once = LEVELDB_ONCE_INIT; static const Comparator* bytewise; +static const Comparator* rbytewise; static void InitModule() { bytewise = new BytewiseComparatorImpl; + rbytewise= new ReverseBytewiseComparatorImpl; } const Comparator* BytewiseComparator() { @@ -83,4 +99,9 @@ const Comparator* BytewiseComparator() { return bytewise; } +const Comparator* ReverseBytewiseComparator() { + port::InitOnce(&once, InitModule); + return rbytewise; +} + } // namespace rocksdb From 9db13987b1095fddbc18c328b163332ed295dda9 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Fri, 26 Sep 2014 13:57:12 -0700 Subject: [PATCH 132/829] Update RocksDB's Java bindings to support multiple native RocksDB builds in the same Jar file. Cross build RocksDB for linux32 and linux64 using Vagrant. Build a cross-platform fat jar that contains osx, linux32, and linux64 RocksDB static builds. --- Makefile | 14 ++++++++++--- java/Makefile | 8 ++++++-- java/crossbuild/README.md | 0 java/crossbuild/Vagrantfile | 25 +++++++++++++++++++++++ java/crossbuild/build-linux.sh | 11 ++++++++++ java/org/rocksdb/NativeLibraryLoader.java | 6 +++--- java/org/rocksdb/util/Environment.java | 14 +++++++++++-- 7 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 java/crossbuild/README.md create mode 100644 java/crossbuild/Vagrantfile create mode 100755 java/crossbuild/build-linux.sh diff --git a/Makefile b/Makefile index 9d626e17f..9fc1fe6de 100644 --- a/Makefile +++ b/Makefile @@ -270,6 +270,7 @@ clean: -rm -rf ios-x86/* ios-arm/* -find . -name "*.[oda]" -exec rm {} \; -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + -rm -rf bzip2* snappy* zlib* tags: ctags * -R cscope -b `find . -name '*.cc'` `find . 
-name '*.h'` @@ -510,11 +511,14 @@ ldb: tools/ldb.o $(LIBOBJECTS) JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux -ROCKSDBJNILIB = librocksdbjni.so -ROCKSDB_JAR = rocksdbjni.jar +ARCH := $(shell getconf LONG_BIT) +ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so +ROCKSDB_JAR = rocksdbjni-linux$(ARCH).jar +ROCKSDB_JAR_ALL = rocksdbjni-all.jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDBJNILIB = librocksdbjni.jnilib +ROCKSDBJNILIB = librocksdbjni-osx.jnilib +ROCKSDB_JAR = rocksdbjni-osx.jar JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif @@ -549,6 +553,10 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) +rocksdbjavastaticrelease: rocksdbjavastatic + cd java/crossbuild && vagrant destroy -f && vagrant up + cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 cd java;$(MAKE) java; diff --git a/java/Makefile b/java/Makefile index b2f3674f0..1b854755b 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,12 +1,16 @@ NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig NATIVE_INCLUDE = ./include -ROCKSDB_JAR = rocksdbjni.jar +ARCH := $(shell getconf LONG_BIT) +ROCKSDB_JAR = rocksdbjni-linux$(ARCH).jar + +ifeq ($(PLATFORM), OS_MACOSX) +ROCKSDB_JAR = rocksdbjni-osx.jar +endif clean: -find . -name "*.class" -exec rm {} \; -find . -name "hs*.log" -exec rm {} \; - rm -f $(ROCKSDB_JAR) java: javac org/rocksdb/util/*.java org/rocksdb/*.java diff --git a/java/crossbuild/README.md b/java/crossbuild/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile new file mode 100644 index 000000000..47e76b7da --- /dev/null +++ b/java/crossbuild/Vagrantfile @@ -0,0 +1,25 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 
+VAGRANTFILE_API_VERSION = "2" + +Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| + + config.vm.define "linux32" do |linux32| + linux32.vm.box = "ubuntu/trusty32" + linux32.vm.provision :shell, path: "build-linux.sh" + linux32.vm.synced_folder "../..", "/rocksdb" + end + + config.vm.define "linux64" do |linux64| + linux64.vm.box = "ubuntu/trusty64" + linux64.vm.provision :shell, path: "build-linux.sh" + linux64.vm.synced_folder "../..", "/rocksdb" + end + + config.vm.provider "virtualbox" do |v| + v.memory = 2048 + v.cpus = 4 + end +end diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh new file mode 100755 index 000000000..37b808140 --- /dev/null +++ b/java/crossbuild/build-linux.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# install all required packages for rocksdb +sudo apt-get update +sudo apt-get -y install git make gcc g++ libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev default-jdk + +# set java home so we can build rocksdb jars +export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) +cd /rocksdb +make jclean clean -j 4 rocksdbjavastatic +sudo shutdown -h now + diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 440056582..367c4dc5b 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -1,16 +1,16 @@ package org.rocksdb; import java.io.*; - +import org.rocksdb.util.Environment; /** * This class is used to load the RocksDB shared library from within the jar. * The shared library is extracted to a temp folder and loaded from there. */ public class NativeLibraryLoader { - private static String sharedLibraryName = "librocksdbjni.so"; + private static String sharedLibraryName = Environment.getJniLibraryName("rockdsb"); private static String tempFilePrefix = "librocksdbjni"; - private static String tempFileSuffix = ".so"; + private static String tempFileSuffix = "." + Environment.getJniLibraryExtension(); public static void loadLibraryFromJar(String tmpDir) throws IOException { diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java index c2e3bc088..1158908a2 100644 --- a/java/org/rocksdb/util/Environment.java +++ b/java/org/rocksdb/util/Environment.java @@ -2,6 +2,7 @@ package org.rocksdb.util; public class Environment { private static String OS = System.getProperty("os.name").toLowerCase(); + private static String ARCH = System.getProperty("os.arch").toLowerCase(); public static boolean isWindows() { return (OS.indexOf("win") >= 0); @@ -17,6 +18,10 @@ public class Environment { OS.indexOf("aix") >= 0); } + public static boolean is64Bit() { + return (ARCH.indexOf("64") > 0); + } + public static String getSharedLibraryName(String name) { if (isUnix()) { return String.format("lib%s.so", name); @@ -28,10 +33,15 @@ public class Environment { public static String getJniLibraryName(String name) { if (isUnix()) { - return String.format("lib%s.so", name); + String arch = (is64Bit()) ? "64" : "32"; + return String.format("lib%s-linux%s.so", name, arch); } else if (isMac()) { - return String.format("lib%s.jnilib", name); + return String.format("lib%s-osx.jnilib", name); } throw new UnsupportedOperationException(); } + + public static String getJniLibraryExtension() { + return (isMac()) ? 
".jnilib" : ".so"; + } } From 82a8f43ccc99f47b351b9d8d13b69d40ab84f97c Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Fri, 26 Sep 2014 14:58:33 -0700 Subject: [PATCH 133/829] Document RELEASE.mdgit status --- java/RELEASE.md | 29 +++++++++++++++++++++++++++++ java/crossbuild/README.md | 0 java/crossbuild/build-linux.sh | 3 ++- 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 java/RELEASE.md delete mode 100644 java/crossbuild/README.md diff --git a/java/RELEASE.md b/java/RELEASE.md new file mode 100644 index 000000000..5df6ad353 --- /dev/null +++ b/java/RELEASE.md @@ -0,0 +1,29 @@ +## Cross-building + +To build RocksDB as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. + +Building a cross-platform JAR requires: + + * [Vagrant](https://www.vagrantup.com/) + * [Virtualbox](https://www.virtualbox.org/) + * A Mac OSX machine + +Once you have these items, run this make command from RocksDB's root source directory: + + make jclean clean rocksdbjavastaticrelease + +This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu images to build RocksDB for both 32-bit and 64-bit Linux. + +You can find all native binaries and JARs in the java directory upon completion: + + librocksdbjni-linux32.so + librocksdbjni-linux64.so + librocksdbjni-osx.jnilib + rocksdbjni-all.jar + rocksdbjni-linux32.jar + rocksdbjni-linux64.jar + rocksdbjni-osx.jar + +## Maven publication + +TODO diff --git a/java/crossbuild/README.md b/java/crossbuild/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh index 37b808140..7d5831510 100755 --- a/java/crossbuild/build-linux.sh +++ b/java/crossbuild/build-linux.sh @@ -6,6 +6,7 @@ sudo apt-get -y install git make gcc g++ libgflags-dev libsnappy-dev zlib1g-dev # set java home so we can build rocksdb jars export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) cd /rocksdb -make jclean clean -j 4 rocksdbjavastatic +make jclean clean +make -j 4 rocksdbjavastatic sudo shutdown -h now From 4e735bb7f970012a2b045511a216acd76b0f3df0 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Fri, 26 Sep 2014 15:41:28 -0700 Subject: [PATCH 134/829] Rsync files to VM rather than sync folders, since sync folders was causing clock skew and confusig make. 
--- .gitignore | 1 + java/crossbuild/Vagrantfile | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 99a7d61d6..e9fdf1368 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ tags java/*.log java/include/org_rocksdb_*.h unity.cc +java/crossbuild/.vagrant diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile index 47e76b7da..634271dc3 100644 --- a/java/crossbuild/Vagrantfile +++ b/java/crossbuild/Vagrantfile @@ -9,17 +9,17 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.define "linux32" do |linux32| linux32.vm.box = "ubuntu/trusty32" linux32.vm.provision :shell, path: "build-linux.sh" - linux32.vm.synced_folder "../..", "/rocksdb" end config.vm.define "linux64" do |linux64| linux64.vm.box = "ubuntu/trusty64" linux64.vm.provision :shell, path: "build-linux.sh" - linux64.vm.synced_folder "../..", "/rocksdb" end config.vm.provider "virtualbox" do |v| v.memory = 2048 v.cpus = 4 end + + config.vm.synced_folder "../..", "/rocksdb", type: "rsync" end From 389edb6b1bd8ead82090feff7a723e26ba8d001e Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 26 Sep 2014 14:15:09 -0700 Subject: [PATCH 135/829] universal compaction picker: use double for potential overflow Summary: There is a possible overflow case in universal compaction picker. Use double to make the logic straight-forward Test Plan: make all check Reviewers: yhchiang, igor, MarkCallaghan, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23817 --- db/compaction_picker.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 7cd965c20..eb434eeac 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -746,15 +746,15 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // default kCompactionStopStyleTotalSize; with // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. - uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->fd.GetFileSize()) { + double sz = candidate_size * (100.0 + ratio) / 100.0; + if (sz < static_cast(f->fd.GetFileSize())) { break; } if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. - sz = (f->fd.GetFileSize() * (100L + ratio)) / 100; - if (sz < candidate_size) { + sz = (f->fd.GetFileSize() * (100.0 + ratio)) / 100.0; + if (sz < static_cast(candidate_size)) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer // loop. If it's some lonely straggler, it'll eventually get picked From b8e26615aac81f85046380c3420cd823009d0941 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Fri, 26 Sep 2014 18:27:32 -0700 Subject: [PATCH 136/829] since we're not sharing folders with the vm, copy built .so files and jars back to host system. --- java/RELEASE.md | 4 ++-- java/crossbuild/Vagrantfile | 1 + java/crossbuild/build-linux.sh | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/java/RELEASE.md b/java/RELEASE.md index 5df6ad353..6b5eaf5af 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -1,12 +1,12 @@ ## Cross-building -To build RocksDB as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. 
+RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. Building a cross-platform JAR requires: * [Vagrant](https://www.vagrantup.com/) * [Virtualbox](https://www.virtualbox.org/) - * A Mac OSX machine + * A Mac OSX machine that can compile RocksDB. Once you have these items, run this make command from RocksDB's root source directory: diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile index 634271dc3..ed591be71 100644 --- a/java/crossbuild/Vagrantfile +++ b/java/crossbuild/Vagrantfile @@ -21,5 +21,6 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| v.cpus = 4 end + config.vm.synced_folder "../", "/rocksdb-build" config.vm.synced_folder "../..", "/rocksdb", type: "rsync" end diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh index 7d5831510..75edac526 100755 --- a/java/crossbuild/build-linux.sh +++ b/java/crossbuild/build-linux.sh @@ -8,5 +8,7 @@ export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) cd /rocksdb make jclean clean make -j 4 rocksdbjavastatic +cp /rocksdb/java/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/rocksdbjni-* /rocksdb-build sudo shutdown -h now From 8b8011a68ca80fc2523447302e27ceb912af9771 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 27 Sep 2014 10:06:13 +0200 Subject: [PATCH 137/829] Changed name of ReverseBytewiseComparator based on review comment --- util/comparator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/comparator.cc b/util/comparator.cc index d77d43117..bbf0262f0 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -75,7 +75,7 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { ReverseBytewiseComparatorImpl() { } virtual const char* Name() const { - return "leveldb.ReverseBytewiseComparator"; + return "rocksdb.ReverseBytewiseComparator"; } virtual int Compare(const Slice& a, const Slice& b) const { From 2dc6f62bb9205d5b071757d72bd28bbb77ab0745 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 10:25:21 -0700 Subject: [PATCH 138/829] handle kDelete type in cuckoo builder Summary: when I changed std::vector to std::string to store key/value pairs in builder, I missed the handling for kDeletion type. As a result, value_size_ can be wrong if the first add key is for deletion. The is captured by ./cuckoo_table_db_test Test Plan: ./cuckoo_table_db_test ./cuckoo_table_reader_test ./cuckoo_table_builder_test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24045 --- table/cuckoo_table_builder.cc | 59 +++++++++++++++++++++++++++++++---- table/cuckoo_table_builder.h | 7 ++++- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index a66e9899e..6ff1fa0cf 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -60,9 +60,11 @@ CuckooTableBuilder::CuckooTableBuilder( hash_table_size_(use_module_hash ? 
0 : 2), is_last_level_file_(false), has_seen_first_key_(false), + has_seen_first_value_(false), key_size_(0), value_size_(0), num_entries_(0), + num_values_(0), ucomp_(user_comparator), use_module_hash_(use_module_hash), identity_as_first_hash_(identity_as_first_hash), @@ -84,6 +86,12 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { status_ = Status::Corruption("Unable to parse key into inernal key."); return; } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + std::to_string(ikey.type)); + return; + } + // Determine if we can ignore the sequence number and value type from // internal keys by looking at sequence number from first key. We assume // that if first key has a zero sequence number, then all the remaining @@ -94,16 +102,38 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); - value_size_ = value.size(); + } + if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; } // Even if one sequence number is non-zero, then it is not last level. assert(!is_last_level_file_ || ikey.sequence == 0); - if (is_last_level_file_) { - kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; } else { - kvs_.append(key.data(), key.size()); + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } } - kvs_.append(value.data(), value.size()); ++num_entries_; // In order to fill the empty buckets in the hash table, we identify a @@ -123,15 +153,30 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } } +bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { + assert(closed_); + return idx >= num_values_; +} + Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + } return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + assert(closed_); return is_last_level_file_ ? 
GetKey(idx) : ExtractUserKey(GetKey(idx)); } Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + static std::string empty_value(value_size_, 'a'); + return Slice(empty_value); + } return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); } @@ -256,7 +301,9 @@ Status CuckooTableBuilder::Finish() { ++num_added; s = file_->Append(GetKey(bucket.vector_idx)); if (s.ok()) { - s = file_->Append(GetValue(bucket.vector_idx)); + if (value_size_ > 0) { + s = file_->Append(GetValue(bucket.vector_idx)); + } } } if (!s.ok()) { diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index b1d7e649c..6898c1ef6 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -75,6 +75,7 @@ class CuckooTableBuilder: public TableBuilder { uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); + inline bool IsDeletedKey(uint64_t idx) const; inline Slice GetKey(uint64_t idx) const; inline Slice GetUserKey(uint64_t idx) const; inline Slice GetValue(uint64_t idx) const; @@ -88,14 +89,18 @@ class CuckooTableBuilder: public TableBuilder { uint64_t hash_table_size_; bool is_last_level_file_; bool has_seen_first_key_; + bool has_seen_first_value_; uint64_t key_size_; uint64_t value_size_; // A list of fixed-size key-value pairs concatenating into a string. // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific // key / value given an index std::string kvs_; - // Number of key-value pairs stored in kvs_ + std::string deleted_keys_; + // Number of key-value pairs stored in kvs_ + number of deleted keys uint64_t num_entries_; + // Number of keys that contain value (non-deletion op) + uint64_t num_values_; Status status_; TableProperties properties_; const Comparator* ucomp_; From c4519c777f7dcd996c5bd8d726877bda3f0e65ac Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 29 Sep 2014 10:42:00 -0700 Subject: [PATCH 139/829] fix mis-named jar in JNI loader --- java/org/rocksdb/NativeLibraryLoader.java | 2 +- java/org/rocksdb/util/Environment.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 367c4dc5b..26a26bbca 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -8,7 +8,7 @@ import org.rocksdb.util.Environment; * The shared library is extracted to a temp folder and loaded from there. */ public class NativeLibraryLoader { - private static String sharedLibraryName = Environment.getJniLibraryName("rockdsb"); + private static String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); private static String tempFilePrefix = "librocksdbjni"; private static String tempFileSuffix = "." 
+ Environment.getJniLibraryExtension(); diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java index 1158908a2..7bb42ace7 100644 --- a/java/org/rocksdb/util/Environment.java +++ b/java/org/rocksdb/util/Environment.java @@ -24,9 +24,9 @@ public class Environment { public static String getSharedLibraryName(String name) { if (isUnix()) { - return String.format("lib%s.so", name); + return String.format("lib%sjni.so", name); } else if (isMac()) { - return String.format("lib%s.dylib", name); + return String.format("lib%sjni.dylib", name); } throw new UnsupportedOperationException(); } @@ -34,9 +34,9 @@ public class Environment { public static String getJniLibraryName(String name) { if (isUnix()) { String arch = (is64Bit()) ? "64" : "32"; - return String.format("lib%s-linux%s.so", name, arch); + return String.format("lib%sjni-linux%s.so", name, arch); } else if (isMac()) { - return String.format("lib%s-osx.jnilib", name); + return String.format("lib%sjni-osx.jnilib", name); } throw new UnsupportedOperationException(); } From 983d2de2de3468476c855be0ce405bd07dc31643 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 10:52:18 -0700 Subject: [PATCH 140/829] Add AUTHORS file. Fix #203 --- AUTHORS | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 AUTHORS diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..e644f5530 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,11 @@ +Facebook Inc. +Facebook Engineering Team + +Google Inc. +# Initial version authors: +Jeffrey Dean +Sanjay Ghemawat + +# Partial list of contributors: +Kevin Regan +Johan Bilien From 6a64ea6171518ff09dbbee5fb5e94a90b4e63471 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 29 Sep 2014 10:57:38 -0700 Subject: [PATCH 141/829] add note about java 7 --- java/RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/java/RELEASE.md b/java/RELEASE.md index 6b5eaf5af..cc35dc33c 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -7,6 +7,7 @@ Building a cross-platform JAR requires: * [Vagrant](https://www.vagrantup.com/) * [Virtualbox](https://www.virtualbox.org/) * A Mac OSX machine that can compile RocksDB. + * Java 7 set as JAVA_HOME. Once you have these items, run this make command from RocksDB's root source directory: From 2faf49d5f15dc2e1c1bc79e84cee585631632019 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 11:09:09 -0700 Subject: [PATCH 142/829] use GetContext to replace callback function pointer Summary: Intead of passing callback function pointer and its arg on Table::Get() interface, passing GetContext. This makes the interface cleaner and possible better perf. 
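To make the interface change concrete, here is a minimal self-contained sketch (not the actual RocksDB classes or signatures): the reader no longer receives a void* argument plus saver and mark_key_may_exist function pointers; it drives a context object whose SaveValue() return value tells it whether to keep scanning, and the caller inspects the final state afterwards.

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Simplified stand-in for rocksdb::GetContext: it owns the lookup state
    // that the old interface threaded through a void* and two callbacks.
    class GetContextSketch {
     public:
      enum State { kNotFound, kFound };

      explicit GetContextSketch(std::string user_key)
          : state_(kNotFound), user_key_(std::move(user_key)) {}

      // Called by the reader for each entry it visits; the bool return says
      // whether scanning should continue (mirrors GetContext::SaveValue()
      // in the patch below).
      bool SaveValue(const std::string& key, const std::string& value) {
        if (key != user_key_) {
          return true;   // not our key; keep scanning (a real reader seeks first)
        }
        state_ = kFound;
        value_ = value;
        return false;    // plain value found; nothing more to do for this key
      }

      State state() const { return state_; }
      const std::string& value() const { return value_; }

     private:
      State state_;
      std::string user_key_;
      std::string value_;
    };

    // Stand-in for TableReader::Get(): it only feeds entries into the
    // context; interpreting the outcome is left to the caller, as
    // Version::Get does in the real code.
    void TableGet(const std::vector<std::pair<std::string, std::string>>& table,
                  GetContextSketch* get_context) {
      for (const auto& kv : table) {
        if (!get_context->SaveValue(kv.first, kv.second)) {
          break;
        }
      }
    }

    int main() {
      std::vector<std::pair<std::string, std::string>> table = {{"a", "1"},
                                                                {"b", "2"}};
      GetContextSketch get_context("b");
      TableGet(table, &get_context);
      if (get_context.state() == GetContextSketch::kFound) {
        std::cout << "found: " << get_context.value() << std::endl;  // found: 2
      }
      return 0;
    }

The real GetContext added by this patch additionally carries the user comparator, merge operator, logger and statistics, tracks the kDeleted/kCorrupt/kMerge states, and exposes MarkKeyMayExist() in place of the old mark_key_may_exist callback.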
Also adding a fast pass for SaveValue() Test Plan: make all check Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24057 --- Makefile | 12 +- db/simple_table_db_test.cc | 815 -------------------- db/table_cache.cc | 11 +- db/table_cache.h | 7 +- db/version_set.cc | 114 +-- db/version_set.h | 22 - table/block_based_table_reader.cc | 12 +- table/block_based_table_reader.h | 7 +- table/cuckoo_table_reader.cc | 14 +- table/cuckoo_table_reader.h | 8 +- table/cuckoo_table_reader_test.cc | 108 +-- table/get_context.cc | 101 +++ table/get_context.h | 47 ++ table/plain_table_reader.cc | 10 +- table/plain_table_reader.h | 7 +- table/table_reader.h | 21 +- table/table_reader_bench.cc | 16 +- table/table_test.cc | 6 +- utilities/compacted_db/compacted_db_impl.cc | 50 +- utilities/compacted_db/compacted_db_impl.h | 3 - 20 files changed, 275 insertions(+), 1116 deletions(-) delete mode 100644 db/simple_table_db_test.cc create mode 100644 table/get_context.cc create mode 100644 table/get_context.h diff --git a/Makefile b/Makefile index 9d626e17f..75da74b08 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,6 @@ TESTS = \ reduce_levels_test \ plain_table_db_test \ prefix_test \ - simple_table_db_test \ skiplist_test \ stringappend_test \ ttl_test \ @@ -371,9 +370,6 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg @@ -523,11 +519,11 @@ libz.a: curl -O http://zlib.net/zlib-1.2.8.tar.gz tar xvzf zlib-1.2.8.tar.gz cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make - cp zlib-1.2.8/libz.a . + cp zlib-1.2.8/libz.a . libbz2.a: -rm -rf bzip2-1.0.6 - curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz + curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz tar xvzf bzip2-1.0.6.tar.gz cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' cp bzip2-1.0.6/libbz2.a . @@ -539,7 +535,7 @@ libsnappy.a: cd snappy-1.1.1 && ./configure --with-pic --enable-static cd snappy-1.1.1 && make cp snappy-1.1.1/.libs/libsnappy.a . - + rocksdbjavastatic: libz.a libbz2.a libsnappy.a OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j @@ -547,7 +543,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc deleted file mode 100644 index 0a0ecf064..000000000 --- a/db/simple_table_db_test.cc +++ /dev/null @@ -1,815 +0,0 @@ -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "rocksdb/statistics.h" -#include "rocksdb/cache.h" -#include "rocksdb/compaction_filter.h" -#include "rocksdb/env.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/table_builder.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "utilities/merge_operators.h" - -using std::unique_ptr; - -// IS THIS FILE STILL NEEDED? -namespace rocksdb { - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. -// SimpleTable requires the input key size to be fixed 16 bytes, value cannot -// be longer than 150000 bytes and stored data on disk in this format: -// +--------------------------------------------+ <= key1 offset -// | key1 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value1 | -// | | -// +----------------------------------------+---+ <= key2 offset -// | key2 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value2 | -// | | -// | ...... | -// +-----------------+--------------------------+ <= index_block_offset -// | key1 | key1 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key2 | key2 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key3 | key3 offset (8 bytes) | -// +-----------------+--------------------------+ -// | ...... | -// +-----------------+------------+-------------+ -// | index_block_offset (8 bytes) | -// +------------------------------+ - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. -class SimpleTableReader: public TableReader { -public: - // Attempt to open the table that is stored in bytes [0..file_size) - // of "file", and read the metadata entries necessary to allow - // retrieving data from the table. - // - // If successful, returns ok and sets "*table" to the newly opened - // table. The client should delete "*table" when no longer needed. - // If there was an error while initializing the table, sets "*table" - // to nullptr and returns a non-ok status. Does not take ownership of - // "*source", but the client must ensure that "source" remains live - // for the duration of the returned table's lifetime. - // - // *file must remain live while this Table is in use. 
- static Status Open(const ImmutableCFOptions& options, - const EnvOptions& env_options, - unique_ptr && file, uint64_t file_size, - unique_ptr* table_reader); - - Iterator* NewIterator(const ReadOptions&, Arena* arena) override; - - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr) override; - - uint64_t ApproximateOffsetOf(const Slice& key) override; - - virtual size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override; - - std::shared_ptr GetTableProperties() const override; - - ~SimpleTableReader(); - -private: - struct Rep; - Rep* rep_; - - explicit SimpleTableReader(Rep* rep) { - rep_ = rep; - } - friend class TableCache; - friend class SimpleTableIterator; - - Status GetOffset(const Slice& target, uint64_t* offset); - - // No copying allowed - explicit SimpleTableReader(const TableReader&) = delete; - void operator=(const TableReader&) = delete; -}; - -// Iterator to iterate SimpleTable -class SimpleTableIterator: public Iterator { -public: - explicit SimpleTableIterator(SimpleTableReader* table); - ~SimpleTableIterator(); - - bool Valid() const; - - void SeekToFirst(); - - void SeekToLast(); - - void Seek(const Slice& target); - - void Next(); - - void Prev(); - - Slice key() const; - - Slice value() const; - - Status status() const; - -private: - SimpleTableReader* table_; - uint64_t offset_; - uint64_t next_offset_; - Slice key_; - Slice value_; - char tmp_str_[4]; - char* key_str_; - char* value_str_; - int value_str_len_; - Status status_; - // No copying allowed - SimpleTableIterator(const SimpleTableIterator&) = delete; - void operator=(const Iterator&) = delete; -}; - -struct SimpleTableReader::Rep { - ~Rep() { - } - Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - uint64_t index_start_offset, int num_entries) : - ioptions(ioptions), env_options(env_options), - index_start_offset(index_start_offset), num_entries(num_entries) { - } - - const ImmutableCFOptions& ioptions; - const EnvOptions& env_options; - Status status; - unique_ptr file; - uint64_t index_start_offset; - int num_entries; - std::shared_ptr table_properties; - - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } -}; - -SimpleTableReader::~SimpleTableReader() { - delete rep_; -} - -Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - unique_ptr && file, - uint64_t size, - unique_ptr* table_reader) { - char footer_space[Rep::offset_length]; - Slice footer_input; - Status s = file->Read(size - Rep::offset_length, Rep::offset_length, - &footer_input, footer_space); - if (s.ok()) { - uint64_t index_start_offset = DecodeFixed64(footer_space); - - int num_entries = (size - Rep::offset_length - index_start_offset) - / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep( - ioptions, env_options, index_start_offset, num_entries); - - rep->file = std::move(file); - table_reader->reset(new SimpleTableReader(rep)); - } - return s; -} - -void SimpleTableReader::SetupForCompaction() { -} - -std::shared_ptr SimpleTableReader::GetTableProperties() - const { - return rep_->table_properties; -} - -Iterator* SimpleTableReader::NewIterator(const ReadOptions& options, - Arena* 
arena) { - if (arena == nullptr) { - return new SimpleTableIterator(this); - } else { - auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator)); - return new (mem) SimpleTableIterator(this); - } -} - -Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { - uint32_t left = 0; - uint32_t right = rep_->num_entries - 1; - char key_chars[Rep::GetInternalKeyLength()]; - Slice tmp_slice; - - uint32_t target_offset = 0; - while (left <= right) { - uint32_t mid = (left + right + 1) / 2; - - uint64_t offset_to_read = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; - Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), - &tmp_slice, key_chars); - if (!s.ok()) { - return s; - } - - InternalKeyComparator ikc(rep_->ioptions.comparator); - int compare_result = ikc.Compare(tmp_slice, target); - - if (compare_result < 0) { - if (left == right) { - target_offset = right + 1; - break; - } - left = mid; - } else { - if (left == right) { - target_offset = left; - break; - } - right = mid - 1; - } - } - - if (target_offset >= (uint32_t) rep_->num_entries) { - *offset = rep_->index_start_offset; - return Status::OK(); - } - - char value_offset_chars[Rep::offset_length]; - - int64_t offset_for_value_offset = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset - + Rep::GetInternalKeyLength(); - Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, - &tmp_slice, value_offset_chars); - if (s.ok()) { - *offset = DecodeFixed64(value_offset_chars); - } - return s; -} - -Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { - Status s; - SimpleTableIterator* iter = new SimpleTableIterator(this); - for (iter->Seek(k); iter->Valid(); iter->Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter->key(), &parsed_key)) { - return Status::Corruption(Slice()); - } - - if (!(*saver)(arg, parsed_key, iter->value())) { - break; - } - } - s = iter->status(); - delete iter; - return s; -} - -uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { - return 0; -} - -SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : - table_(table) { - key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; - value_str_len_ = -1; - SeekToFirst(); -} - -SimpleTableIterator::~SimpleTableIterator() { - delete[] key_str_; - if (value_str_len_ >= 0) { - delete[] value_str_; - } -} - -bool SimpleTableIterator::Valid() const { - return offset_ < table_->rep_->index_start_offset; -} - -void SimpleTableIterator::SeekToFirst() { - next_offset_ = 0; - Next(); -} - -void SimpleTableIterator::SeekToLast() { - assert(false); -} - -void SimpleTableIterator::Seek(const Slice& target) { - Status s = table_->GetOffset(target, &next_offset_); - if (!s.ok()) { - status_ = s; - } - Next(); -} - -void SimpleTableIterator::Next() { - offset_ = next_offset_; - if (offset_ >= table_->rep_->index_start_offset) { - return; - } - Slice result; - int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); - - Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, - key_str_); - next_offset_ += internal_key_size; - key_ = result; - - Slice value_size_slice; - s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); - next_offset_ += 4; - uint32_t value_size = DecodeFixed32(tmp_str_); - 
- Slice value_slice; - if ((int) value_size > value_str_len_) { - if (value_str_len_ >= 0) { - delete[] value_str_; - } - value_str_ = new char[value_size]; - value_str_len_ = value_size; - } - s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, - value_str_); - next_offset_ += value_size; - value_ = value_slice; -} - -void SimpleTableIterator::Prev() { - assert(false); -} - -Slice SimpleTableIterator::key() const { - Log(table_->rep_->ioptions.info_log, "key!!!!"); - return key_; -} - -Slice SimpleTableIterator::value() const { - return value_; -} - -Status SimpleTableIterator::status() const { - return status_; -} - -class SimpleTableBuilder: public TableBuilder { -public: - // Create a builder that will store the contents of the table it is - // building in *file. Does not close the file. It is up to the - // caller to close the file after calling Finish(). The output file - // will be part of level specified by 'level'. A value of -1 means - // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, - CompressionType compression_type); - - // REQUIRES: Either Finish() or Abandon() has been called. - ~SimpleTableBuilder(); - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override; - - // Return non-ok iff some error has been detected. - Status status() const override; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns. - // REQUIRES: Finish(), Abandon() have not been called - Status Finish() override; - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon() override; - - // Number of calls to Add() so far. - uint64_t NumEntries() const override; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const override; - -private: - struct Rep; - Rep* rep_; - - // No copying allowed - SimpleTableBuilder(const SimpleTableBuilder&) = delete; - void operator=(const SimpleTableBuilder&) = delete; -}; - -struct SimpleTableBuilder::Rep { - const ImmutableCFOptions& ioptions; - WritableFile* file; - uint64_t offset = 0; - Status status; - - uint64_t num_entries = 0; - - bool closed = false; // Either Finish() or Abandon() has been called. 
- - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } - - std::string index; - - Rep(const ImmutableCFOptions& iopt, WritableFile* f) : - ioptions(iopt), file(f) { - } - ~Rep() { - } -}; - -SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions, - WritableFile* file, - CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(ioptions, file)) { -} - -SimpleTableBuilder::~SimpleTableBuilder() { - delete (rep_); -} - -void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { - assert((int ) key.size() == Rep::GetInternalKeyLength()); - - // Update index - rep_->index.append(key.data(), key.size()); - PutFixed64(&(rep_->index), rep_->offset); - - // Write key-value pair - rep_->file->Append(key); - rep_->offset += Rep::GetInternalKeyLength(); - - std::string size; - int value_size = value.size(); - PutFixed32(&size, value_size); - Slice sizeSlice(size); - rep_->file->Append(sizeSlice); - rep_->file->Append(value); - rep_->offset += value_size + 4; - - rep_->num_entries++; -} - -Status SimpleTableBuilder::status() const { - return Status::OK(); -} - -Status SimpleTableBuilder::Finish() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; - - uint64_t index_offset = rep_->offset; - Slice index_slice(rep_->index); - rep_->file->Append(index_slice); - rep_->offset += index_slice.size(); - - std::string index_offset_str; - PutFixed64(&index_offset_str, index_offset); - Slice foot_slice(index_offset_str); - rep_->file->Append(foot_slice); - rep_->offset += foot_slice.size(); - - return Status::OK(); -} - -void SimpleTableBuilder::Abandon() { - rep_->closed = true; -} - -uint64_t SimpleTableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t SimpleTableBuilder::FileSize() const { - return rep_->offset; -} - -class SimpleTableFactory: public TableFactory { -public: - ~SimpleTableFactory() { - } - SimpleTableFactory() { - } - const char* Name() const override { - return "SimpleTable"; - } - Status NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const; - - TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_key, - WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts) const; - - virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { - return Status::OK(); - } - - virtual std::string GetPrintableTableOptions() const override { - return std::string(); - } -}; - -Status SimpleTableFactory::NewTableReader( - const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const { - - return SimpleTableReader::Open(ioptions, env_options, std::move(file), - file_size, table_reader); -} - -TableBuilder* SimpleTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_key, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const { - return new SimpleTableBuilder(ioptions, file, compression_type); -} - -class SimpleTableDBTest { -protected: -public: - std::string dbname_; - Env* env_; - DB* db_; - 
- Options last_options_; - - SimpleTableDBTest() : - env_(Env::Default()) { - dbname_ = test::TmpDir() + "/simple_table_db_test"; - ASSERT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; - Reopen(); - } - - ~SimpleTableDBTest() { - delete db_; - ASSERT_OK(DestroyDB(dbname_, Options())); - } - - // Return the current option configuration. - Options CurrentOptions() { - Options options; - options.table_factory.reset(new SimpleTableFactory()); - return options; - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); - } - - void Close() { - delete db_; - db_ = nullptr; - } - - void DestroyAndReopen(Options* options = nullptr) { - //Destroy using last options - Destroy(&last_options_); - ASSERT_OK(TryReopen(options)); - } - - void Destroy(Options* options) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, *options)); - } - - Status PureReopen(Options* options, DB** db) { - return DB::Open(*options, dbname_, db); - } - - Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const Slice& k, const Slice& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - // Return spread of files per level - std::string FilesPerLevel() { - std::string result; - int last_non_zero_offset = 0; - for (int level = 0; level < db_->NumberLevels(); level++) { - int f = NumTableFilesAtLevel(level); - char buf[100]; - snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); - result += buf; - if (f > 0) { - last_non_zero_offset = result.size(); - } - } - result.resize(last_non_zero_offset); - return result; - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(SimpleTableDBTest, Empty) { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); -} - -TEST(SimpleTableDBTest, ReadWrite) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_EQ("v1", Get("0000000000000foo")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush2) { - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("0000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); - - ASSERT_OK(Put("0000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("0000000000000foo")); - - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); - - ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - - ASSERT_OK(Put("0000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key_______%06d", i); - return std::string(buf); -} - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -TEST(SimpleTableDBTest, CompactionTrigger) { - Options options = CurrentOptions(); - options.write_buffer_size = 100 << 10; //100KB - options.num_levels = 3; - options.max_mem_compaction_level = 0; - options.level0_file_num_compaction_trigger = 3; - Reopen(&options); - - Random rnd(301); - - for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; - num++) { - std::vector values; - // Write 120KB (12 values, each 10K) - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); - } - - //generate one more file in level-0, and should trigger level-0 compaction - std::vector values; - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForCompact(); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 1); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/db/table_cache.cc b/db/table_cache.cc index 5cb96f8bf..580e8049d 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -15,6 +15,7 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/table_reader.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -132,10 +133,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status 
TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, const Slice& k, void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + const FileDescriptor& fd, const Slice& k, + GetContext* get_context) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; @@ -147,13 +146,13 @@ Status TableCache::Get(const ReadOptions& options, } } if (s.ok()) { - s = t->Get(options, k, arg, saver, mark_key_may_exist); + s = t->Get(options, k, get_context); if (handle != nullptr) { ReleaseHandle(handle); } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set - (*mark_key_may_exist)(arg); + get_context->MarkKeyMayExist(); return Status::OK(); } return s; diff --git a/db/table_cache.h b/db/table_cache.h index 2f6740d9f..76bb1c0a2 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -27,6 +27,7 @@ namespace rocksdb { class Env; class Arena; struct FileDescriptor; +class GetContext; class TableCache { public: @@ -52,10 +53,8 @@ class TableCache { // it returns false. Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, const Slice& k, void* arg, - bool (*handle_result)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*) = nullptr); + const FileDescriptor& file_fd, const Slice& k, + GetContext* get_context); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); diff --git a/db/version_set.cc b/db/version_set.cc index 0a46d7edc..10649fa6c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -37,6 +37,7 @@ #include "table/format.h" #include "table/plain_table_factory.h" #include "table/meta_blocks.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -627,81 +628,6 @@ void Version::AddIterators(const ReadOptions& read_options, } -// Called from TableCache::Get and Table::Get when file/block in which -// key may exist are not there in TableCache/BlockCache respectively. In this -// case we can't guarantee that key does not exist and are not permitted to do -// IO to be certain.Set the status=kFound and value_found=false to let the -// caller know that key may exist but is not there in memory -void MarkKeyMayExist(void* arg) { - Version::Saver* s = reinterpret_cast(arg); - s->state = Version::kFound; - if (s->value_found != nullptr) { - *(s->value_found) = false; - } -} - -bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v) { - Version::Saver* s = reinterpret_cast(arg); - MergeContext* merge_contex = s->merge_context; - std::string merge_result; // temporary area for merge results later - - assert(s != nullptr && merge_contex != nullptr); - - // TODO: Merge? - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - if (Version::kNotFound == s->state) { - s->state = Version::kFound; - s->value->assign(v.data(), v.size()); - } else if (Version::kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = Version::kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = Version::kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeDeletion: - if (Version::kNotFound == s->state) { - s->state = Version::kDeleted; - } else if (Version::kMerge == s->state) { - s->state = Version::kFound; - if (!s->merge_operator->FullMerge(s->user_key, nullptr, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = Version::kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeMerge: - assert(s->state == Version::kNotFound || s->state == Version::kMerge); - s->state = Version::kMerge; - merge_contex->PushOperand(v); - return true; - - default: - assert(false); - break; - } - } - - // s->state could be Corrupt, merge or notfound - - return false; -} Version::Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number) @@ -756,46 +682,42 @@ void Version::Get(const ReadOptions& options, Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); - Saver saver; - saver.state = status->ok()? kNotFound : kMerge; - saver.ucmp = user_comparator_; - saver.user_key = user_key; - saver.value_found = value_found; - saver.value = value; - saver.merge_operator = merge_operator_; - saver.merge_context = merge_context; - saver.logger = info_log_; - saver.statistics = db_statistics_; + + GetContext get_context(user_comparator_, merge_operator_, info_log_, + db_statistics_, status->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + user_key, value, value_found, merge_context); FilePicker fp(files_, user_key, ikey, &file_levels_, num_non_empty_levels_, &file_indexer_, user_comparator_, internal_comparator_); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, - &saver, SaveValue, MarkKeyMayExist); + &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; } - switch (saver.state) { - case kNotFound: - break; // Keep searching in other files - case kFound: + switch (get_context.State()) { + case GetContext::kNotFound: + // Keep searching in other files + break; + case GetContext::kFound: return; - case kDeleted: - *status = Status::NotFound(); // Use empty error message for speed + case GetContext::kDeleted: + // Use empty error message for speed + *status = Status::NotFound(); return; - case kCorrupt: + case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case kMerge: + case GetContext::kMerge: break; } f = fp.GetNextFile(); } - if (kMerge == saver.state) { + if (GetContext::kMerge == get_context.State()) { if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -804,7 +726,7 @@ void Version::Get(const ReadOptions& options, // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; if (merge_operator_->FullMerge(user_key, nullptr, - saver.merge_context->GetOperands(), value, + merge_context->GetOperands(), value, info_log_)) { *status = Status::OK(); } else { diff --git a/db/version_set.h b/db/version_set.h index 9e6cc1e34..4a27a9592 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -241,28 +241,6 @@ class Version { FileMetaData* file; }; - enum SaverState { - kNotFound, - kFound, - kDeleted, - kCorrupt, - kMerge // saver contains the current merge result (the operands) - }; - - // Callback from TableCache::Get() - struct Saver { - SaverState state; - const Comparator* ucmp; - Slice user_key; - bool* value_found; // Is value set correctly? 
Used by KeyMayExist - std::string* value; - const MergeOperator* merge_operator; - // the merge operations encountered; - MergeContext* merge_context; - Logger* logger; - Statistics* statistics; - }; - private: friend class Compaction; friend class VersionSet; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 09328dc3b..4b2050e03 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -33,6 +33,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -1100,10 +1101,8 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, } Status BlockBasedTable::Get( - const ReadOptions& read_options, const Slice& key, void* handle_context, - bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { + const ReadOptions& read_options, const Slice& key, + GetContext* get_context) { Status s; auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; @@ -1141,7 +1140,7 @@ Status BlockBasedTable::Get( // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for whether // we can guarantee the key is not there when "no_io" is set - (*mark_key_may_exist_handler)(handle_context); + get_context->MarkKeyMayExist(); break; } if (!biter.status().ok()) { @@ -1156,8 +1155,7 @@ Status BlockBasedTable::Get( s = Status::Corruption(Slice()); } - if (!(*result_handler)(handle_context, parsed_key, - biter.value())) { + if (!get_context->SaveValue(parsed_key, biter.value())) { done = true; break; } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 503a91bb3..b272c4d13 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -40,6 +40,7 @@ class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; +class GetContext; using std::unique_ptr; @@ -76,11 +77,7 @@ class BlockBasedTable : public TableReader { Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; Status Get(const ReadOptions& readOptions, const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, - const ParsedInternalKey& k, const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = - nullptr) override; + GetContext* get_context) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index af6fe6e88..f8da4e288 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -19,6 +19,7 @@ #include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -126,11 +127,8 @@ CuckooTableReader::CuckooTableReader( status_ = file_->Read(0, file_size, &file_data_, nullptr); } -Status CuckooTableReader::Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { +Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& 
key, + GetContext* get_context) { assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { @@ -149,14 +147,12 @@ Status CuckooTableReader::Get( if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { Slice value(bucket + key_length_, value_length_); if (is_last_level_) { - ParsedInternalKey found_ikey( - Slice(bucket, key_length_), 0, kTypeValue); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(value); } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; ParseInternalKey(full_key, &found_ikey); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(found_ikey, value); } // We don't support merge operations. So, we return here. return Status::OK(); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 8f7635cfa..4f00a9e41 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -40,12 +40,8 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } - Status Get( - const ReadOptions& read_options, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context) override; Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; void Prepare(const Slice& target) override; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 6566b7a29..66d88fc71 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -25,6 +25,7 @@ int main() { #include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/random.h" #include "util/testharness.h" @@ -61,25 +62,6 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index, return hash_map[s.ToString()][index]; } -// Methods, variables for checking key and values read. -struct ValuesToAssert { - ValuesToAssert(const std::string& key, const Slice& value) - : expected_user_key(key), - expected_value(value), - call_count(0) {} - std::string expected_user_key; - Slice expected_value; - int call_count; -}; - -bool AssertValues(void* assert_obj, - const ParsedInternalKey& k, const Slice& v) { - ValuesToAssert *ptr = reinterpret_cast(assert_obj); - ASSERT_EQ(ptr->expected_value.ToString(), v.ToString()); - ASSERT_EQ(ptr->expected_user_key, k.user_key.ToString()); - ++ptr->call_count; - return false; -} } // namespace class CuckooReaderTest { @@ -134,11 +116,14 @@ class CuckooReaderTest { ucomp, GetSliceHash); ASSERT_OK(reader.status()); + // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { - ValuesToAssert v(user_keys[i], values[i]); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(keys[i]), &v, AssertValues, nullptr)); - ASSERT_EQ(1, v.call_count); + std::string value; + GetContext get_context(ucomp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(user_keys[i]), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context)); + ASSERT_EQ(values[i], value); } } void UpdateKeys(bool with_zero_seqno) { @@ -329,6 +314,7 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Make all hash values collide. 
AddHashLookups(user_keys[i], 0, kNumHashFunc); } + auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); @@ -337,7 +323,7 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ioptions, std::move(read_file), file_size, - BytewiseComparator(), + ucmp, GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. @@ -346,10 +332,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { AddHashLookups(not_found_user_key, 0, kNumHashFunc); ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue); AppendInternalKey(¬_found_key, ikey); - ValuesToAssert v("", ""); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + std::string value; + GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, + Slice(not_found_key), &value, nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Search for a key with an independent hash value. std::string not_found_user_key2 = "key" + NumToStr(num_items + 1); @@ -357,9 +344,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue); std::string not_found_key2; AppendInternalKey(¬_found_key2, ikey2); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key2), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context2(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(not_found_key2), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Test read when key is unused key. @@ -369,34 +358,16 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Add hash values that map to empty buckets. AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc, kNumHashFunc); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(unused_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context3(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(unused_key), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); } // Performance tests namespace { -int64_t found_count = 0; -std::string value; -bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { - // Deliberately empty. 
- if (*reinterpret_cast(k.user_key.data()) == - *reinterpret_cast(v.data())) { - ++found_count; - value.assign(v.data(), v.size()); - } - return false; -} - -bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { - ++*reinterpret_cast(cnt_ptr); - std::string expected_value; - AppendInternalKey(&expected_value, k); - ASSERT_EQ(0, v.compare(Slice(&expected_value[0], v.size()))); - return false; -} - void GetKeys(uint64_t num, std::vector* keys) { keys->clear(); IterKey k; @@ -457,13 +428,15 @@ void WriteFile(const std::vector& keys, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; + std::string value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { - int cnt = 0; - ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr)); - if (cnt != 1) { - fprintf(stderr, "%" PRIu64 " not found.\n", i); - ASSERT_EQ(1, cnt); - } + value.clear(); + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context)); + ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); } } @@ -501,7 +474,11 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { } std::random_shuffle(keys.begin(), keys.end()); - found_count = 0; + std::string value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { @@ -510,20 +487,19 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), - nullptr, DoNothing, nullptr); + &get_context); } } } else { for (uint64_t i = 0; i < num; i++) { reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), - nullptr, DoNothing, nullptr); + &get_context); } } float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; fprintf(stderr, - "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u, " - "# of found keys %" PRId64 "\n", - time_per_op, 1.0 / time_per_op, batch_size, found_count); + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", + time_per_op, 1.0 / time_per_op, batch_size); } } // namespace. diff --git a/table/get_context.cc b/table/get_context.cc new file mode 100644 index 000000000..59dfa41e6 --- /dev/null +++ b/table/get_context.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/get_context.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" + +namespace rocksdb { + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + value_(ret_value), + value_found_(value_found), + merge_context_(merge_context) { +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +void GetContext::MarkKeyMayExist() { + state_ = kFound; + if (value_found_ != nullptr) { + *value_found_ = false; + } +} + +void GetContext::SaveValue(const Slice& value) { + state_ = kFound; + value_->assign(value.data(), value.size()); +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value) { + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) { + // Key matches. Process it + switch (parsed_key.type) { + case kTypeValue: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kFound; + value_->assign(value.data(), value.size()); + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, &value, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeDeletion: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, nullptr, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + merge_context_->PushOperand(value); + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +} // namespace rocksdb diff --git a/table/get_context.h b/table/get_context.h new file mode 100644 index 000000000..a38f3c533 --- /dev/null +++ b/table/get_context.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once +#include +#include "db/merge_context.h" + +namespace rocksdb { +class MergeContext; + +class GetContext { + public: + enum GetState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) + }; + + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context); + + void MarkKeyMayExist(); + void SaveValue(const Slice& value); + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value); + GetState State() const { return state_; } + + private: + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + std::string* value_; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 3a6d48be8..db37241a9 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -26,6 +26,7 @@ #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" #include "table/plain_table_key_coding.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -525,10 +526,7 @@ void PlainTableReader::Prepare(const Slice& target) { } Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + GetContext* get_context) { // Check bloom filter first. Slice prefix_slice; uint32_t prefix_hash; @@ -580,8 +578,10 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } prefix_match = true; } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!(*saver)(arg, found_key, found_value)) { + if (!get_context->SaveValue(found_key, found_value)) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index fcc94a53e..531ac8e8b 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -36,6 +36,7 @@ class TableCache; class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; +class GetContext; using std::unique_ptr; using std::unordered_map; @@ -65,10 +66,8 @@ class PlainTableReader: public TableReader { void Prepare(const Slice& target); - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr); + Status Get(const ReadOptions&, const Slice& key, + GetContext* get_context) override; uint64_t ApproximateOffsetOf(const Slice& key); diff --git a/table/table_reader.h b/table/table_reader.h index 22f5a859e..2f6360ad1 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -18,6 +18,7 @@ class Slice; class Arena; struct ReadOptions; struct TableProperties; +class GetContext; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -55,23 +56,17 @@ class TableReader { // Report an approximation of how much memory has been used. 
virtual size_t ApproximateMemoryUsage() const = 0; - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. May not make such a call if filter policy says that key - // is not present. + // Calls get_context->SaveValue() repeatedly, starting with + // the entry found after a call to Seek(key), until it returns false. + // May not make such a call if filter policy says that key is not present. // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. + // get_context->MarkKeyMayExist needs to be called when it is configured to be + // memory only and the key is not found in the block cache. // // readOptions is the options for the read // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + virtual Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context) = 0; }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index aa791f4c4..52fa20ec0 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,6 +22,7 @@ int main() { #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" +#include "table/get_context.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -48,11 +49,6 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, - const Slice& v) { - return false; -} - uint64_t Now(Env* env, bool measured_by_nanosecond) { return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); } @@ -131,7 +127,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string result; HistogramImpl hist; - void* arg = nullptr; for (int it = 0; it < num_iter; it++) { for (int i = 0; i < num_keys1; i++) { for (int j = 0; j < num_keys2; j++) { @@ -147,8 +142,13 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string key = MakeKey(r1, r2, through_db); uint64_t start_time = Now(env, measured_by_nanosecond); if (!through_db) { - s = table_reader->Get(read_options, key, arg, DummySaveValue, - nullptr); + std::string value; + MergeContext merge_context; + GetContext get_context(ioptions.comparator, ioptions.merge_operator, + ioptions.info_log, ioptions.statistics, + GetContext::kNotFound, Slice(key), &value, + nullptr, &merge_context); + s = table_reader->Get(read_options, key, &get_context); } else { s = db->Get(read_options, key, &result); } diff --git a/table/table_test.cc b/table/table_test.cc index 776490871..1b032db53 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -37,6 +37,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/get_context.h" #include "util/random.h" #include "util/statistics.h" @@ -1485,8 +1486,11 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { } { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), nullptr, + nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr); + reader->Get(ReadOptions(), "non-exist-key", &get_context); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 431eb3ba7..70ddb27cf 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -7,13 +7,13 @@ #include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" #include "db/version_set.h" -#include "db/merge_context.h" +#include "table/get_context.h" namespace rocksdb { extern void MarkKeyMayExist(void* arg); extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v); + const Slice& v, bool hit_and_return); CompactedDBImpl::CompactedDBImpl( const DBOptions& options, const std::string& dbname) @@ -44,25 +44,12 @@ size_t CompactedDBImpl::FindFile(const Slice& key) { Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, std::string* value) { - const FdWithKeyRange& f = files_.files[FindFile(key)]; - - bool value_found; - MergeContext merge_context; - Version::Saver saver; - saver.state = Version::kNotFound; - saver.ucmp = user_comparator_; - saver.user_key = key; - saver.value_found = &value_found; - saver.value = value; - saver.merge_operator = nullptr; - saver.merge_context = &merge_context; - saver.logger = info_log_; - saver.statistics = statistics_; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, key, value, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); - f.fd.table_reader->Get(options, lkey.internal_key(), - reinterpret_cast(&saver), SaveValue, - MarkKeyMayExist); - if (saver.state == Version::kFound) { + files_.files[FindFile(key)].fd.table_reader->Get( + options, lkey.internal_key(), &get_context); + if 
(get_context.State() == GetContext::kFound) { return Status::OK(); } return Status::NotFound(); @@ -84,26 +71,15 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, } std::vector statuses(keys.size(), Status::NotFound()); values->resize(keys.size()); - bool value_found; - MergeContext merge_context; - Version::Saver saver; - saver.ucmp = user_comparator_; - saver.value_found = &value_found; - saver.merge_operator = nullptr; - saver.merge_context = &merge_context; - saver.logger = info_log_; - saver.statistics = statistics_; int idx = 0; for (auto* r : reader_list) { if (r != nullptr) { - saver.state = Version::kNotFound; - saver.user_key = keys[idx]; - saver.value = &(*values)[idx]; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[idx], &(*values)[idx], + nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), - reinterpret_cast(&saver), SaveValue, - MarkKeyMayExist); - if (saver.state == Version::kFound) { + r->Get(options, lkey.internal_key(), &get_context); + if (get_context.State() == GetContext::kFound) { statuses[idx] = Status::OK(); } } @@ -128,8 +104,6 @@ Status CompactedDBImpl::Init(const Options& options) { } version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); - statistics_ = cfd_->ioptions()->statistics; - info_log_ = cfd_->ioptions()->info_log; // L0 should not have files if (version_->file_levels_[0].num_files > 1) { return Status::NotSupported("L0 contain more than 1 file"); diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h index ef3effced..49aca53b1 100644 --- a/utilities/compacted_db/compacted_db_impl.h +++ b/utilities/compacted_db/compacted_db_impl.h @@ -88,9 +88,6 @@ class CompactedDBImpl : public DBImpl { const Comparator* user_comparator_; FileLevel files_; - Statistics* statistics_; - Logger* info_log_; - // No copying allowed CompactedDBImpl(const CompactedDBImpl&); void operator=(const CompactedDBImpl&); From fd5d80d55e9bcb8c0dd4b6200937e0727fd4b819 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 12:45:04 -0700 Subject: [PATCH 143/829] CompactedDB: log using the correct info_log Summary: info_log from supplied Options can be nullptr. Using the one from db_impl. 
Also call flush after that since no more loggging will happen and LOG can contain partial output Test Plan: verified with db_bench Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24183 --- db/db_impl_readonly.cc | 3 ++- utilities/compacted_db/compacted_db_impl.cc | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 98e2bfeb0..9faebd8c2 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -14,7 +14,8 @@ namespace rocksdb { DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { - Log(db_options_.info_log, "Opening the db in read only mode"); + Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode"); + LogFlush(db_options_.info_log); } DBImplReadOnly::~DBImplReadOnly() { diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 70ddb27cf..775033e2a 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -144,8 +144,10 @@ Status CompactedDBImpl::Open(const Options& options, std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); Status s = db->Init(options); if (s.ok()) { + Log(INFO_LEVEL, db->db_options_.info_log, + "Opened the db as fully compacted mode"); + LogFlush(db->db_options_.info_log); *dbptr = db.release(); - Log(options.info_log, "Opened the db as fully compacted mode"); } return s; } From 827e31c746b64668f9ea67e07d9df706fd69668b Mon Sep 17 00:00:00 2001 From: erik Date: Mon, 29 Sep 2014 14:52:16 -0700 Subject: [PATCH 144/829] Make test use a compatible type in the size checks. --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index ab290d108..c09cc74df 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1349,8 +1349,8 @@ TEST(DBTest, CompactedDB) { std::vector({Slice("aaa"), Slice("ccc"), Slice("eee"), Slice("ggg"), Slice("iii"), Slice("kkk")}), &values); - ASSERT_EQ(status_list.size(), 6); - ASSERT_EQ(values.size(), 6); + ASSERT_EQ(status_list.size(), static_cast(6)); + ASSERT_EQ(values.size(), static_cast(6)); ASSERT_OK(status_list[0]); ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); ASSERT_TRUE(status_list[1].IsNotFound()); From 56ebd408776620246de2db83f9fa73bf60652144 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:10:37 -0700 Subject: [PATCH 145/829] Fix arc lint (should fix #238) Summary: See https://secure.phabricator.com/D9114 Test Plan: arc lint Reviewers: yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24195 --- linters/lint_engine/FacebookFbcodeLintEngine.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index cb9cf9bdb..3d28de7dd 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,7 +36,7 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; - $pep8_options = $this->getPEP8WithTextOptions().',E302'; + $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; $python_linter = new ArcanistPEP8Linter(); $python_linter->setConfig(array('options' => $pep8_options)); From 747523d241c472de64f4689e85a7e01f19534b15 Mon Sep 17 00:00:00 2001 From: 
Mark Callaghan Date: Mon, 29 Sep 2014 09:50:41 -0700 Subject: [PATCH 146/829] Print per column family metrics in db_bench Summary: see above Test Plan: make check, ran db_bench and looked at output - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Differential Revision: https://reviews.facebook.net/D24189 --- db/db_bench.cc | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 85e840a7f..3dec1f321 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -636,6 +636,14 @@ static void AppendWithSpace(std::string* str, Slice msg) { str->append(msg.data(), msg.size()); } +struct DBWithColumnFamilies { + std::vector cfh; + DB* db; + DBWithColumnFamilies() : db(nullptr) { + cfh.clear(); + } +}; + class Stats { private: int id_; @@ -699,7 +707,7 @@ class Stats { void SetId(int id) { id_ = id; } void SetExcludeFromMerge() { exclude_from_merge_ = true; } - void FinishedOps(DB* db, int64_t num_ops) { + void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) { if (FLAGS_histogram) { double now = FLAGS_env->NowMicros(); double micros = now - last_op_finish_; @@ -739,8 +747,17 @@ class Stats { if (FLAGS_stats_per_interval) { std::string stats; - if (db && db->GetProperty("rocksdb.stats", &stats)) + + if (db_with_cfh && db_with_cfh->cfh.size()) { + for (size_t i = 0; i < db_with_cfh->cfh.size(); ++i) { + if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", + &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + } + + } else if (db && db->GetProperty("rocksdb.stats", &stats)) { fprintf(stderr, "%s\n", stats.c_str()); + } } fflush(stderr); @@ -859,13 +876,6 @@ class Benchmark { std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; const SliceTransform* prefix_extractor_; - struct DBWithColumnFamilies { - std::vector cfh; - DB* db; - DBWithColumnFamilies() : db(nullptr) { - cfh.clear(); - } - }; DBWithColumnFamilies db_; std::vector multi_dbs_; int64_t num_; @@ -1480,7 +1490,7 @@ class Benchmark { uint32_t crc = 0; while (bytes < 500 * 1048576) { crc = crc32c::Value(data.data(), size); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1499,7 +1509,7 @@ class Benchmark { unsigned int xxh32 = 0; while (bytes < 500 * 1048576) { xxh32 = XXH32(data.data(), size, 0); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1520,7 +1530,7 @@ class Benchmark { ptr = ap.Acquire_Load(); } count++; - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (ptr == nullptr) exit(1); // Disable unused variable warning. 
} @@ -1561,7 +1571,7 @@ class Benchmark { } produced += compressed.size(); bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -1642,7 +1652,7 @@ class Benchmark { } delete[] uncompressed; bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -2022,7 +2032,8 @@ class Benchmark { bytes += value_size_ + key_size_; } s = db_with_cfh->db->Write(write_options_, &batch); - thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, + entries_per_batch_); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); @@ -2047,7 +2058,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; @@ -2070,7 +2081,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; @@ -2105,7 +2116,7 @@ class Benchmark { ++nonexist; } } - thread->stats.FinishedOps(db, 100); + thread->stats.FinishedOps(nullptr, db, 100); } while (!duration.Done(100)); char msg[100]; @@ -2147,7 +2158,7 @@ class Benchmark { if (s.ok()) { found++; } - thread->stats.FinishedOps(db_with_cfh->db, 1); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); } char msg[100]; @@ -2189,7 +2200,7 @@ class Benchmark { ++found; } } - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); } for (auto& k : keys) { delete k.data(); @@ -2208,7 +2219,7 @@ class Benchmark { DB* db = SelectDB(thread); Iterator* iter = db->NewIterator(options); delete iter; - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } } @@ -2272,7 +2283,7 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); } delete single_iter; for (auto iter : multi_iters) { @@ -2312,7 +2323,7 @@ class Benchmark { batch.Delete(key); } auto s = db->Write(write_options_, &batch); - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "del error: %s\n", s.ToString().c_str()); exit(1); @@ -2372,7 +2383,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); ++num_writes; if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { @@ -2532,7 +2543,7 @@ class Benchmark { deletes_done++; } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2590,7 +2601,7 @@ class Benchmark { put_weight--; writes_done++; } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ @@ -2624,7 +2635,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + 
thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2671,7 +2682,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; @@ -2707,7 +2718,7 @@ class Benchmark { fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } // Print some statistics @@ -2768,7 +2779,7 @@ class Benchmark { } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; From b2d64a4861beef3cbf0c7270cbf5256ce6a658b4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:48:27 -0700 Subject: [PATCH 147/829] Fix linters, second try --- linters/lint_engine/FacebookFbcodeLintEngine.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 3d28de7dd..33d2f0de3 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -39,13 +39,8 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; $python_linter = new ArcanistPEP8Linter(); - $python_linter->setConfig(array('options' => $pep8_options)); $linters[] = $python_linter; - $python_2space_linter = new ArcanistPEP8Linter(); - $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); - $linters[] = $python_2space_linter; - // Currently we can't run cpplint in commit hook mode, because it // depends on having access to the working directory. if (!$this->getCommitHookMode()) { From 99fb613e542db5fcf91dba976e98136014aee7b7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:52:12 -0700 Subject: [PATCH 148/829] remove 2 space linter --- linters/lint_engine/FacebookFbcodeLintEngine.php | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 33d2f0de3..6765c33d1 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,8 +36,6 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; - $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; - $python_linter = new ArcanistPEP8Linter(); $linters[] = $python_linter; @@ -114,11 +112,7 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $dir = dirname($dir); } while ($dir != '/' && $dir != '.'); - if ($space_count == 4) { - $cur_path_linter = $python_linter; - } else { - $cur_path_linter = $python_2space_linter; - } + $cur_path_linter = $python_linter; $cur_path_linter->addPath($path); $cur_path_linter->addData($path, $this->loadData($path)); From f0f795549745c54ebc8cef85fdb3809f481a1ab1 Mon Sep 17 00:00:00 2001 From: "mike@arpaia.co" Date: Mon, 29 Sep 2014 16:05:25 -0700 Subject: [PATCH 149/829] Fixing comile errors on OS X Summary: Building master on OS X has some compile errors due to implicit type conversions which generate warnings which RocksDB's build settings raise as errors. Test Plan: It compiles! 
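A minimal standalone sketch of the portable-format-specifier pattern the change below adopts (the variable name and value are illustrative, not taken from db_bench):

```cpp
#define __STDC_FORMAT_MACROS  // some older toolchains need this for PRIi64
#include <cinttypes>
#include <cstdio>

int main() {
  int64_t entries_per_batch = 1000;
  // %ld only matches int64_t on platforms where int64_t happens to be long;
  // PRIi64 expands to the right conversion everywhere, so -Wformat (which
  // this build promotes to an error) stays quiet.
  fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch);
  return 0;
}
```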
Reviewers: ljin, igor Reviewed By: ljin Differential Revision: https://reviews.facebook.net/D24135 --- db/db_bench.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 3dec1f321..bbd807c2c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1278,7 +1278,8 @@ class Benchmark { method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { entries_per_batch_ = FLAGS_batch_size; - fprintf(stderr, "entries_per_batch_ = %ld\n", entries_per_batch_); + fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", + entries_per_batch_); method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { ++key_size_; From ee1f3ccb061da17c5d8904b734205faa360d36a6 Mon Sep 17 00:00:00 2001 From: "mike@arpaia.co" Date: Mon, 29 Sep 2014 16:09:46 -0700 Subject: [PATCH 150/829] Package generation for Ubuntu and CentOS Summary: I put together a script to assist in the generation of deb's and rpm's. I've tested that this works on ubuntu via vagrant. I've included the Vagrantfile here, but I can remove it if it's not useful. The package.sh script should work on any ubuntu or centos machine, I just added a bit of logic in there to allow a base Ubuntu or Centos machine to be able to build RocksDB from scratch. Example output on Ubuntu 14.04: ``` root@vagrant-ubuntu-trusty-64:/vagrant# ./tools/package.sh [+] g++-4.7 is already installed. skipping. [+] libgflags-dev is already installed. skipping. [+] ruby-all-dev is already installed. skipping. [+] fpm is already installed. skipping. Created package {:path=>"rocksdb_3.5_amd64.deb"} root@vagrant-ubuntu-trusty-64:/vagrant# dpkg --info rocksdb_3.5_amd64.deb new debian package, version 2.0. size 17392022 bytes: control archive=1518 bytes. 275 bytes, 11 lines control 2911 bytes, 38 lines md5sums Package: rocksdb Version: 3.5 License: BSD Vendor: Facebook Architecture: amd64 Maintainer: rocksdb@fb.com Installed-Size: 83358 Section: default Priority: extra Homepage: http://rocksdb.org/ Description: RocksDB is an embeddable persistent key-value store for fast storage. ``` Example output on CentOS 6.5: ``` [root@localhost vagrant]# rpm -qip rocksdb-3.5-1.x86_64.rpm Name : rocksdb Relocations: /usr Version : 3.5 Vendor: Facebook Release : 1 Build Date: Mon 29 Sep 2014 01:26:11 AM UTC Install Date: (not installed) Build Host: localhost Group : default Source RPM: rocksdb-3.5-1.src.rpm Size : 96231106 License: BSD Signature : (none) Packager : rocksdb@fb.com URL : http://rocksdb.org/ Summary : RocksDB is an embeddable persistent key-value store for fast storage. Description : RocksDB is an embeddable persistent key-value store for fast storage. ``` Test Plan: How this gets used is really up to the RocksDB core team. If you want to actually get this into mainline, you might have to change `make install` such that it install the RocksDB shared object file as well, which would require you to link against gflags (maybe?) and that would require some potential modifications to the script here (basically add a depends on that package). Currently, this will install the headers and a pre-compiled statically linked object file. If that's what you want out of life, than this requires no modifications. 
Reviewers: ljin, yhchiang, igor Reviewed By: igor Differential Revision: https://reviews.facebook.net/D24141 --- .gitignore | 2 + Makefile | 14 ++++- Vagrantfile | 16 +++++ build_tools/make_package.sh | 116 ++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 Vagrantfile create mode 100755 build_tools/make_package.sh diff --git a/.gitignore b/.gitignore index 99a7d61d6..cbb817f61 100644 --- a/.gitignore +++ b/.gitignore @@ -28,8 +28,10 @@ util/build_version.cc build_tools/VALGRIND_LOGS/ coverage/COVERAGE_REPORT .gdbhistory +package/ .phutil_module_cache tags java/*.log java/include/org_rocksdb_*.h unity.cc +.vagrant/ diff --git a/Makefile b/Makefile index 75da74b08..4deb8fc5f 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,9 @@ endif LIBRARY = ${LIBNAME}.a MEMENVLIBRARY = libmemenv.a +ROCKSDB_MAJOR = 3 +ROCKSDB_MINOR = 4 + default: all #----------------------------------------------- @@ -178,8 +181,8 @@ SHARED3 = $(SHARED1) SHARED = $(SHARED1) else # Update db.h if you change these. -SHARED_MAJOR = 3 -SHARED_MINOR = 4 +SHARED_MAJOR = $(ROCKSDB_MAJOR) +SHARED_MINOR = $(ROCKSDB_MINOR) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -195,7 +198,7 @@ $(SHARED3): endif # PLATFORM_SHARED_EXT -.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install uninstall @@ -276,6 +279,9 @@ tags: format: build_tools/format-diff.sh +package: + bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -627,8 +633,10 @@ ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) +ifneq ($(MAKECMDGOALS),package) -include $(DEPFILES) endif endif endif endif +endif diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 000000000..cdee5db53 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,16 @@ +Vagrant.configure("2") do |config| + + config.vm.provider "virtualbox" do |v| + v.memory = 4096 + v.cpus = 2 + end + + config.vm.define "ubuntu14" do |box| + box.vm.box = "ubuntu/trusty64" + end + + config.vm.define "centos65" do |box| + box.vm.box = "chef/centos-6.5" + end + +end diff --git a/build_tools/make_package.sh b/build_tools/make_package.sh new file mode 100755 index 000000000..2ca28023d --- /dev/null +++ b/build_tools/make_package.sh @@ -0,0 +1,116 @@ +#/usr/bin/env bash + +set -e + +function log() { + echo "[+] $1" +} + +function fatal() { + echo "[!] $1" + exit 1 +} + +function platform() { + local __resultvar=$1 + if [[ -f "/etc/yum.conf" ]]; then + eval $__resultvar="centos" + elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then + eval $__resultvar="ubuntu" + else + fatal "Unknwon operating system" + fi +} +platform OS + +function package() { + if [[ $OS = "ubuntu" ]]; then + if dpkg --get-selections | grep --quiet $1; then + log "$1 is already installed. skipping." + else + apt-get install $@ -y + fi + elif [[ $OS = "centos" ]]; then + if rpm -qa | grep --quiet $1; then + log "$1 is already installed. skipping." 
+ else + yum install $@ -y + fi + fi +} + +function detect_fpm_output() { + if [[ $OS = "ubuntu" ]]; then + export FPM_OUTPUT=deb + elif [[ $OS = "centos" ]]; then + export FPM_OUTPUT=rpm + fi +} +detect_fpm_output + +function gem_install() { + if gem list | grep --quiet $1; then + log "$1 is already installed. skipping." + else + gem install $@ + fi +} + +function main() { + if [[ $# -ne 1 ]]; then + fatal "Usage: $0 " + else + log "using rocksdb version: $1" + fi + + if [[ -d /vagrant ]]; then + if [[ $OS = "ubuntu" ]]; then + package g++-4.7 + export CXX=g++-4.7 + + # the deb would depend on libgflags2, but the static lib is the only thing + # installed by make install + package libgflags-dev + + package ruby-all-dev + elif [[ $OS = "centos" ]]; then + pushd /etc/yum.repos.d + if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then + wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo + fi + package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6 + package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6 + export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc + export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp + export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++ + export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin + popd + if ! rpm -qa | grep --quiet gflags; then + rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm + fi + + package ruby + package ruby-devel + package rubygems + package rpm-build + fi + fi + gem_install fpm + + make static_lib + make install INSTALL_PATH=package + fpm \ + -s dir \ + -t $FPM_OUTPUT \ + -n rocksdb \ + -v $1 \ + --prefix /usr \ + --url http://rocksdb.org/ \ + -m rocksdb@fb.com \ + --license BSD \ + --vendor Facebook \ + --description "RocksDB is an embeddable persistent key-value store for fast storage." \ + package +} + +main $@ From 0b923f0f9aa385e2a60000631526b9397eff88ce Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 29 Sep 2014 16:58:16 -0700 Subject: [PATCH 151/829] add centos 5.6 build instead of ubuntu. 
--- Makefile | 6 +++++- java/crossbuild/Vagrantfile | 7 +++---- java/crossbuild/build-linux-centos.sh | 23 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100755 java/crossbuild/build-linux-centos.sh diff --git a/Makefile b/Makefile index 9fc1fe6de..e0c836ec1 100644 --- a/Makefile +++ b/Makefile @@ -554,7 +554,11 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rocksdbjavastaticrelease: rocksdbjavastatic - cd java/crossbuild && vagrant destroy -f && vagrant up + cd java/crossbuild && vagrant destroy -f + vagrant up linux32 + vagrant halt linux32 + vagrant up linux64 + vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjava: diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile index ed591be71..c4b1b1df2 100644 --- a/java/crossbuild/Vagrantfile +++ b/java/crossbuild/Vagrantfile @@ -7,13 +7,11 @@ VAGRANTFILE_API_VERSION = "2" Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.define "linux32" do |linux32| - linux32.vm.box = "ubuntu/trusty32" - linux32.vm.provision :shell, path: "build-linux.sh" + linux32.vm.box = "hansode/centos-5.6-i386" end config.vm.define "linux64" do |linux64| - linux64.vm.box = "ubuntu/trusty64" - linux64.vm.provision :shell, path: "build-linux.sh" + linux64.vm.box = "hansode/centos-5.6-x86_64" end config.vm.provider "virtualbox" do |v| @@ -21,6 +19,7 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| v.cpus = 4 end + config.vm.provision :shell, path: "build-linux-centos.sh" config.vm.synced_folder "../", "/rocksdb-build" config.vm.synced_folder "../..", "/rocksdb", type: "rsync" end diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh new file mode 100755 index 000000000..55f179b62 --- /dev/null +++ b/java/crossbuild/build-linux-centos.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# install all required packages for rocksdb that are available through yum +ARCH=$(uname -i) +sudo yum -y install java-1.6.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel + +# install gcc/g++ 4.7 via CERN (http://linux.web.cern.ch/linux/devtoolset/) +sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo +sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern +sudo yum -y install devtoolset-1.1 +wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz +tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-1.1 ./configure; scl enable devtoolset-1.1 make; sudo make install +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + +# set java home so we can build rocksdb jars +export JAVA_HOME=/usr/lib/jvm/java-1.6.0 + +# build rocksdb +cd /rocksdb +scl enable devtoolset-1.1 'make jclean clean' +scl enable devtoolset-1.1 'make -j 4 rocksdbjavastatic' +cp /rocksdb/java/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/rocksdbjni-* /rocksdb-build + From ffe3d490d406ffe516fc3d24186c85511a9a43dd Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 29 Sep 2014 17:25:02 -0700 Subject: [PATCH 152/829] Add an instruction about SSE in INSTALL.md Summary: As tittle. 
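For context, the speed-up comes from the dedicated CRC32 instruction. A rough sketch of what the flag unlocks, assuming an SSE4.2-capable CPU and compilation with -msse4.2 (illustrative only, not the util/crc32c.cc code):

```cpp
#include <nmmintrin.h>  // SSE4.2 intrinsics
#include <cstdint>

// One hardware-accelerated CRC32C step over a 64-bit chunk; without SSE4.2
// the checksum falls back to a slower table-driven software loop.
inline uint32_t Crc32cStep(uint32_t crc, uint64_t chunk) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, chunk));
}
```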
Test Plan: Not needed Reviewers: MarkCallaghan, ljin, yhchiang, igor Reviewed By: igor Subscribers: rven, leveldb Differential Revision: https://reviews.facebook.net/D24231 --- INSTALL.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 8cf66e6ab..607450f85 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -15,6 +15,10 @@ There are few options when compiling RocksDB: * `make all` will compile our static library, and all our tools and unit tests. Our tools depend on gflags. You will need to have gflags installed to run `make all`. +* if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 " to make sure +SSE4.2 is used to speed up CRC32 when calculating data checksum. + + ## Dependencies * You can link RocksDB with following compression libraries: From 1f963305a8f3384da3215c37ed7a264c5c99417c Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Mon, 29 Sep 2014 17:31:26 -0700 Subject: [PATCH 153/829] Print MB per second compaction throughput separately for reads and writes Summary: From this line there used to be one column (MB/sec) that includes reads and writes. This change splits it and for real workloads the rd and wr rates might not match when keys are dropped. 2014/09/29-17:31:01.213162 7f929fbff700 (Original Log Time 2014/09/29-17:31:01.180025) [default] compacted to: files[2 5 0 0 0 0 0], MB/sec: 14.0 rd, 14.0 wr, level 1, files in(4, 0) out(5) MB in(8.5, 0.0) out(8.5), read-write-amplify(2.0) write-amplify(1.0) OK Test Plan: make check, grepped LOG - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Differential Revision: https://reviews.facebook.net/D24237 --- db/db_impl.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index bd9b222b3..7f5a382c0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3340,12 +3340,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, Version::LevelSummaryStorage tmp; LogToBuffer( log_buffer, - "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double)stats.micros, + (stats.bytes_readn + stats.bytes_readnp1) / + static_cast(stats.micros), + stats.bytes_written / static_cast(stats.micros), compact->compaction->output_level(), stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, From 7506198da25273d0323be2a8db89390d85dc138c Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 21:14:18 +0200 Subject: [PATCH 154/829] cuckoo_table_db_test.cc: add flush after delete It seems that a FlushMemTable() call is needed in the Uint64Comparator test after call Delete(). Otherwise the later via Put() added keys get lost with the next FlushMemTable() call before the check. Signed-off-by: Danny Al-Gaaf --- db/cuckoo_table_db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2652d1776..4beee59e4 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -218,6 +218,7 @@ TEST(CuckooTableDBTest, Uint64Comparator) { // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. 
+ dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. ASSERT_OK(Put(Uint64Key(4), "v4")); dbfull()->TEST_FlushMemTable(); From a2f98ef618de4bc0fb626c55cbb102d10f047131 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Tue, 30 Sep 2014 11:59:18 -0700 Subject: [PATCH 155/829] fix tabs in Makefile --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index e0c836ec1..c22da1e46 100644 --- a/Makefile +++ b/Makefile @@ -555,10 +555,10 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f - vagrant up linux32 - vagrant halt linux32 - vagrant up linux64 - vagrant halt linux64 + vagrant up linux32 + vagrant halt linux32 + vagrant up linux64 + vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjava: From 726ac5bca8be2f743a4eb3b8de4a847fb6545f03 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Tue, 30 Sep 2014 12:03:32 -0700 Subject: [PATCH 156/829] shrink vagrant commands to single line --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index c22da1e46..6248fefdc 100644 --- a/Makefile +++ b/Makefile @@ -554,11 +554,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rocksdbjavastaticrelease: rocksdbjavastatic - cd java/crossbuild && vagrant destroy -f - vagrant up linux32 - vagrant halt linux32 - vagrant up linux64 - vagrant halt linux64 + cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjava: From 68ca534169a4f9e1930f6511109e973b43cf5998 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 17:43:47 +0200 Subject: [PATCH 157/829] corruption_test.cc: pass parameter by reference Fix for: [db/corruption_test.cc:134]: (performance) Function parameter 'fname' should be passed by reference. Signed-off-by: Danny Al-Gaaf --- db/corruption_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 09d78f89f..4fcea0d5a 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -131,7 +131,7 @@ class CorruptionTest { ASSERT_GE(max_expected, correct); } - void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) { + void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { const char* msg = strerror(errno); From 53910ddb152fbcba95a3e04b058a997c40f654ae Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 18:01:23 +0200 Subject: [PATCH 158/829] db_test.cc: pass parameter by reference Fix for: [db/db_test.cc:6141]: (performance) Function parameter 'key' should be passed by reference. 
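Same reasoning as the corruption_test change above: a by-value std::string parameter copies the whole buffer at every call site, while a const reference binds to the caller's string directly. A tiny self-contained illustration (the function names are made up):

```cpp
#include <cstddef>
#include <string>

// Copies the argument into 's' on every call.
size_t LenByValue(const std::string s) { return s.size(); }

// No copy, no allocation; read-only access is all these helpers need.
size_t LenByRef(const std::string& s) { return s.size(); }
```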
Signed-off-by: Danny Al-Gaaf --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index c09cc74df..986d5810e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6138,7 +6138,7 @@ class WrappedBloom : public FilterPolicy { const FilterPolicy* filter_; mutable uint32_t counter_; - rocksdb::Slice convertKey(const rocksdb::Slice key) const { + rocksdb::Slice convertKey(const rocksdb::Slice& key) const { return key; } }; From 8ce050b51b1abb226a97573fe9b48916342ec29f Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 18:03:12 +0200 Subject: [PATCH 159/829] table/bloom_block.*: pass func parameter by reference [table/bloom_block.h:29]: (performance) Function parameter 'keys_hashes' should be passed by reference. Signed-off-by: Danny Al-Gaaf --- table/bloom_block.cc | 2 +- table/bloom_block.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/table/bloom_block.cc b/table/bloom_block.cc index c44ab66ca..cfea8a2c5 100644 --- a/table/bloom_block.cc +++ b/table/bloom_block.cc @@ -11,7 +11,7 @@ namespace rocksdb { -void BloomBlockBuilder::AddKeysHashes(const std::vector keys_hashes) { +void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { for (auto hash : keys_hashes) { bloom_.AddHash(hash); } diff --git a/table/bloom_block.h b/table/bloom_block.h index d55453eda..7ef5d14b6 100644 --- a/table/bloom_block.h +++ b/table/bloom_block.h @@ -26,7 +26,7 @@ class BloomBlockBuilder { uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } - void AddKeysHashes(const std::vector keys_hashes); + void AddKeysHashes(const std::vector& keys_hashes); Slice Finish(); From b8b7117e97e649fc65d0a4dd397caf9a39fb71b1 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 18:04:44 +0200 Subject: [PATCH 160/829] db/version_set.cc: use !empty() instead of 'size() > 0' Use empty() since it should be prefered as it has, following the standard, a constant time complexity regardless of the containter type. The same is not guaranteed for size(). Fix for: [db/version_set.cc:2250]: (performance) Possible inefficient checking for 'column_families_not_found' emptiness. Signed-off-by: Danny Al-Gaaf --- db/version_set.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/version_set.cc b/db/version_set.cc index 10649fa6c..1d1d53813 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2169,7 +2169,7 @@ Status VersionSet::Recover( // there were some column families in the MANIFEST that weren't specified // in the argument. This is OK in read_only mode - if (read_only == false && column_families_not_found.size() > 0) { + if (read_only == false && !column_families_not_found.empty()) { std::string list_of_not_found; for (const auto& cf : column_families_not_found) { list_of_not_found += ", " + cf.second; From 93548ce8f451a701ad0967ba705f04fef80aa11a Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 18:12:04 +0200 Subject: [PATCH 161/829] table/cuckoo_table_reader.cc: pass func parameter by ref Fix for: [table/cuckoo_table_reader.cc:198]: (performance) Function parameter 'file_data' should be passed by reference. 
Signed-off-by: Danny Al-Gaaf --- table/cuckoo_table_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f8da4e288..f39900add 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -191,7 +191,7 @@ class CuckooTableIterator : public Iterator { private: struct BucketComparator { - BucketComparator(const Slice file_data, const Comparator* ucomp, + BucketComparator(const Slice& file_data, const Comparator* ucomp, uint32_t bucket_len, uint32_t user_key_len, const Slice target = Slice()) : file_data_(file_data), From 063471bf7613544496a4d4b5a1e1ba4a7aa605cf Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 18:14:05 +0200 Subject: [PATCH 162/829] table/table_test.cc: pass func parameter by reference Fix for: [table/table_test.cc:1218]: (performance) Function parameter 'prefix' should be passed by reference. Signed-off-by: Danny Al-Gaaf --- table/table_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/table_test.cc b/table/table_test.cc index 1b032db53..df662ad88 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1216,7 +1216,7 @@ static std::string RandomString(Random* rnd, int len) { return r; } -void AddInternalKey(TableConstructor* c, const std::string prefix, +void AddInternalKey(TableConstructor* c, const std::string& prefix, int suffix_len = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); From 8558457143bfa76d61e0d2f715e40ec2ddb6ffc2 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 19:32:46 +0200 Subject: [PATCH 163/829] ldb_cmd_execute_result.h: perform init in initialization list Fix for: [util/ldb_cmd_execute_result.h:18]: (performance) Variable 'message_' is assigned in constructor body. Consider performing initialization in initialization list. [util/ldb_cmd_execute_result.h:23]: (performance) Variable 'message_' is assigned in constructor body. Consider performing initialization in initialization list. Signed-off-by: Danny Al-Gaaf --- util/ldb_cmd_execute_result.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h index b9121b2b0..b8e6c4634 100644 --- a/util/ldb_cmd_execute_result.h +++ b/util/ldb_cmd_execute_result.h @@ -13,15 +13,10 @@ public: EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, }; - LDBCommandExecuteResult() { - state_ = EXEC_NOT_STARTED; - message_ = ""; - } + LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} - LDBCommandExecuteResult(State state, std::string& msg) { - state_ = state; - message_ = msg; - } + LDBCommandExecuteResult(State state, std::string& msg) : + state_(state), message_(msg) {} std::string ToString() { std::string ret; From 873f1356a1781e8d638973ea320b722d3240fc5a Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Fri, 26 Sep 2014 19:35:20 +0200 Subject: [PATCH 164/829] db_ttl_impl.h: pass func parameter by reference Fix for: [utilities/ttl/db_ttl_impl.h:209]: (performance) Function parameter 'merge_op' should be passed by reference. 
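Copying a shared_ptr is more than copying a raw pointer, since it also bumps an atomic reference count, so a const reference is preferable when the callee only stores the pointer. A rough sketch under that assumption (the stub type and member name are illustrative, not the real rocksdb::MergeOperator):

```cpp
#include <memory>

struct MergeOperatorStub {
  virtual ~MergeOperatorStub() {}
};

class TtlMergeOperatorSketch {
 public:
  // The one unavoidable copy happens when the member is initialized; passing
  // by const reference avoids a second copy at the call boundary.
  explicit TtlMergeOperatorSketch(const std::shared_ptr<MergeOperatorStub>& op)
      : user_merge_op_(op) {}

 private:
  std::shared_ptr<MergeOperatorStub> user_merge_op_;
};
```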
Signed-off-by: Danny Al-Gaaf --- utilities/ttl/db_ttl_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 84fb55568..92b8eab7f 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -206,7 +206,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { class TtlMergeOperator : public MergeOperator { public: - explicit TtlMergeOperator(const std::shared_ptr merge_op, + explicit TtlMergeOperator(const std::shared_ptr& merge_op, Env* env) : user_merge_op_(merge_op), env_(env) { assert(merge_op); From 33580fa39a0aef3d1115e47fb4f13154be5e5993 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 22:53:39 +0200 Subject: [PATCH 165/829] db/db_impl.cc: fix object handling, remove double lines Fix for: [db/db_impl.cc:4039]: (error) Instance of 'StopWatch' object is destroyed immediately. [db/db_impl.cc:4042]: (error) Instance of 'StopWatch' object is destroyed immediately. Signed-off-by: Danny Al-Gaaf --- db/db_impl.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 7f5a382c0..3bf60a2d1 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4037,11 +4037,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { RecordTick(stats_, WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { RecordTick(stats_, WAL_FILE_SYNCED); + StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); if (db_options_.use_fsync) { - StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Fsync(); } else { - StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Sync(); } } From af8c2b2d97d8473d97107bcf326a450e7700def6 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:00:18 +0200 Subject: [PATCH 166/829] util/signal_test.cc: suppress intentional null pointer deref Add comment to enabele cppcheck suppression of intentional null pointer deref via --inline-suppr option. Signed-off-by: Danny Al-Gaaf --- util/signal_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/util/signal_test.cc b/util/signal_test.cc index f51fa548e..b23ad6a98 100644 --- a/util/signal_test.cc +++ b/util/signal_test.cc @@ -9,6 +9,7 @@ namespace { void f0() { char *p = nullptr; + // cppcheck-suppress nullPointer *p = 10; /* SIGSEGV here!! */ } From 4cc8643baf5b4e4a25fb20a77b3257100747d483 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:05:12 +0200 Subject: [PATCH 167/829] util/ldb_cmd.cc: prefer prefix ++operator for non-primitive types Prefer prefix ++operator for non-primitive types like iterators for performance reasons. Prefix ++/-- operators avoid creating a temporary copy. 
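A self-contained example of the difference (the container and loop body are arbitrary):

```cpp
#include <cstddef>
#include <map>
#include <string>

size_t CountKeys(const std::map<std::string, std::string>& kvmap) {
  size_t n = 0;
  // it++ must construct and return a temporary copy of the iterator before
  // advancing; ++it advances in place, which is all these loops need.
  for (auto it = kvmap.begin(); it != kvmap.end(); ++it) {
    ++n;
  }
  return n;
}
```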
Signed-off-by: Danny Al-Gaaf --- util/ldb_cmd.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 9f00757b8..8eda39bf9 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -325,7 +325,7 @@ bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, bool LDBCommand::ValidateCmdLineOptions() { for (map::const_iterator itr = option_map_.begin(); - itr != option_map_.end(); itr++) { + itr != option_map_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), itr->first) == valid_cmd_line_options_.end()) { @@ -335,7 +335,7 @@ bool LDBCommand::ValidateCmdLineOptions() { } for (vector::const_iterator itr = flags_.begin(); - itr != flags_.end(); itr++) { + itr != flags_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), *itr) == valid_cmd_line_options_.end()) { @@ -1538,7 +1538,7 @@ void BatchPutCommand::DoCommand() { WriteBatch batch; for (vector>::const_iterator itr - = key_values_.begin(); itr != key_values_.end(); itr++) { + = key_values_.begin(); itr != key_values_.end(); ++itr) { batch.Put(itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); From 0de452ee9f8afa9cae2a72b8fb3c3dd59b28a9d9 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:09:28 +0200 Subject: [PATCH 168/829] document_db.cc: pass const parameter by reference Signed-off-by: Danny Al-Gaaf --- utilities/document/document_db.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 8e15a52ca..b9157f274 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -376,7 +376,7 @@ class IndexKey { class SimpleSortedIndex : public Index { public: - SimpleSortedIndex(const std::string field, const std::string& name) + SimpleSortedIndex(const std::string& field, const std::string& name) : field_(field), name_(name) {} virtual const char* Name() const override { return name_.c_str(); } From 43c789c8f246a2a35864e3fca9585b55c40c2095 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:11:00 +0200 Subject: [PATCH 169/829] spatialdb/spatial_db.cc: use !empty() instead of 'size() > 0' Use empty() since it should be prefered as it has, following the standard, a constant time complexity regardless of the containter type. The same is not guaranteed for size(). 
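For instance (the container here only illustrates the general rule):

```cpp
#include <cstdint>
#include <list>

bool HasPendingIds(const std::list<uint64_t>& primary_key_ids) {
  // empty() is O(1) for every standard container; size() was permitted to be
  // O(n) for std::list before C++11, so !empty() is the safer emptiness test.
  return !primary_key_ids.empty();
}
```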
Signed-off-by: Danny Al-Gaaf --- utilities/spatialdb/spatial_db.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 9c44027c8..6fbb780bc 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -369,7 +369,7 @@ class SpatialIndexCursor : public Cursor { } delete spatial_iterator; - valid_ = valid_ && primary_key_ids_.size() > 0; + valid_ = valid_ && !primary_key_ids_.empty(); if (valid_) { primary_keys_iterator_ = primary_key_ids_.begin(); From b140375565878fb59235bfd2673913b96ae021fe Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:13:01 +0200 Subject: [PATCH 170/829] ttl/ttl_test.cc: prefer prefix ++operator for non-primitive types Signed-off-by: Danny Al-Gaaf --- utilities/ttl/ttl_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index e6d64e54e..66cabe8e3 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -120,7 +120,7 @@ class TtlTest { static FlushOptions flush_opts; WriteBatch batch; kv_it_ = kvmap_.begin(); - for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { + for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) { switch (batch_ops[i]) { case PUT: batch.Put(kv_it_->first, kv_it_->second); @@ -145,7 +145,7 @@ class TtlTest { static FlushOptions flush_opts; kv_it_ = kvmap_.begin(); advance(kv_it_, start_pos_map); - for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { + for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, ++kv_it_) { ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); @@ -207,7 +207,7 @@ class TtlTest { kv_it_ = kvmap_.begin(); advance(kv_it_, st_pos); std::string v; - for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { + for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v) : db_ttl_->Get(ropts, cf, kv_it_->first, &v); if (s.ok() != check) { @@ -252,7 +252,7 @@ class TtlTest { } else { // dbiter should have found out kvmap_[st_pos] for (int i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span; - i++, kv_it_++) { + i++, ++kv_it_) { ASSERT_TRUE(dbiter->Valid()); ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); dbiter->Next(); From d517c836483b2017ec7d64997fb7a331fdb5f150 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:23:09 +0200 Subject: [PATCH 171/829] in_table_factory.cc: use correct format specifier Use %zu instead of %zd since size_t and uint32_t are unsigned. Fix for: [table/plain_table_factory.cc:55]: (warning) %zd in format string (no. 1) requires 'ssize_t' but the argument type is 'size_t {aka unsigned long}'. [table/plain_table_factory.cc:58]: (warning) %zd in format string (no. 1) requires 'ssize_t' but the argument type is 'size_t {aka unsigned long}'. 
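A minimal standalone sketch (not the patched RocksDB function; the buffer size and values are illustrative) of the format-specifier rule being applied: the z length modifier pairs with the u conversion for size_t, which is unsigned, while %zd expects a signed ssize_t, so passing a size_t to %zd is exactly what cppcheck flags.

#include <cstdio>
#include <iostream>
#include <string>

int main() {
  size_t index_sparseness = 16;
  size_t huge_page_tlb_size = 0;

  const int kBufferSize = 200;
  char buffer[kBufferSize];
  std::string ret;

  // %zu: 'z' length modifier + unsigned conversion, matching size_t.
  // (%zd would expect a signed ssize_t argument instead.)
  std::snprintf(buffer, kBufferSize, "  index_sparseness: %zu\n",
                index_sparseness);
  ret.append(buffer);
  std::snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %zu\n",
                huge_page_tlb_size);
  ret.append(buffer);

  std::cout << ret;
  return 0;
}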
Signed-off-by: Danny Al-Gaaf --- table/plain_table_factory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index de23cc902..fae0d8018 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -52,10 +52,10 @@ std::string PlainTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", hash_table_ratio_); ret.append(buffer); - snprintf(buffer, kBufferSize, " index_sparseness: %zd\n", + snprintf(buffer, kBufferSize, " index_sparseness: %zu\n", index_sparseness_); ret.append(buffer); - snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zd\n", + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zu\n", huge_page_tlb_size_); ret.append(buffer); snprintf(buffer, kBufferSize, " encoding_type: %d\n", From e55aea5512dfc2c7bc9595b5d307269f3d002a03 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:27:35 +0200 Subject: [PATCH 172/829] document_db.cc: fix assert Check for lhs and not twice for rhs. Fix for: [utilities/document/document_db.cc:36] -> [utilities/document/document_db.cc:36]: (style) Same expression on both sides of '&&'. Signed-off-by: Danny Al-Gaaf --- utilities/document/document_db.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index b9157f274..901e91163 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -33,7 +33,7 @@ namespace { // > 0 <=> lhs == rhs // TODO(icanadi) move this to JSONDocument? int DocumentCompare(const JSONDocument& lhs, const JSONDocument& rhs) { - assert(rhs.IsObject() == false && rhs.IsObject() == false && + assert(lhs.IsObject() == false && rhs.IsObject() == false && lhs.type() == rhs.type()); switch (lhs.type()) { From 676ff7b1fbd9c7aaa1b3c9100a14f91d63823d3c Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:31:47 +0200 Subject: [PATCH 173/829] compaction_picker.cc: remove check for >=0 for unsigned Fix for: [db/compaction_picker.cc:923]: (style) Unsigned variable 'start_index' can't be negative so it is unnecessary to test it. 
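A minimal standalone sketch (not RocksDB code; names are illustrative) of why the removed half of the assert is dead: an unsigned value compares >= 0 by definition, so only the upper-bound comparison can ever fail, and if the index could "go negative" it would actually wrap around to a huge value that the upper-bound check is the one to catch.

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> files(4);
  size_t start_index = files.size() - 2;

  // Dead check: start_index is unsigned, so `start_index >= 0` is always
  // true; cppcheck and compilers warn that the test cannot fail.
  // assert(start_index >= 0 && start_index < files.size() - 1);

  // Only the upper bound carries information, so assert just that. A
  // wrapped-around (underflowed) index would also trip this check.
  assert(start_index < files.size() - 1);

  return 0;
}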
Signed-off-by: Danny Al-Gaaf --- db/compaction_picker.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index eb434eeac..add3556d8 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -920,7 +920,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( "earliest-file-size %" PRIu64, version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); } - assert(start_index >= 0 && start_index < files.size() - 1); + assert(start_index < files.size() - 1); // Estimate total file size uint64_t estimated_total_size = 0; From 0fd8bbca53fafb3a729b0f6f470c2d024d3e6473 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:45:23 +0200 Subject: [PATCH 174/829] db/db_impl.cc: reduce scope of prefix_initialized Signed-off-by: Danny Al-Gaaf --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 3bf60a2d1..1a2b7f7b2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3064,7 +3064,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, assert(compact); compact->CleanupBatchBuffer(); compact->CleanupMergedBuffer(); - bool prefix_initialized = false; // Generate file_levels_ for compaction berfore making Iterator compact->compaction->GenerateFileLevels(); @@ -3149,6 +3148,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // 2) send value_buffer to compaction filter and alternate the values; // 3) merge value_buffer with ineligible_value_buffer; // 4) run the modified "compaction" using the old for loop. + bool prefix_initialized = false; shared_ptr backup_input( versions_->MakeInputIterator(compact->compaction)); backup_input->SeekToFirst(); From 8ee75dca2e9ef385e0358aa785cb43becc4f09d2 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Mon, 29 Sep 2014 23:55:21 +0200 Subject: [PATCH 175/829] db/memtable.cc: remove unused variable merge_result Signed-off-by: Danny Al-Gaaf --- db/memtable.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/memtable.cc b/db/memtable.cc index bdfbc805f..b9b99a684 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -413,7 +413,6 @@ static bool SaveValue(void* arg, const char* entry) { *(s->found_final_value) = true; return false; } - std::string merge_result; // temporary area for merge results later Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->merge_in_progress) = true; merge_context->PushOperand(v); From 44cca0cd8fb0903c5946ec1c79c31fad992691c6 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 07:09:06 +0200 Subject: [PATCH 176/829] db/db_iter.cc: remove unused variable Signed-off-by: Danny Al-Gaaf --- db/db_iter.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/db_iter.cc b/db/db_iter.cc index db86ebc2c..815562c9f 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -287,7 +287,6 @@ void DBIter::MergeValuesNewToOld() { std::deque operands; operands.push_front(iter_->value().ToString()); - std::string merge_result; // Temporary string to hold merge result later ParsedInternalKey ikey; for (iter_->Next(); iter_->Valid(); iter_->Next()) { if (!ParseKey(&ikey)) { From d6483af870685875ca1452d1a329dfeab529c86d Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 07:12:12 +0200 Subject: [PATCH 177/829] db/db_test.cc: reduce scope of some variables Signed-off-by: Danny Al-Gaaf --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 986d5810e..119ec5bd8 100644 
--- a/db/db_test.cc +++ b/db/db_test.cc @@ -4672,9 +4672,9 @@ TEST(DBTest, CompactionFilterContextManual) { ASSERT_EQ(NumTableFilesAtLevel(0), 1); // Verify total number of keys is correct after manual compaction. - int count = 0; - int total = 0; { + int count = 0; + int total = 0; Arena arena; ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); iter->SeekToFirst(); From 5abd8add7d7217199c022ac575ee28e5d9ba94b2 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 07:17:07 +0200 Subject: [PATCH 178/829] db/deletefile_test.cc: remove unused variable Signed-off-by: Danny Al-Gaaf --- db/deletefile_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index a5af31284..f1cd4b040 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -148,7 +148,6 @@ class DeleteFileTest { TEST(DeleteFileTest, AddKeysAndQueryLevels) { CreateTwoLevels(); std::vector metadata; - std::vector keysinlevel; db_->GetLiveFilesMetaData(&metadata); std::string level1file = ""; From 55652043c83c463ce57b7748e01c6d12bb5bf9fe Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 07:24:20 +0200 Subject: [PATCH 179/829] table/cuckoo_table_reader.cc: pass func parameter by reference Fix for: [table/cuckoo_table_reader.cc:196]: (performance) Function parameter 'target' should be passed by reference. Signed-off-by: Danny Al-Gaaf --- table/cuckoo_table_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f39900add..c0ca38bb7 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -193,7 +193,7 @@ class CuckooTableIterator : public Iterator { struct BucketComparator { BucketComparator(const Slice& file_data, const Comparator* ucomp, uint32_t bucket_len, uint32_t user_key_len, - const Slice target = Slice()) + const Slice& target = Slice()) : file_data_(file_data), ucomp_(ucomp), bucket_len_(bucket_len), From 6b6cedbb1ba49c819f9fe00a44e82a5d6564aee7 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 07:27:39 +0200 Subject: [PATCH 180/829] table/format.cc: reduce scope of some variables Signed-off-by: Danny Al-Gaaf --- table/format.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/table/format.cc b/table/format.cc index db11f9d4a..768e00165 100644 --- a/table/format.cc +++ b/table/format.cc @@ -334,9 +334,9 @@ Status UncompressBlockContents(const char* data, size_t n, case kZlibCompression: ubuf = std::unique_ptr( port::Zlib_Uncompress(data, n, &decompress_size)); - static char zlib_corrupt_msg[] = - "Zlib not supported or corrupted Zlib compressed block contents"; if (!ubuf) { + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib compressed block contents"; return Status::Corruption(zlib_corrupt_msg); } *contents = @@ -345,9 +345,9 @@ Status UncompressBlockContents(const char* data, size_t n, case kBZip2Compression: ubuf = std::unique_ptr( port::BZip2_Uncompress(data, n, &decompress_size)); - static char bzip2_corrupt_msg[] = - "Bzip2 not supported or corrupted Bzip2 compressed block contents"; if (!ubuf) { + static char bzip2_corrupt_msg[] = + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; return Status::Corruption(bzip2_corrupt_msg); } *contents = @@ -356,9 +356,9 @@ Status UncompressBlockContents(const char* data, size_t n, case kLZ4Compression: ubuf = std::unique_ptr( port::LZ4_Uncompress(data, n, &decompress_size)); - 
static char lz4_corrupt_msg[] = - "LZ4 not supported or corrupted LZ4 compressed block contents"; if (!ubuf) { + static char lz4_corrupt_msg[] = + "LZ4 not supported or corrupted LZ4 compressed block contents"; return Status::Corruption(lz4_corrupt_msg); } *contents = @@ -367,9 +367,9 @@ Status UncompressBlockContents(const char* data, size_t n, case kLZ4HCCompression: ubuf = std::unique_ptr( port::LZ4_Uncompress(data, n, &decompress_size)); - static char lz4hc_corrupt_msg[] = - "LZ4HC not supported or corrupted LZ4HC compressed block contents"; if (!ubuf) { + static char lz4hc_corrupt_msg[] = + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; return Status::Corruption(lz4hc_corrupt_msg); } *contents = From 091153493cf71f3004bf921759a2f49435e8e4c6 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 10:35:56 +0200 Subject: [PATCH 181/829] db/db_test.cc: remove unused variable Signed-off-by: Danny Al-Gaaf --- db/db_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 119ec5bd8..d6f9e027f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8193,7 +8193,6 @@ static void RandomTimeoutWriter(void* arg) { if (write_opt.timeout_hint_us == 0 || put_duration + kTimerBias < write_opt.timeout_hint_us) { ASSERT_OK(s); - std::string result; } if (s.IsTimedOut()) { timeout_count++; From 28a6e31583de187eed56d7f5dca13caecd64b640 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 10:36:51 +0200 Subject: [PATCH 182/829] table/block_based_table_builder.cc: remove unused variable Signed-off-by: Danny Al-Gaaf --- table/block_based_table_builder.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 2f373fff1..9e4328cd4 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -721,7 +721,6 @@ Status BlockBasedTableBuilder::Finish() { // Write properties block. { PropertyBlockBuilder property_block_builder; - std::vector failed_user_prop_collectors; r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? 
r->table_options.filter_policy->Name() : ""; r->props.index_size = From 86e29f0334b80fbdb96e0b3f3b35608d7813c537 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 10:37:31 +0200 Subject: [PATCH 183/829] document_db.cc: remove unused variable Signed-off-by: Danny Al-Gaaf --- utilities/document/document_db.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 901e91163..b19618533 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -407,7 +407,6 @@ class SimpleSortedIndex : public Index { assert(interval != nullptr); // because index is useful Direction direction; - std::string op; const JSONDocument* limit; if (interval->lower_bound != nullptr) { limit = interval->lower_bound; From bf3bfd04486945b0bf92d6ab8bce4f991be8e81d Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 10:53:37 +0200 Subject: [PATCH 184/829] util/cache_test.cc: use static_cast over C-Style cast Signed-off-by: Danny Al-Gaaf --- util/cache_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/cache_test.cc b/util/cache_test.cc index c12cdb7e1..74109ff0c 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -386,7 +386,7 @@ class Value { namespace { void deleter(const Slice& key, void* value) { - delete (Value *)value; + delete static_cast(value); } } // namespace From 4a171882d6f9e31e44a751738211e47ac0759336 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 11:09:22 +0200 Subject: [PATCH 185/829] db/version_set.cc: remove unnecessary checks Fix for: [db/version_set.cc:1219]: (style) Unsigned variable 'last_file' can't be negative so it is unnecessary to test it. [db/version_set.cc:1234]: (style) Unsigned variable 'first_file' can't be negative so it is unnecessary to test it. Signed-off-by: Danny Al-Gaaf --- db/version_set.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 1d1d53813..6b38ee777 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1216,7 +1216,7 @@ bool Version::HasOverlappingUserKey( // Check the last file in inputs against the file after it size_t last_file = FindFile(cfd_->internal_comparator(), file_level, inputs->back()->largest.Encode()); - assert(0 <= last_file && last_file < kNumFiles); // File should exist! + assert(last_file < kNumFiles); // File should exist! if (last_file < kNumFiles-1) { // If not the last file const Slice last_key_in_input = ExtractUserKey( files[last_file].largest_key); @@ -1231,7 +1231,7 @@ bool Version::HasOverlappingUserKey( // Check the first file in inputs against the file just before it size_t first_file = FindFile(cfd_->internal_comparator(), file_level, inputs->front()->smallest.Encode()); - assert(0 <= first_file && first_file <= last_file); // File should exist! + assert(first_file <= last_file); // File should exist! 
if (first_file > 0) { // If not first file const Slice& first_key_in_input = ExtractUserKey( files[first_file].smallest_key); From 177caca4283fcffdfd74d6c02de2fcd98049adc5 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 11:53:00 +0200 Subject: [PATCH 186/829] ttl/ttl_test.cc: pass const string param by reference Signed-off-by: Danny Al-Gaaf --- utilities/ttl/ttl_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 66cabe8e3..d1c1235c3 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -263,7 +263,7 @@ class TtlTest { class TestFilter : public CompactionFilter { public: - TestFilter(const int64_t kSampleSize, const std::string kNewValue) + TestFilter(const int64_t kSampleSize, const std::string& kNewValue) : kSampleSize_(kSampleSize), kNewValue_(kNewValue) { } @@ -311,7 +311,7 @@ class TtlTest { class TestFilterFactory : public CompactionFilterFactory { public: - TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue) + TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue) : kSampleSize_(kSampleSize), kNewValue_(kNewValue) { } From 8ff0b409553076f9eea34ad57ca821c787184992 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 11:55:43 +0200 Subject: [PATCH 187/829] document_db_test.cc: pass const string param by reference Signed-off-by: Danny Al-Gaaf --- utilities/document/document_db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/document/document_db_test.cc b/utilities/document/document_db_test.cc index d4c632cce..5b36a2060 100644 --- a/utilities/document/document_db_test.cc +++ b/utilities/document/document_db_test.cc @@ -56,7 +56,7 @@ class DocumentDBTest { } } - JSONDocument* Parse(const std::string doc) { + JSONDocument* Parse(const std::string& doc) { return JSONDocument::ParseJSON(ConvertQuotes(doc).c_str()); } From 9d6f380866349b90f3834cafc6e79af43a78d346 Mon Sep 17 00:00:00 2001 From: Danny Al-Gaaf Date: Wed, 1 Oct 2014 11:57:01 +0200 Subject: [PATCH 188/829] backupable_db_test.cc: pass const string param by reference Signed-off-by: Danny Al-Gaaf --- utilities/backupable/backupable_db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index a585d1a9c..281837773 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -228,7 +228,7 @@ class FileManager : public EnvWrapper { public: explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} - Status DeleteRandomFileInDir(const std::string dir) { + Status DeleteRandomFileInDir(const std::string& dir) { std::vector children; GetChildren(dir, &children); if (children.size() <= 2) { // . and .. From d122e7bcf45526349fee791fbc1274dacfacdf92 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 1 Oct 2014 11:15:42 -0700 Subject: [PATCH 189/829] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 607450f85..21e8d26f0 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -85,4 +85,4 @@ SSE4.2 is used to speed up CRC32 when calculating data checksum. We did not run any production workloads on it. * **iOS**: - * Run: `TARGET_OS=IOS make static_lib` + * Run: `TARGET_OS=IOS make static_lib`. 
When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. From 5ec53f3edf62bec1b690ce12fb21a6c52203f3c8 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 1 Oct 2014 16:19:16 -0700 Subject: [PATCH 190/829] make compaction related options changeable Summary: make compaction related options changeable. Most of changes are tedious, following the same convention: grabs MutableCFOptions at the beginning of compaction under mutex, then pass it throughout the job and register it in SuperVersion at the end. Test Plan: make all check Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23349 --- db/column_family.cc | 75 +++++---- db/column_family.h | 16 +- db/compaction.cc | 7 +- db/compaction.h | 8 +- db/compaction_picker.cc | 238 ++++++++++++---------------- db/compaction_picker.h | 107 ++++++------- db/db_impl.cc | 159 ++++++++++++------- db/db_impl.h | 30 ++-- db/db_test.cc | 108 +++++++++++++ db/log_and_apply_bench.cc | 6 +- db/memtable_list.cc | 5 +- db/memtable_list.h | 3 +- db/repair.cc | 2 +- db/version_set.cc | 48 +++--- db/version_set.h | 17 +- db/write_batch_test.cc | 5 +- include/rocksdb/immutable_options.h | 3 + table/table_test.cc | 15 +- util/mutable_cf_options.cc | 72 +++++++++ util/mutable_cf_options.h | 67 +++++++- util/options.cc | 4 +- util/options_helper.cc | 90 ++++++----- 22 files changed, 686 insertions(+), 399 deletions(-) create mode 100644 util/mutable_cf_options.cc diff --git a/db/column_family.cc b/db/column_family.cc index f95090225..0beb23c91 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -230,7 +230,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, internal_comparator_(cf_options.comparator), options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), ioptions_(options_), - mutable_cf_options_(options_), + mutable_cf_options_(options_, ioptions_), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -245,27 +245,27 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, // if dummy_versions is nullptr, then this is a dummy column family. 
if (dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(options_.num_levels, db_options->env, this)); + new InternalStats(ioptions_.num_levels, db_options->env, this)); table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); - if (options_.compaction_style == kCompactionStyleUniversal) { + if (ioptions_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( - new UniversalCompactionPicker(&options_, &internal_comparator_)); - } else if (options_.compaction_style == kCompactionStyleLevel) { + new UniversalCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( - new LevelCompactionPicker(&options_, &internal_comparator_)); + new LevelCompactionPicker(ioptions_, &internal_comparator_)); } else { - assert(options_.compaction_style == kCompactionStyleFIFO); + assert(ioptions_.compaction_style == kCompactionStyleFIFO); compaction_picker_.reset( - new FIFOCompactionPicker(&options_, &internal_comparator_)); + new FIFOCompactionPicker(ioptions_, &internal_comparator_)); } - Log(options_.info_log, "Options for column family \"%s\":\n", + Log(ioptions_.info_log, "Options for column family \"%s\":\n", name.c_str()); const ColumnFamilyOptions* cf_options = &options_; - cf_options->Dump(options_.info_log.get()); + cf_options->Dump(ioptions_.info_log); } - RecalculateWriteStallConditions(); + RecalculateWriteStallConditions(mutable_cf_options_); } // DB mutex held @@ -318,7 +318,8 @@ ColumnFamilyData::~ColumnFamilyData() { } } -void ColumnFamilyData::RecalculateWriteStallConditions() { +void ColumnFamilyData::RecalculateWriteStallConditions( + const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { const double score = current_->MaxCompactionScore(); const int max_level = current_->MaxCompactionScoreLevel(); @@ -328,26 +329,27 @@ void ColumnFamilyData::RecalculateWriteStallConditions() { if (imm()->size() == options_.max_write_buffer_number) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); - Log(options_.info_log, + Log(ioptions_.info_log, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush)", name_.c_str(), imm()->size()); } else if (current_->NumLevelFiles(0) >= - options_.level0_stop_writes_trigger) { + mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); - Log(options_.info_log, + Log(ioptions_.info_log, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), current_->NumLevelFiles(0)); - } else if (options_.level0_slowdown_writes_trigger >= 0 && + } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && current_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + mutable_cf_options.level0_slowdown_writes_trigger) { uint64_t slowdown = SlowdownAmount( - current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger, - options_.level0_stop_writes_trigger); + current_->NumLevelFiles(0), + mutable_cf_options.level0_slowdown_writes_trigger, + mutable_cf_options.level0_stop_writes_trigger); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); - Log(options_.info_log, + Log(ioptions_.info_log, "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 "us)", name_.c_str(), 
current_->NumLevelFiles(0), slowdown); @@ -358,7 +360,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions() { write_controller->GetDelayToken(kHardLimitSlowdown); internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, false); - Log(options_.info_log, + Log(ioptions_.info_log, "[%s] Stalling writes because we hit hard limit on level %d. " "(%" PRIu64 "us)", name_.c_str(), max_level, kHardLimitSlowdown); @@ -368,7 +370,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions() { options_.hard_rate_limit); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); - Log(options_.info_log, + Log(ioptions_.info_log, "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 "us)", name_.c_str(), max_level, slowdown); @@ -393,19 +395,21 @@ void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) { mem_->Ref(); } -Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { - auto result = compaction_picker_->PickCompaction(current_, log_buffer); +Compaction* ColumnFamilyData::PickCompaction( + const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + auto result = compaction_picker_->PickCompaction( + mutable_options, current_, log_buffer); return result; } -Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level, - uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { - return compaction_picker_->CompactRange(current_, input_level, output_level, - output_path_id, begin, end, - compaction_end); +Compaction* ColumnFamilyData::CompactRange( + const MutableCFOptions& mutable_cf_options, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) { + return compaction_picker_->CompactRange( + mutable_cf_options, current_, input_level, output_level, + output_path_id, begin, end, compaction_end); } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( @@ -443,11 +447,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); db_mutex->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. 
@@ -502,7 +506,7 @@ SuperVersion* ColumnFamilyData::InstallSuperVersion( // Reset SuperVersions cached in thread local storage ResetThreadLocalSuperVersions(); - RecalculateWriteStallConditions(); + RecalculateWriteStallConditions(mutable_cf_options); if (old_superversion != nullptr && old_superversion->Unref()) { old_superversion->Cleanup(); @@ -533,6 +537,7 @@ bool ColumnFamilyData::SetOptions( if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map, &new_mutable_cf_options)) { mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); return true; } return false; diff --git a/db/column_family.h b/db/column_family.h index 65b4b53ba..9c415c2a8 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -203,11 +203,14 @@ class ColumnFamilyData { TableCache* table_cache() const { return table_cache_.get(); } // See documentation in compaction_picker.h - Compaction* PickCompaction(LogBuffer* log_buffer); - Compaction* CompactRange(int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end); + // REQUIRES: DB mutex held + Compaction* PickCompaction(const MutableCFOptions& mutable_options, + LogBuffer* log_buffer); + Compaction* CompactRange( + const MutableCFOptions& mutable_cf_options, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); CompactionPicker* compaction_picker() { return compaction_picker_.get(); } // thread-safe @@ -260,7 +263,8 @@ class ColumnFamilyData { // recalculation of compaction score. These values are used in // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall - void RecalculateWriteStallConditions(); + void RecalculateWriteStallConditions( + const MutableCFOptions& mutable_cf_options); uint32_t id_; const std::string name_; diff --git a/db/compaction.cc b/db/compaction.cc index 28a3174b0..f02feeee7 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -56,7 +56,6 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, is_full_compaction_(false), is_manual_compaction_(false), level_ptrs_(std::vector(number_levels_)) { - cfd_->Ref(); input_version_->Ref(); edit_ = new VersionEdit(); @@ -267,12 +266,12 @@ void Compaction::Summary(char* output, int len) { snprintf(output + write, len - write, "]"); } -uint64_t Compaction::OutputFilePreallocationSize() { +uint64_t Compaction::OutputFilePreallocationSize( + const MutableCFOptions& mutable_options) { uint64_t preallocation_size = 0; if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - preallocation_size = - cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); + preallocation_size = mutable_options.MaxFileSizeForLevel(output_level()); } else { for (int level = 0; level < num_input_levels(); ++level) { for (const auto& f : inputs_[level].files) { diff --git a/db/compaction.h b/db/compaction.h index 6000f636b..7c490946a 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -10,6 +10,7 @@ #pragma once #include "util/arena.h" #include "util/autovector.h" +#include "util/mutable_cf_options.h" #include "db/version_set.h" namespace rocksdb { @@ -151,10 +152,14 @@ class Compaction { // Was this compaction triggered manually by the client? 
bool IsManualCompaction() { return is_manual_compaction_; } + // Return the MutableCFOptions that should be used throughout the compaction + // procedure + const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; } + // Returns the size in bytes that the output file should be preallocated to. // In level compaction, that is max_file_size_. In universal compaction, that // is the sum of all input file sizes. - uint64_t OutputFilePreallocationSize(); + uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options); private: friend class CompactionPicker; @@ -171,6 +176,7 @@ class Compaction { const int output_level_; // levels to which output files are stored uint64_t max_output_file_size_; uint64_t max_grandparent_overlap_bytes_; + MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit* edit_; int number_levels_; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index add3556d8..84bd95839 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -35,70 +35,36 @@ namespace { // If enable_compression is false, then compression is always disabled no // matter what the values of the other two parameters are. // Otherwise, the compression type is determined based on options and level. -CompressionType GetCompressionType(const Options& options, int level, - const bool enable_compression = true) { +CompressionType GetCompressionType( + const ImmutableCFOptions& ioptions, int level, + const bool enable_compression = true) { if (!enable_compression) { // disable compression return kNoCompression; } // If the use has specified a different compression level for each level, // then pick the compression for that level. - if (!options.compression_per_level.empty()) { - const int n = options.compression_per_level.size() - 1; + if (!ioptions.compression_per_level.empty()) { + const int n = ioptions.compression_per_level.size() - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file // belongs to. Likewise, if level is beyond the end of the // specified compression levels, use the last value. - return options.compression_per_level[std::max(0, std::min(level, n))]; + return ioptions.compression_per_level[std::max(0, std::min(level, n))]; } else { - return options.compression; + return ioptions.compression; } } -// Multiple two operands. If they overflow, return op1. 
-uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { - if (op1 == 0) { - return 0; - } - if (op2 <= 0) { - return op1; - } - uint64_t casted_op2 = (uint64_t) op2; - if (std::numeric_limits::max() / op1 < casted_op2) { - return op1; - } - return op1 * casted_op2; -} } // anonymous namespace -CompactionPicker::CompactionPicker(const Options* options, +CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : compactions_in_progress_(options->num_levels), - options_(options), - num_levels_(options->num_levels), + : ioptions_(ioptions), + compactions_in_progress_(ioptions_.num_levels), icmp_(icmp) { - - max_file_size_.reset(new uint64_t[NumberLevels()]); - level_max_bytes_.reset(new uint64_t[NumberLevels()]); - int target_file_size_multiplier = options_->target_file_size_multiplier; - int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; - for (int i = 0; i < NumberLevels(); i++) { - if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { - max_file_size_[i] = ULLONG_MAX; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } else if (i > 1) { - max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1], - target_file_size_multiplier); - level_max_bytes_[i] = MultiplyCheckOverflow( - MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier), - options_->max_bytes_for_level_multiplier_additional[i - 1]); - } else { - max_file_size_[i] = options_->target_file_size_base; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } - } } CompactionPicker::~CompactionPicker() {} @@ -126,26 +92,6 @@ void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { } } -uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return max_file_size_[level]; -} - -uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->max_grandparent_overlap_factor; - return result; -} - -double CompactionPicker::MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - assert(level >= 0); - assert(level < NumberLevels()); - return level_max_bytes_[level]; -} - void CompactionPicker::GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest) { assert(!inputs.empty()); @@ -214,7 +160,7 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. 
int parent_index = -1; if (c->inputs_[0].empty()) { - Log(options_->info_log, + Log(ioptions_.info_log, "[%s] ExpandWhileOverlapping() failure because zero input files", c->column_family_data()->GetName().c_str()); } @@ -229,12 +175,6 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { return true; } -uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->expanded_compaction_factor; - return result; -} - // Returns true if any one of specified files are being compacted bool CompactionPicker::FilesInCompaction(std::vector& files) { for (unsigned int i = 0; i < files.size(); i++) { @@ -262,7 +202,8 @@ bool CompactionPicker::ParentRangeInCompaction(Version* version, // Will also attempt to expand "level" if that doesn't expand "level+1" // or cause "level" to include a file for compaction that has an overlapping // user-key with another file. -void CompactionPicker::SetupOtherInputs(Compaction* c) { +void CompactionPicker::SetupOtherInputs( + const MutableCFOptions& mutable_cf_options, Compaction* c) { // If inputs are empty, then there is nothing to expand. // If both input and output levels are the same, no need to consider // files at level "level+1" @@ -298,7 +239,7 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { const uint64_t inputs0_size = TotalCompensatedFileSize(c->inputs_[0].files); const uint64_t inputs1_size = TotalCompensatedFileSize(c->inputs_[1].files); const uint64_t expanded0_size = TotalCompensatedFileSize(expanded0); - uint64_t limit = ExpandedCompactionByteSizeLimit(level); + uint64_t limit = mutable_cf_options.ExpandedCompactionByteSizeLimit(level); if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && @@ -311,7 +252,7 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { - Log(options_->info_log, + Log(ioptions_.info_log, "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64 " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n", c->column_family_data()->GetName().c_str(), level, @@ -336,21 +277,20 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { } } -Compaction* CompactionPicker::CompactRange(Version* version, int input_level, - int output_level, - uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { +Compaction* CompactionPicker::CompactRange( + const MutableCFOptions& mutable_cf_options, Version* version, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) { // CompactionPickerFIFO has its own implementation of compact range - assert(options_->compaction_style != kCompactionStyleFIFO); + assert(ioptions_.compaction_style != kCompactionStyleFIFO); std::vector inputs; bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. - if (options_->compaction_style == kCompactionStyleUniversal) { + if (ioptions_.compaction_style == kCompactionStyleUniversal) { begin = nullptr; end = nullptr; } @@ -364,8 +304,8 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, // and we must not pick one file and drop another older file if the // two files overlap. 
if (input_level > 0) { - const uint64_t limit = - MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + const uint64_t limit = mutable_cf_options.MaxFileSizeForLevel(input_level) * + mutable_cf_options.source_compaction_factor; uint64_t total = 0; for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->compensated_file_size; @@ -378,22 +318,24 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, } } } - assert(output_path_id < static_cast(options_->db_paths.size())); + assert(output_path_id < static_cast(ioptions_.db_paths.size())); Compaction* c = new Compaction( - version, input_level, output_level, MaxFileSizeForLevel(output_level), - MaxGrandParentOverlapBytes(input_level), output_path_id, - GetCompressionType(*options_, output_level)); + version, input_level, output_level, + mutable_cf_options.MaxFileSizeForLevel(output_level), + mutable_cf_options.MaxGrandParentOverlapBytes(input_level), + output_path_id, + GetCompressionType(ioptions_, output_level)); c->inputs_[0].files = inputs; if (ExpandWhileOverlapping(c) == false) { delete c; - Log(options_->info_log, + Log(ioptions_.info_log, "[%s] Could not compact due to expansion failure.\n", version->cfd_->GetName().c_str()); return nullptr; } - SetupOtherInputs(c); + SetupOtherInputs(mutable_cf_options, c); if (covering_the_whole_range) { *compaction_end = nullptr; @@ -408,12 +350,14 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, c->SetupBottomMostLevel(true); c->is_manual_compaction_ = true; + c->mutable_cf_options_ = mutable_cf_options; return c; } -Compaction* LevelCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { +Compaction* LevelCompactionPicker::PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) { Compaction* c = nullptr; int level = -1; @@ -421,7 +365,7 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - version->ComputeCompactionScore(size_being_compacted); + version->ComputeCompactionScore(mutable_cf_options, size_being_compacted); // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. 
@@ -432,7 +376,8 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, version->compaction_score_[i] <= version->compaction_score_[i - 1]); level = version->compaction_level_[i]; if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(version, level, version->compaction_score_[i]); + c = PickCompactionBySize(mutable_cf_options, version, level, + version->compaction_score_[i]); if (c == nullptr || ExpandWhileOverlapping(c) == false) { delete c; c = nullptr; @@ -472,7 +417,7 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, } // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(c); + SetupOtherInputs(mutable_cf_options, c); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); @@ -483,12 +428,13 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, // remember this currently undergoing compaction compactions_in_progress_[level].insert(c); + c->mutable_cf_options_ = mutable_cf_options; return c; } -Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, - int level, - double score) { +Compaction* LevelCompactionPicker::PickCompactionBySize( + const MutableCFOptions& mutable_cf_options, + Version* version, int level, double score) { Compaction* c = nullptr; // level 0 files are overlapping. So we cannot pick more @@ -501,9 +447,10 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, assert(level >= 0); assert(level + 1 < NumberLevels()); - c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), 0, - GetCompressionType(*options_, level + 1)); + c = new Compaction(version, level, level + 1, + mutable_cf_options.MaxFileSizeForLevel(level + 1), + mutable_cf_options.MaxGrandParentOverlapBytes(level), 0, + GetCompressionType(ioptions_, level + 1)); c->score_ = score; // Pick the largest file in this level that is not already @@ -563,13 +510,14 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, // Universal style of compaction. Pick files that are contiguous in // time-range to compact. // -Compaction* UniversalCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { +Compaction* UniversalCompactionPicker::PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) { int level = 0; double score = version->compaction_score_[0]; if ((version->files_[level].size() < - (unsigned int)options_->level0_file_num_compaction_trigger)) { + (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger)) { LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", version->cfd_->GetName().c_str()); return nullptr; @@ -581,17 +529,18 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, // Check for size amplification first. Compaction* c; - if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != - nullptr) { + if ((c = PickCompactionUniversalSizeAmp( + mutable_cf_options, version, score, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", version->cfd_->GetName().c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. 
- unsigned int ratio = options_->compaction_options_universal.size_ratio; + unsigned int ratio = ioptions_.compaction_options_universal.size_ratio; - if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, - log_buffer)) != nullptr) { + if ((c = PickCompactionUniversalReadAmp( + mutable_cf_options, version, score, ratio, + UINT_MAX, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", version->cfd_->GetName().c_str()); } else { @@ -600,9 +549,10 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. unsigned int num_files = version->files_[level].size() - - options_->level0_file_num_compaction_trigger; + mutable_cf_options.level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( - version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { + mutable_cf_options, version, score, UINT_MAX, + num_files, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", version->cfd_->GetName().c_str()); } @@ -628,7 +578,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, c->bottommost_level_ = c->inputs_[0].files.back() == last_file; // update statistics - MeasureTime(options_->statistics.get(), + MeasureTime(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[0].size()); // mark all the files that are being compacted @@ -642,11 +592,12 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, c->is_full_compaction_ = (c->inputs_[0].size() == c->input_version_->files_[0].size()); + c->mutable_cf_options_ = mutable_cf_options; return c; } -uint32_t UniversalCompactionPicker::GetPathId(const Options& options, - uint64_t file_size) { +uint32_t UniversalCompactionPicker::GetPathId( + const ImmutableCFOptions& ioptions, uint64_t file_size) { // Two conditions need to be satisfied: // (1) the target path needs to be able to hold the file's size // (2) Total size left in this and previous paths need to be not @@ -662,11 +613,11 @@ uint32_t UniversalCompactionPicker::GetPathId(const Options& options, // considered in this algorithm. So the target size can be violated in // that case. We need to improve it. uint64_t accumulated_size = 0; - uint64_t future_size = - file_size * (100 - options.compaction_options_universal.size_ratio) / 100; + uint64_t future_size = file_size * + (100 - ioptions.compaction_options_universal.size_ratio) / 100; uint32_t p = 0; - for (; p < options.db_paths.size() - 1; p++) { - uint64_t target_size = options.db_paths[p].target_size; + for (; p < ioptions.db_paths.size() - 1; p++) { + uint64_t target_size = ioptions.db_paths[p].target_size; if (target_size > file_size && accumulated_size + (target_size - file_size) > future_size) { return p; @@ -681,14 +632,15 @@ uint32_t UniversalCompactionPicker::GetPathId(const Options& options, // the next file in time order. 
// Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( - Version* version, double score, unsigned int ratio, + const MutableCFOptions& mutable_cf_options, Version* version, + double score, unsigned int ratio, unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { int level = 0; unsigned int min_merge_width = - options_->compaction_options_universal.min_merge_width; + ioptions_.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - options_->compaction_options_universal.max_merge_width; + ioptions_.compaction_options_universal.max_merge_width; // The files are sorted from newest first to oldest last. const auto& files = version->files_[level]; @@ -750,7 +702,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( if (sz < static_cast(f->fd.GetFileSize())) { break; } - if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { + if (ioptions_.compaction_options_universal.stop_style == + kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. sz = (f->fd.GetFileSize() * (100.0 + ratio)) / 100.0; @@ -794,7 +747,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // size ratio of compression. bool enable_compression = true; int ratio_to_compress = - options_->compaction_options_universal.compression_size_percent; + ioptions_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { uint64_t total_size = version->NumLevelBytes(level); uint64_t older_file_size = 0; @@ -812,11 +765,12 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( for (unsigned int i = 0; i < first_index_after; i++) { estimated_total_size += files[i]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(*options_, estimated_total_size); + uint32_t path_id = GetPathId(ioptions_, estimated_total_size); Compaction* c = new Compaction( - version, level, level, MaxFileSizeForLevel(level), LLONG_MAX, path_id, - GetCompressionType(*options_, level, enable_compression)); + version, level, level, mutable_cf_options.MaxFileSizeForLevel(level), + LLONG_MAX, path_id, GetCompressionType(ioptions_, level, + enable_compression)); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { @@ -841,11 +795,12 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // min_merge_width and max_merge_width). // Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( - Version* version, double score, LogBuffer* log_buffer) { + const MutableCFOptions& mutable_cf_options, Version* version, + double score, LogBuffer* log_buffer) { int level = 0; // percentage flexibilty while reducing size amplification - uint64_t ratio = options_->compaction_options_universal. + uint64_t ratio = ioptions_.compaction_options_universal. max_size_amplification_percent; // The files are sorted from newest first to oldest last. @@ -927,13 +882,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( for (unsigned int loop = start_index; loop < files.size(); loop++) { estimated_total_size += files[loop]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(*options_, estimated_total_size); + uint32_t path_id = GetPathId(ioptions_, estimated_total_size); // create a compaction request // We always compact all the files, so always compress. 
Compaction* c = - new Compaction(version, level, level, MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(*options_, level)); + new Compaction(version, level, level, + mutable_cf_options.MaxFileSizeForLevel(level), + LLONG_MAX, path_id, GetCompressionType(ioptions_, level)); c->score_ = score; for (unsigned int loop = start_index; loop < files.size(); loop++) { f = c->input_version_->files_[level][loop]; @@ -948,22 +904,23 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( return c; } -Compaction* FIFOCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { +Compaction* FIFOCompactionPicker::PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) { assert(version->NumberLevels() == 1); uint64_t total_size = 0; for (const auto& file : version->files_[0]) { total_size += file->compensated_file_size; } - if (total_size <= options_->compaction_options_fifo.max_table_files_size || + if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || version->files_[0].size() == 0) { // total size not exceeded LogToBuffer(log_buffer, "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 ", max size %" PRIu64 "\n", version->cfd_->GetName().c_str(), total_size, - options_->compaction_options_fifo.max_table_files_size); + ioptions_.compaction_options_fifo.max_table_files_size); return nullptr; } @@ -988,28 +945,29 @@ Compaction* FIFOCompactionPicker::PickCompaction(Version* version, LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", version->cfd_->GetName().c_str(), f->fd.GetNumber(), tmp_fsize); - if (total_size <= options_->compaction_options_fifo.max_table_files_size) { + if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { break; } } c->MarkFilesBeingCompacted(true); compactions_in_progress_[0].insert(c); - + c->mutable_cf_options_ = mutable_cf_options; return c; } Compaction* FIFOCompactionPicker::CompactRange( + const MutableCFOptions& mutable_cf_options, Version* version, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get()); - Compaction* c = PickCompaction(version, &log_buffer); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + Compaction* c = PickCompaction(mutable_cf_options, version, &log_buffer); if (c != nullptr) { - assert(output_path_id < static_cast(options_->db_paths.size())); + assert(output_path_id < static_cast(ioptions_.db_paths.size())); c->output_path_id_ = output_path_id; } log_buffer.FlushBufferToLog(); diff --git a/db/compaction_picker.h b/db/compaction_picker.h index c1e27c471..9862bdfea 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -13,6 +13,7 @@ #include "rocksdb/status.h" #include "rocksdb/options.h" #include "rocksdb/env.h" +#include "util/mutable_cf_options.h" #include #include @@ -26,15 +27,17 @@ class Version; class CompactionPicker { public: - CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + CompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp); virtual ~CompactionPicker(); // Pick level and inputs for a new compaction. // Returns nullptr if there is no compaction to be done. 
// Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction(Version* version, - LogBuffer* log_buffer) = 0; + virtual Compaction* PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -47,11 +50,11 @@ class CompactionPicker { // compaction_end will be set to nullptr. // Client is responsible for compaction_end storage -- when called, // *compaction_end should point to valid InternalKey! - virtual Compaction* CompactRange(Version* version, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end); + virtual Compaction* CompactRange( + const MutableCFOptions& mutable_cf_options, Version* version, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); // Given the current number of levels, returns the lowest allowed level // for compaction input. @@ -64,19 +67,8 @@ class CompactionPicker { // compactions per level void SizeBeingCompacted(std::vector& sizes); - // Returns maximum total overlap bytes with grandparent - // level (i.e., level+2) before we stop building a single - // file in level->level+1 compaction. - uint64_t MaxGrandParentOverlapBytes(int level); - - // Returns maximum total bytes of data on a given level. - double MaxBytesForLevel(int level); - - // Get the max file size in a given level. - uint64_t MaxFileSizeForLevel(int level) const; - protected: - int NumberLevels() const { return num_levels_; } + int NumberLevels() const { return ioptions_.num_levels; } // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. @@ -103,8 +95,6 @@ class CompactionPicker { // Will return false if it is impossible to apply this compaction. bool ExpandWhileOverlapping(Compaction* c); - uint64_t ExpandedCompactionByteSizeLimit(int level); - // Returns true if any one of the specified files are being compacted bool FilesInCompaction(std::vector& files); @@ -113,32 +103,30 @@ class CompactionPicker { const InternalKey* largest, int level, int* index); - void SetupOtherInputs(Compaction* c); + void SetupOtherInputs(const MutableCFOptions& mutable_cf_options, + Compaction* c); + + const ImmutableCFOptions& ioptions_; // record all the ongoing compactions for all levels std::vector> compactions_in_progress_; - // Per-level target file size. - std::unique_ptr max_file_size_; - - // Per-level max bytes - std::unique_ptr level_max_bytes_; - - const Options* const options_; private: - int num_levels_; - const InternalKeyComparator* const icmp_; + + int max_grandparent_overlap_factor_; + int expanded_compaction_factor_; }; class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const Options* options, + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} - virtual Compaction* PickCompaction(Version* version, - LogBuffer* log_buffer) override; + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) override; // The maxinum allowed input level. Always return 0. 
virtual int MaxInputLevel(int current_num_levels) const override { @@ -147,27 +135,30 @@ class UniversalCompactionPicker : public CompactionPicker { private: // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp(Version* version, double score, - unsigned int ratio, - unsigned int num_files, - LogBuffer* log_buffer); + Compaction* PickCompactionUniversalReadAmp( + const MutableCFOptions& mutable_cf_options, + Version* version, double score, unsigned int ratio, + unsigned int num_files, LogBuffer* log_buffer); // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp(Version* version, double score, - LogBuffer* log_buffer); + Compaction* PickCompactionUniversalSizeAmp( + const MutableCFOptions& mutable_cf_options, + Version* version, double score, LogBuffer* log_buffer); // Pick a path ID to place a newly generated file, with its estimated file // size. - static uint32_t GetPathId(const Options& options, uint64_t file_size); + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + uint64_t file_size); }; class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const Options* options, + LevelCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} - virtual Compaction* PickCompaction(Version* version, - LogBuffer* log_buffer) override; + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) override; // Returns current_num_levels - 2, meaning the last level cannot be // compaction input level. @@ -180,23 +171,25 @@ class LevelCompactionPicker : public CompactionPicker { // Returns nullptr if there is no compaction to be done. // If level is 0 and there is already a compaction on that level, this // function will return nullptr. - Compaction* PickCompactionBySize(Version* version, int level, double score); + Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, + Version* version, int level, double score); }; class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const Options* options, + FIFOCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} + : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(Version* version, - LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction( + const MutableCFOptions& mutable_cf_options, + Version* version, LogBuffer* log_buffer) override; - virtual Compaction* CompactRange(Version* version, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) override; + virtual Compaction* CompactRange( + const MutableCFOptions& mutable_cf_options, Version* version, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) override; // The maxinum allowed input level. Always return 0. 
virtual int MaxInputLevel(int current_num_levels) const override { diff --git a/db/db_impl.cc b/db/db_impl.cc index 1a2b7f7b2..680a22cb3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1410,7 +1410,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // VersionSet::next_file_number_ always to be strictly greater than any // log number versions_->MarkFileNumberUsed(max_log_number + 1); - status = versions_->LogAndApply(cfd, edit, &mutex_); + status = versions_->LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); if (!status.ok()) { // Recovery failed break; @@ -1479,8 +1480,9 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, } Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - autovector& mems, VersionEdit* edit, - uint64_t* filenumber, LogBuffer* log_buffer) { + const MutableCFOptions& mutable_cf_options, + const autovector& mems, + VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; @@ -1560,7 +1562,8 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, if (base != nullptr && db_options_.max_background_compactions <= 1 && db_options_.max_background_flushes == 0 && cfd->ioptions()->compaction_style == kCompactionStyleLevel) { - level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); + level = base->PickLevelForMemTableOutput( + mutable_cf_options, min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, @@ -1577,10 +1580,9 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, return s; } -Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, - bool* madeProgress, - DeletionState& deletion_state, - LogBuffer* log_buffer) { +Status DBImpl::FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { mutex_.AssertHeld(); assert(cfd->imm()->size() != 0); assert(cfd->imm()->IsFlushPending()); @@ -1607,8 +1609,10 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, edit->SetLogNumber(mems.back()->GetNextLogNumber()); edit->SetColumnFamily(cfd->GetID()); + // This will release and re-acquire the mutex. 
- Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer); + Status s = WriteLevel0Table(cfd, mutable_cf_options, mems, edit, + &file_number, log_buffer); if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) { s = Status::ShutdownInProgress( @@ -1620,14 +1624,13 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, } else { // Replace immutable memtable with the generated Table s = cfd->imm()->InstallMemtableFlushResults( - cfd, mems, versions_.get(), &mutex_, db_options_.info_log.get(), - file_number, &pending_outputs_, &deletion_state.memtables_to_free, - db_directory_.get(), log_buffer); + cfd, mutable_cf_options, mems, versions_.get(), &mutex_, + db_options_.info_log.get(), file_number, &pending_outputs_, + &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); } if (s.ok()) { - // Use latest MutableCFOptions - InstallSuperVersion(cfd, deletion_state); + InstallSuperVersion(cfd, deletion_state, mutable_cf_options); if (madeProgress) { *madeProgress = 1; } @@ -1726,7 +1729,8 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, } // return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { +int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, int level) { mutex_.AssertHeld(); Version* current = cfd->current(); int minimum_level = level; @@ -1734,7 +1738,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { // stop if level i is not empty if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (cfd->compaction_picker()->MaxBytesForLevel(i) < + if (mutable_cf_options.MaxBytesForLevel(i) < current->NumLevelBytes(level)) { break; } @@ -1770,10 +1774,12 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { bg_cv_.Wait(); } + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); // move to a smaller level int to_level = target_level; if (target_level < 0) { - to_level = FindMinimumEmptyLevelFitting(cfd, level); + to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); } assert(to_level <= level); @@ -1794,9 +1800,10 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Log(db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); - // Use latest MutableCFOptions - superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); + status = versions_->LogAndApply(cfd, + mutable_cf_options, &edit, &mutex_, db_directory_.get()); + superversion_to_free = cfd->InstallSuperVersion( + new_superversion, &mutex_, mutable_cf_options); new_superversion = nullptr; Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), @@ -2058,6 +2065,8 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, for (auto cfd : *versions_->GetColumnFamilySet()) { cfd->Ref(); Status flush_status; + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); while (flush_status.ok() && cfd->imm()->IsFlushPending()) { LogToBuffer( log_buffer, @@ -2065,8 +2074,8 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, "family [%s], flush slots available %d", cfd->GetName().c_str(), db_options_.max_background_flushes - bg_flush_scheduled_); - flush_status = FlushMemTableToOutputFile(cfd, madeProgress, - 
deletion_state, log_buffer); + flush_status = FlushMemTableToOutputFile( + cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); } if (call_status.ok() && !flush_status.ok()) { call_status = flush_status; @@ -2259,6 +2268,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // FLUSH preempts compaction Status flush_stat; for (auto cfd : *versions_->GetColumnFamilySet()) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); while (cfd->imm()->IsFlushPending()) { LogToBuffer( log_buffer, @@ -2266,8 +2277,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, "compaction slots available %d", db_options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); - flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, - log_buffer); + flush_stat = FlushMemTableToOutputFile( + cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); cfd->Unref(); if (!flush_stat.ok()) { if (is_manual) { @@ -2281,15 +2292,18 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } } + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion unique_ptr c; InternalKey manual_end_storage; InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(m->in_progress); - c.reset(m->cfd->CompactRange(m->input_level, m->output_level, - m->output_path_id, m->begin, m->end, - &manual_end)); + c.reset(m->cfd->CompactRange( + *m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level, + m->output_path_id, m->begin, m->end, &manual_end)); if (!c) { m->done = true; } @@ -2306,7 +2320,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // no need to refcount in iteration since it's always under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { if (!cfd->options()->disable_auto_compactions) { - c.reset(cfd->PickCompaction(log_buffer)); + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. 
Need to make sure mutex is held + // until we make a copy in the following code + c.reset(cfd->PickCompaction( + *cfd->GetLatestMutableCFOptions(), log_buffer)); if (c != nullptr) { // update statistics MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, @@ -2331,10 +2349,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, - db_directory_.get()); - // Use latest MutableCFOptions - InstallSuperVersion(c->column_family_data(), deletion_state); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), c->edit(), + &mutex_, db_directory_.get()); + InstallSuperVersion(c->column_family_data(), deletion_state, + *c->mutable_cf_options()); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); @@ -2348,10 +2367,12 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); - status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, - db_directory_.get()); + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), + c->edit(), &mutex_, db_directory_.get()); // Use latest MutableCFOptions - InstallSuperVersion(c->column_family_data(), deletion_state); + InstallSuperVersion(c->column_family_data(), deletion_state, + *c->mutable_cf_options()); Version::LevelSummaryStorage tmp; LogToBuffer( @@ -2366,7 +2387,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else { MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. 
CompactionState* compact = new CompactionState(c.get()); - status = DoCompactionWork(compact, deletion_state, log_buffer); + status = DoCompactionWork(compact, *c->mutable_cf_options(), + deletion_state, log_buffer); CleanupCompaction(compact, status); c->ReleaseCompactionFiles(status); c->ReleaseInputs(); @@ -2468,7 +2490,8 @@ void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { } } -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { +Status DBImpl::OpenCompactionOutputFile( + CompactionState* compact, const MutableCFOptions& mutable_cf_options) { assert(compact != nullptr); assert(compact->builder == nullptr); uint64_t file_number; @@ -2500,7 +2523,7 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { if (s.ok()) { compact->outfile->SetIOPriority(Env::IO_LOW); compact->outfile->SetPreallocationBlockSize( - compact->compaction->OutputFilePreallocationSize()); + compact->compaction->OutputFilePreallocationSize(mutable_cf_options)); ColumnFamilyData* cfd = compact->compaction->column_family_data(); compact->builder.reset(NewTableBuilder( @@ -2570,7 +2593,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, Status DBImpl::InstallCompactionResults(CompactionState* compact, - LogBuffer* log_buffer) { + const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer) { mutex_.AssertHeld(); // paranoia: verify that the files that we started with @@ -2604,6 +2627,7 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->column_family_data(), + mutable_cf_options, compact->compaction->edit(), &mutex_, db_directory_.get()); } @@ -2635,8 +2659,8 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( } uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, - DeletionState& deletion_state, - LogBuffer* log_buffer) { + const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, + LogBuffer* log_buffer) { if (db_options_.max_background_flushes > 0) { // flush thread will take care of this return 0; @@ -2646,7 +2670,8 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, mutex_.Lock(); if (cfd->imm()->IsFlushPending()) { cfd->Ref(); - FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); + FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, + deletion_state, log_buffer); cfd->Unref(); bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary } @@ -2658,6 +2683,7 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, } Status DBImpl::ProcessKeyValueCompaction( + const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, @@ -2721,7 +2747,8 @@ Status DBImpl::ProcessKeyValueCompaction( // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. 
we should also check if flush is necessary on // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); + imm_micros += CallFlushDuringCompaction( + cfd, mutable_cf_options, deletion_state, log_buffer); Slice key; Slice value; @@ -2922,7 +2949,7 @@ Status DBImpl::ProcessKeyValueCompaction( // Open output file if necessary if (compact->builder == nullptr) { - status = OpenCompactionOutputFile(compact); + status = OpenCompactionOutputFile(compact, mutable_cf_options); if (!status.ok()) { break; } @@ -3059,6 +3086,7 @@ void DBImpl::CallCompactionFilterV2(CompactionState* compact, } Status DBImpl::DoCompactionWork(CompactionState* compact, + const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, LogBuffer* log_buffer) { assert(compact); @@ -3129,6 +3157,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (!compaction_filter_v2) { status = ProcessKeyValueCompaction( + mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3158,7 +3187,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. we should also check if flush is necessary on // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); + imm_micros += CallFlushDuringCompaction(cfd, mutable_cf_options, + deletion_state, log_buffer); Slice key = backup_input->key(); Slice value = backup_input->value(); @@ -3208,6 +3238,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // Done buffering for the current prefix. Spit it out to disk // Now just iterate through all the kv-pairs status = ProcessKeyValueCompaction( + mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3244,6 +3275,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( + mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3266,6 +3298,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( + mutable_cf_options, is_snapshot_supported, visible_at_tip, earliest_snapshot, @@ -3333,9 +3366,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, ReleaseCompactionUnusedFileNumbers(compact); if (status.ok()) { - status = InstallCompactionResults(compact, log_buffer); - // Use latest MutableCFOptions - InstallSuperVersion(cfd, deletion_state); + status = InstallCompactionResults(compact, mutable_cf_options, log_buffer); + InstallSuperVersion(cfd, deletion_state, mutable_cf_options); } Version::LevelSummaryStorage tmp; LogToBuffer( @@ -3434,16 +3466,16 @@ Status DBImpl::Get(const ReadOptions& options, // first call already used it. In that rare case, we take a hit and create a // new SuperVersion() inside of the mutex. We do similar thing // for superversion_to_free -void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state) { +void DBImpl::InstallSuperVersion( + ColumnFamilyData* cfd, DeletionState& deletion_state, + const MutableCFOptions& mutable_cf_options) { mutex_.AssertHeld(); // if new_superversion == nullptr, it means somebody already used it SuperVersion* new_superversion = (deletion_state.new_superversion != nullptr) ? 
deletion_state.new_superversion : new SuperVersion(); - // Use latest MutableCFOptions SuperVersion* old_superversion = - cfd->InstallSuperVersion(new_superversion, &mutex_); + cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options); deletion_state.new_superversion = nullptr; deletion_state.superversions_to_free.push_back(old_superversion); } @@ -3627,15 +3659,17 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - Status s = versions_->LogAndApply(nullptr, &edit, &mutex_, - db_directory_.get(), false, &options); + Options opt(db_options_, options); + Status s = versions_->LogAndApply(nullptr, + MutableCFOptions(opt, ImmutableCFOptions(opt)), + &edit, &mutex_, db_directory_.get(), false, &options); if (s.ok()) { single_column_family_mode_ = false; auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - // Use latest MutableCFOptions - delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); + delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_, + *cfd->GetLatestMutableCFOptions()); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); Log(db_options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); @@ -3671,7 +3705,8 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { WriteThread::Writer w(&mutex_); s = write_thread_.EnterWriteThread(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job - s = versions_->LogAndApply(cfd, &edit, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_); write_thread_.ExitWriteThread(&w, &w, s); } } @@ -4450,9 +4485,11 @@ Status DBImpl::DeleteFile(std::string name) { } } edit.DeleteFile(level, number); - status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, db_directory_.get()); if (status.ok()) { - InstallSuperVersion(cfd, deletion_state); + InstallSuperVersion(cfd, deletion_state, + *cfd->GetLatestMutableCFOptions()); } FindObsoleteFiles(deletion_state, false); } // lock released here @@ -4681,8 +4718,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - // Use latest MutableCFOptions - delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_, + *cfd->GetLatestMutableCFOptions()); } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); diff --git a/db/db_impl.h b/db/db_impl.h index c6baf9c95..f1a81e00c 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -347,9 +347,9 @@ class DBImpl : public DB { // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. 
- Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress, - DeletionState& deletion_state, - LogBuffer* log_buffer); + Status FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer); // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, @@ -362,9 +362,10 @@ class DBImpl : public DB { // concurrent flush memtables to storage. Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(ColumnFamilyData* cfd, autovector& mems, - VersionEdit* edit, uint64_t* filenumber, - LogBuffer* log_buffer); + Status WriteLevel0Table(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + const autovector& mems, + VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); void DelayWrite(uint64_t expiration_time); @@ -393,6 +394,7 @@ class DBImpl : public DB { LogBuffer* log_buffer); void CleanupCompaction(CompactionState* compact, Status status); Status DoCompactionWork(CompactionState* compact, + const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, LogBuffer* log_buffer); @@ -400,12 +402,13 @@ class DBImpl : public DB { // preempt compaction, since it's higher prioirty // Returns: micros spent executing uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, - DeletionState& deletion_state, - LogBuffer* log_buffer); + const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, + LogBuffer* log_buffer); // Call compaction filter if is_compaction_v2 is not true. Then iterate // through input and compact the kv-pairs Status ProcessKeyValueCompaction( + const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, @@ -422,10 +425,11 @@ class DBImpl : public DB { void CallCompactionFilterV2(CompactionState* compact, CompactionFilterV2* compaction_filter_v2); - Status OpenCompactionOutputFile(CompactionState* compact); + Status OpenCompactionOutputFile(CompactionState* compact, + const MutableCFOptions& mutable_cf_options); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); Status InstallCompactionResults(CompactionState* compact, - LogBuffer* log_buffer); + const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer); void AllocateCompactionOutputFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); @@ -467,7 +471,8 @@ class DBImpl : public DB { // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. - int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level); + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, int level); // Move the files in the input level to the target level. // If target_level < 0, automatically calculate the minimum level that could @@ -621,7 +626,8 @@ class DBImpl : public DB { // the cfd->InstallSuperVersion() function. Background threads carry // deletion_state which can have new_superversion already allocated. void InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state); + DeletionState& deletion_state, + const MutableCFOptions& mutable_cf_options); // Find Super version and reference it. Based on options, it might return // the thread local cached one. 
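
The db_test.cc change that follows exercises these new code paths end to end by tightening compaction parameters on a live database and checking the resulting file sizes and level shapes. From application code the same facility looks roughly like the sketch below. This is a hedged illustration, not part of the patch: it assumes the string-map SetOptions() entry point is reachable from the handle you hold (the test itself reaches it through dbfull(), i.e. DBImpl), and that the option names are the ones handled by the ParseCompactionOptions() helper added in util/options_helper.cc later in this patch.

// Hedged sketch: dynamically tightening compaction options on a running DB,
// mirroring what the DynamicCompactionOptions test below does via dbfull().
// Assumption: SetOptions(const std::unordered_map<std::string, std::string>&)
// is callable on the handle held here; values are strings parsed by the
// options_helper.cc code added in this patch.
#include <cassert>
#include <string>
#include <unordered_map>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/dynopts_demo", &db);
  assert(s.ok());

  // Trigger L0 compaction after 2 files and cap output files at 64KB.
  bool ok = db->SetOptions({
      {"level0_file_num_compaction_trigger", "2"},
      {"target_file_size_base", "65536"},
  });
  assert(ok);

  delete db;
  return 0;
}
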
diff --git a/db/db_test.cc b/db/db_test.cc index 986d5810e..a2479d58e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -874,6 +874,18 @@ class DBTest { return atoi(property.c_str()); } + uint64_t SizeAtLevel(int level) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + uint64_t sum = 0; + for (const auto& m : metadata) { + if (m.level == level) { + sum += m.size; + } + } + return sum; + } + int TotalTableFiles(int cf = 0, int levels = -1) { if (levels == -1) { levels = CurrentOptions().num_levels; @@ -8527,6 +8539,102 @@ TEST(DBTest, DisableDataSyncTest) { } } +TEST(DBTest, DynamicCompactionOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k256KB = 1 << 18; + const uint64_t k5KB = 5 * 1024; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.max_background_compactions = 4; + options.hard_rate_limit = 1.1; + options.write_buffer_size = k128KB; + options.max_write_buffer_number = 2; + // Compaction related options + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 10; + options.level0_stop_writes_trigger = 20; + options.max_grandparent_overlap_factor = 10; + options.expanded_compaction_factor = 25; + options.source_compaction_factor = 1; + options.target_file_size_base = k128KB; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = k256KB; + options.max_bytes_for_level_multiplier = 4; + DestroyAndReopen(&options); + + auto gen_l0_kb = [this](int start, int size, int stride = 1) { + Random rnd(301); + std::vector values; + for (int i = 0; i < size; i++) { + values.push_back(RandomString(&rnd, 1024)); + ASSERT_OK(Put(Key(start + stride * i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + // Write 3 files that have the same key range, trigger compaction and + // result in one L1 file + gen_l0_kb(0, 128); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + gen_l0_kb(0, 128); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + gen_l0_kb(0, 128); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1, metadata.size()); + ASSERT_LE(metadata[0].size, k128KB + k5KB); // < 128KB + 5KB + ASSERT_GE(metadata[0].size, k128KB - k5KB); // > 128B - 5KB + + // Make compaction trigger and file size smaller + ASSERT_TRUE(dbfull()->SetOptions({ + {"level0_file_num_compaction_trigger", "2"}, + {"target_file_size_base", "65536"} + })); + + gen_l0_kb(0, 128); + ASSERT_EQ("1,1", FilesPerLevel()); + gen_l0_kb(0, 128); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,2", FilesPerLevel()); + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2, metadata.size()); + ASSERT_LE(metadata[0].size, k64KB + k5KB); // < 64KB + 5KB + ASSERT_GE(metadata[0].size, k64KB - k5KB); // > 64KB - 5KB + + // Change base level size to 1MB + ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", "1048576"} })); + + // writing 56 x 128KB => 7MB + // (L1 + L2) = (1 + 4) * 1MB = 5MB + for (int i = 0; i < 56; ++i) { + gen_l0_kb(i, 128, 56); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(SizeAtLevel(1) < 1048576 * 1.1); + ASSERT_TRUE(SizeAtLevel(2) < 4 * 1048576 * 1.1); + + // Change multiplier to 2 with smaller base + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_bytes_for_level_multiplier", "2"}, + {"max_bytes_for_level_base", "262144"} + })); + + // writing 16 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB + 
for (int i = 0; i < 16; ++i) { + gen_l0_kb(i, 128, 50); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(SizeAtLevel(1) < 262144 * 1.1); + ASSERT_TRUE(SizeAtLevel(2) < 2 * 262144 * 1.1); + ASSERT_TRUE(SizeAtLevel(3) < 4 * 262144 * 1.1); +} } // namespace rocksdb diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index 3a5535d2d..eba0a2787 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -60,7 +60,8 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vbase.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); } - ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu)); + ASSERT_OK(vset->LogAndApply(default_cfd, + *default_cfd->GetLatestMutableCFOptions(), &vbase, &mu)); } for (int i = 0; i < iters; i++) { @@ -69,7 +70,8 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey start(MakeKey(2 * fnum), 1, kTypeValue); InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); - vset->LogAndApply(default_cfd, &vedit, &mu); + vset->LogAndApply(default_cfd, *default_cfd->GetLatestMutableCFOptions(), + &vedit, &mu); } delete vset; } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 728b1c0a0..bd48f1f47 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -160,7 +160,8 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - ColumnFamilyData* cfd, const autovector& mems, VersionSet* vset, + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& mems, VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, FileNumToPathIdMap* pending_outputs, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer) { @@ -197,7 +198,7 @@ Status MemTableList::InstallMemtableFlushResults( cfd->GetName().c_str(), (unsigned long)m->file_number_); // this can release and reacquire the mutex. 
- s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory); + s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory); // we will be changing the version in the next code path, // so we better create a new one, since versions are immutable diff --git a/db/memtable_list.h b/db/memtable_list.h index 92688825a..d93c7df92 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -113,7 +113,8 @@ class MemTableList { // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( - ColumnFamilyData* cfd, const autovector& m, VersionSet* vset, + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& m, VersionSet* vset, port::Mutex* mu, Logger* info_log, uint64_t file_number, FileNumToPathIdMap* pending_outputs, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); diff --git a/db/repair.cc b/db/repair.cc index 2773d4c71..80fb92bd9 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -220,7 +220,7 @@ class Repairer { Slice record; WriteBatch batch; MemTable* mem = new MemTable(icmp_, ioptions_, - MemTableOptions(MutableCFOptions(options_), options_)); + MemTableOptions(MutableCFOptions(options_, ioptions_), options_)); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; diff --git a/db/version_set.cc b/db/version_set.cc index 1d1d53813..a092277fa 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -672,7 +672,7 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, } } -void Version::Get(const ReadOptions& options, +void Version::Get(const ReadOptions& read_options, const LookupKey& k, std::string* value, Status* status, @@ -691,8 +691,8 @@ void Version::Get(const ReadOptions& options, &file_indexer_, user_comparator_, internal_comparator_); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { - *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, - &get_context); + *status = table_cache_->Get(read_options, *internal_comparator_, f->fd, + ikey, &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -746,9 +746,10 @@ void Version::GenerateFileLevels() { } } -void Version::PrepareApply(std::vector& size_being_compacted) { +void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, + std::vector& size_being_compacted) { UpdateTemporaryStats(); - ComputeCompactionScore(size_being_compacted); + ComputeCompactionScore(mutable_cf_options, size_being_compacted); UpdateFilesBySize(); UpdateNumNonEmptyLevels(); file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); @@ -817,13 +818,13 @@ void Version::UpdateTemporaryStats() { } void Version::ComputeCompactionScore( + const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted) { double max_score = 0; int max_score_level = 0; int max_input_level = cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); - for (int level = 0; level <= max_input_level; level++) { double score; if (level == 0) { @@ -849,21 +850,22 @@ void Version::ComputeCompactionScore( if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / cfd_->options()->compaction_options_fifo.max_table_files_size; - } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { + } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; - } else if (numfiles >= 
cfd_->options()->level0_slowdown_writes_trigger) { + } else if (numfiles >= + mutable_cf_options.level0_slowdown_writes_trigger) { score = 10000; } else { score = static_cast(numfiles) / - cfd_->options()->level0_file_num_compaction_trigger; + mutable_cf_options.level0_file_num_compaction_trigger; } } else { // Compute the ratio of current size to size limit. const uint64_t level_bytes = TotalCompensatedFileSize(files_[level]) - size_being_compacted[level]; score = static_cast(level_bytes) / - cfd_->compaction_picker()->MaxBytesForLevel(level); + mutable_cf_options.MaxBytesForLevel(level); if (max_score < score) { max_score = score; max_score_level = level; @@ -993,6 +995,7 @@ bool Version::OverlapInLevel(int level, } int Version::PickLevelForMemTableOutput( + const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key, const Slice& largest_user_key) { int level = 0; @@ -1013,7 +1016,7 @@ int Version::PickLevelForMemTableOutput( } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) { + if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -1246,7 +1249,7 @@ bool Version::HasOverlappingUserKey( return false; } -int64_t Version::NumLevelBytes(int level) const { +uint64_t Version::NumLevelBytes(int level) const { assert(level >= 0); assert(level < NumberLevels()); return TotalFileSize(files_[level]); @@ -1653,16 +1656,17 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, } Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, VersionEdit* edit, port::Mutex* mu, Directory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* options) { + const ColumnFamilyOptions* new_cf_options) { mu->AssertHeld(); // column_family_data can be nullptr only if this is column_family_add. // in that case, we also need to specify ColumnFamilyOptions if (column_family_data == nullptr) { assert(edit->is_column_family_add_); - assert(options != nullptr); + assert(new_cf_options != nullptr); } // queue our request @@ -1777,7 +1781,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!edit->IsColumnFamilyManipulation()) { // This is cpu-heavy operations, which should be called outside mutex. 
- v->PrepareApply(size_being_compacted); + v->PrepareApply(mutable_cf_options, size_being_compacted); } // Write new record to MANIFEST log @@ -1853,8 +1857,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (edit->is_column_family_add_) { // no group commit on column family add assert(batch_edits.size() == 1); - assert(options != nullptr); - CreateColumnFamily(*options, edit); + assert(new_cf_options != nullptr); + CreateColumnFamily(*new_cf_options, edit); } else if (edit->is_column_family_drop_) { assert(batch_edits.size() == 1); column_family_data->SetDropped(); @@ -2198,7 +2202,7 @@ Status VersionSet::Recover( // Install recovered version std::vector size_being_compacted(v->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(size_being_compacted); + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); AppendVersion(cfd, v); } @@ -2374,11 +2378,13 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, current_version->files_ = new_files_list; current_version->num_levels_ = new_levels; + MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options)); VersionEdit ve; port::Mutex dummy_mutex; MutexLock l(&dummy_mutex); - return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve, - &dummy_mutex, nullptr, true); + return versions.LogAndApply( + versions.GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &ve, &dummy_mutex, nullptr, true); } Status VersionSet::DumpManifest(Options& options, std::string& dscname, @@ -2530,7 +2536,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder->SaveTo(v); std::vector size_being_compacted(v->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(size_being_compacted); + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); delete builder; printf("--------------- Column family \"%s\" (ID %u) --------------\n", diff --git a/db/version_set.h b/db/version_set.h index 4a27a9592..05e6e9a65 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -103,14 +103,18 @@ class Version { // We use compaction scores to figure out which compaction to do next // REQUIRES: If Version is not yet saved to current_, it can be called without // a lock. Once a version is saved to current_, call only with mutex held - void ComputeCompactionScore(std::vector& size_being_compacted); + void ComputeCompactionScore( + const MutableCFOptions& mutable_cf_options, + std::vector& size_being_compacted); // Generate file_levels_ from files_ void GenerateFileLevels(); // Update scores, pre-calculated variables. It needs to be called before // applying the version to the version set. - void PrepareApply(std::vector& size_being_compacted); + void PrepareApply( + const MutableCFOptions& mutable_cf_options, + std::vector& size_being_compacted); // Reference count management (so Versions do not disappear out from // under live iterators) @@ -169,7 +173,8 @@ class Version { // Return the level at which we should place a new memtable compaction // result that covers the range [smallest_user_key,largest_user_key]. 
- int PickLevelForMemTableOutput(const Slice& smallest_user_key, + int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options, + const Slice& smallest_user_key, const Slice& largest_user_key); int NumberLevels() const { return num_levels_; } @@ -178,7 +183,7 @@ class Version { int NumLevelFiles(int level) const { return files_[level].size(); } // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; + uint64_t NumLevelBytes(int level) const; // Return a human-readable short (single-line) summary of the number // of files per level. Uses *scratch as backing store. @@ -369,7 +374,9 @@ class VersionSet { // column_family_options has to be set if edit is column family add // REQUIRES: *mu is held on entry. // REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit, + Status LogAndApply(ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, + VersionEdit* edit, port::Mutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index ba7451078..cb4048214 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -27,8 +27,9 @@ static std::string PrintContents(WriteBatch* b) { auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options), - MemTableOptions(MutableCFOptions(options), options)); + ImmutableCFOptions ioptions(options); + MemTable* mem = new MemTable(cmp, ioptions, + MemTableOptions(MutableCFOptions(options, ioptions), options)); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index 54b676626..2dd50f756 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -22,6 +22,7 @@ struct ImmutableCFOptions { CompactionStyle compaction_style; CompactionOptionsUniversal compaction_options_universal; + CompactionOptionsFIFO compaction_options_fifo; const SliceTransform* prefix_extractor; @@ -79,6 +80,8 @@ struct ImmutableCFOptions { CompressionOptions compression_opts; Options::AccessHint access_hint_on_compaction_start; + + int num_levels; }; } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index df662ad88..e4657e8cd 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -437,8 +437,9 @@ class MemTableConstructor: public Constructor { table_factory_(new SkipListFactory) { Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), - MemTableOptions(MutableCFOptions(options), options)); + ImmutableCFOptions ioptions(options); + memtable_ = new MemTable(internal_comparator_, ioptions, + MemTableOptions(MutableCFOptions(options, ioptions), options)); memtable_->Ref(); } ~MemTableConstructor() { @@ -452,8 +453,9 @@ class MemTableConstructor: public Constructor { delete memtable_->Unref(); Options options; options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, ImmutableCFOptions(options), - MemTableOptions(MutableCFOptions(options), options)); + ImmutableCFOptions mem_ioptions(options); + memtable_ = new MemTable(internal_comparator_, mem_ioptions, + 
MemTableOptions(MutableCFOptions(options, mem_ioptions), options)); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1864,8 +1866,9 @@ TEST(MemTableTest, Simple) { auto table_factory = std::make_shared(); Options options; options.memtable_factory = table_factory; - MemTable* memtable = new MemTable(cmp, ImmutableCFOptions(options), - MemTableOptions(MutableCFOptions(options), options)); + ImmutableCFOptions ioptions(options); + MemTable* memtable = new MemTable(cmp, ioptions, + MemTableOptions(MutableCFOptions(options, ioptions), options)); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc new file mode 100644 index 000000000..1c710c656 --- /dev/null +++ b/util/mutable_cf_options.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#include "util/mutable_cf_options.h" + +namespace rocksdb { + +namespace { +// Multiple two operands. If they overflow, return op1. +uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { + if (op1 == 0) { + return 0; + } + if (op2 <= 0) { + return op1; + } + uint64_t casted_op2 = (uint64_t) op2; + if (std::numeric_limits::max() / op1 < casted_op2) { + return op1; + } + return op1 * casted_op2; +} +} // anonymous namespace + +void MutableCFOptions::RefreshDerivedOptions( + const ImmutableCFOptions& ioptions) { + max_file_size.resize(ioptions.num_levels); + level_max_bytes.resize(ioptions.num_levels); + for (int i = 0; i < ioptions.num_levels; ++i) { + if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { + max_file_size[i] = ULLONG_MAX; + level_max_bytes[i] = max_bytes_for_level_base; + } else if (i > 1) { + max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1], + target_file_size_multiplier); + level_max_bytes[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes[i - 1], + max_bytes_for_level_multiplier), + max_bytes_for_level_multiplier_additional[i - 1]); + } else { + max_file_size[i] = target_file_size_base; + level_max_bytes[i] = max_bytes_for_level_base; + } + } +} + +uint64_t MutableCFOptions::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < (int)max_file_size.size()); + return max_file_size[level]; +} +uint64_t MutableCFOptions::MaxBytesForLevel(int level) const { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. 
+ assert(level >= 0); + assert(level < (int)level_max_bytes.size()); + return level_max_bytes[level]; +} +uint64_t MutableCFOptions::MaxGrandParentOverlapBytes(int level) const { + return MaxFileSizeForLevel(level) * max_grandparent_overlap_factor; +} +uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { + return MaxFileSizeForLevel(level) * expanded_compaction_factor; +} + +} // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 39ebe2d85..02f63fed4 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -5,12 +5,14 @@ #pragma once +#include #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { struct MutableCFOptions { - explicit MutableCFOptions(const Options& options) + MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions) : write_buffer_size(options.write_buffer_size), arena_block_size(options.arena_block_size), memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), @@ -18,7 +20,22 @@ struct MutableCFOptions { memtable_prefix_bloom_huge_page_tlb_size( options.memtable_prefix_bloom_huge_page_tlb_size), max_successive_merges(options.max_successive_merges), - filter_deletes(options.filter_deletes) { + filter_deletes(options.filter_deletes), + level0_file_num_compaction_trigger( + options.level0_file_num_compaction_trigger), + level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), + level0_stop_writes_trigger(options.level0_stop_writes_trigger), + max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), + expanded_compaction_factor(options.expanded_compaction_factor), + source_compaction_factor(options.source_compaction_factor), + target_file_size_base(options.target_file_size_base), + target_file_size_multiplier(options.target_file_size_multiplier), + max_bytes_for_level_base(options.max_bytes_for_level_base), + max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), + max_bytes_for_level_multiplier_additional( + options.max_bytes_for_level_multiplier_additional) + { + RefreshDerivedOptions(ioptions); } MutableCFOptions() : write_buffer_size(0), @@ -27,8 +44,33 @@ struct MutableCFOptions { memtable_prefix_bloom_probes(0), memtable_prefix_bloom_huge_page_tlb_size(0), max_successive_merges(0), - filter_deletes(false) {} + filter_deletes(false), + level0_file_num_compaction_trigger(0), + level0_slowdown_writes_trigger(0), + level0_stop_writes_trigger(0), + max_grandparent_overlap_factor(0), + expanded_compaction_factor(0), + source_compaction_factor(0), + target_file_size_base(0), + target_file_size_multiplier(0), + max_bytes_for_level_base(0), + max_bytes_for_level_multiplier(0) + {} + // Must be called after any change to MutableCFOptions + void RefreshDerivedOptions(const ImmutableCFOptions& ioptions); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + // Returns maximum total bytes of data on a given level. + uint64_t MaxBytesForLevel(int level) const; + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. 
+ uint64_t MaxGrandParentOverlapBytes(int level) const; + uint64_t ExpandedCompactionByteSizeLimit(int level) const; + + // Memtable related options size_t write_buffer_size; size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -36,6 +78,25 @@ struct MutableCFOptions { size_t memtable_prefix_bloom_huge_page_tlb_size; size_t max_successive_merges; bool filter_deletes; + + // Compaction related options + int level0_file_num_compaction_trigger; + int level0_slowdown_writes_trigger; + int level0_stop_writes_trigger; + int max_grandparent_overlap_factor; + int expanded_compaction_factor; + int source_compaction_factor; + int target_file_size_base; + int target_file_size_multiplier; + uint64_t max_bytes_for_level_base; + int max_bytes_for_level_multiplier; + std::vector max_bytes_for_level_multiplier_additional; + + // Derived options + // Per-level target file size. + std::vector max_file_size; + // Per-level max bytes + std::vector level_max_bytes; }; } // namespace rocksdb diff --git a/util/options.cc b/util/options.cc index 8716b465d..b5dc98317 100644 --- a/util/options.cc +++ b/util/options.cc @@ -35,6 +35,7 @@ namespace rocksdb { ImmutableCFOptions::ImmutableCFOptions(const Options& options) : compaction_style(options.compaction_style), compaction_options_universal(options.compaction_options_universal), + compaction_options_fifo(options.compaction_options_fifo), prefix_extractor(options.prefix_extractor.get()), comparator(options.comparator), merge_operator(options.merge_operator.get()), @@ -60,7 +61,8 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) compression(options.compression), compression_per_level(options.compression_per_level), compression_opts(options.compression_opts), - access_hint_on_compaction_start(options.access_hint_on_compaction_start) {} + access_hint_on_compaction_start(options.access_hint_on_compaction_start), + num_levels(options.num_levels) {} ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), diff --git a/util/options_helper.cc b/util/options_helper.cc index 35c3f63df..2a61c8b69 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -4,6 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#include +#include #include "rocksdb/options.h" #include "util/options_helper.h" @@ -73,8 +74,8 @@ CompactionStyle ParseCompactionStyle(const std::string& type) { } // anonymouse namespace template -bool ParseMemtableOption(const std::string& name, const std::string& value, - OptionsType* new_options) { +bool ParseMemtableOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { if (name == "write_buffer_size") { new_options->write_buffer_size = ParseInt64(value); } else if (name == "arena_block_size") { @@ -96,6 +97,50 @@ bool ParseMemtableOption(const std::string& name, const std::string& value, return true; } +template +bool ParseCompactionOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "level0_file_num_compaction_trigger") { + new_options->level0_file_num_compaction_trigger = ParseInt(value); + } else if (name == "level0_slowdown_writes_trigger") { + new_options->level0_slowdown_writes_trigger = ParseInt(value); + } else if (name == "level0_stop_writes_trigger") { + new_options->level0_stop_writes_trigger = ParseInt(value); + } else if (name == "max_grandparent_overlap_factor") { + new_options->max_grandparent_overlap_factor = ParseInt(value); + } else if (name == "expanded_compaction_factor") { + new_options->expanded_compaction_factor = ParseInt(value); + } else if (name == "source_compaction_factor") { + new_options->source_compaction_factor = ParseInt(value); + } else if (name == "target_file_size_base") { + new_options->target_file_size_base = ParseInt(value); + } else if (name == "target_file_size_multiplier") { + new_options->target_file_size_multiplier = ParseInt(value); + } else if (name == "max_bytes_for_level_base") { + new_options->max_bytes_for_level_base = ParseUint64(value); + } else if (name == "max_bytes_for_level_multiplier") { + new_options->max_bytes_for_level_multiplier = ParseInt(value); + } else if (name == "max_bytes_for_level_multiplier_additional") { + new_options->max_bytes_for_level_multiplier_additional.clear(); + size_t start = 0; + while (true) { + size_t end = value.find_first_of(':', start); + if (end == std::string::npos) { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(value.substr(start))); + break; + } else { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(value.substr(start, end - start))); + start = end + 1; + } + } + } else { + return false; + } + return true; +} + bool GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, @@ -104,7 +149,8 @@ bool GetMutableOptionsFromStrings( *new_options = base_options; try { for (const auto& o : options_map) { - if (ParseMemtableOption(o.first, o.second, new_options)) { + if (ParseMemtableOptions(o.first, o.second, new_options)) { + } else if (ParseCompactionOptions(o.first, o.second, new_options)) { } else { return false; } @@ -123,7 +169,8 @@ bool GetOptionsFromStrings( *new_options = base_options; for (const auto& o : options_map) { try { - if (ParseMemtableOption(o.first, o.second, new_options)) { + if (ParseMemtableOptions(o.first, o.second, new_options)) { + } else if (ParseCompactionOptions(o.first, o.second, new_options)) { } else if (o.first == "max_write_buffer_number") { new_options->max_write_buffer_number = ParseInt(o.second); } else if (o.first == "min_write_buffer_number_to_merge") { @@ -168,43 +215,8 @@ bool GetOptionsFromStrings( ParseInt(o.second.substr(start, o.second.size() - start)); } else 
if (o.first == "num_levels") { new_options->num_levels = ParseInt(o.second); - } else if (o.first == "level0_file_num_compaction_trigger") { - new_options->level0_file_num_compaction_trigger = ParseInt(o.second); - } else if (o.first == "level0_slowdown_writes_trigger") { - new_options->level0_slowdown_writes_trigger = ParseInt(o.second); - } else if (o.first == "level0_stop_writes_trigger") { - new_options->level0_stop_writes_trigger = ParseInt(o.second); } else if (o.first == "max_mem_compaction_level") { new_options->max_mem_compaction_level = ParseInt(o.second); - } else if (o.first == "target_file_size_base") { - new_options->target_file_size_base = ParseUint64(o.second); - } else if (o.first == "target_file_size_multiplier") { - new_options->target_file_size_multiplier = ParseInt(o.second); - } else if (o.first == "max_bytes_for_level_base") { - new_options->max_bytes_for_level_base = ParseUint64(o.second); - } else if (o.first == "max_bytes_for_level_multiplier") { - new_options->max_bytes_for_level_multiplier = ParseInt(o.second); - } else if (o.first == "max_bytes_for_level_multiplier_additional") { - new_options->max_bytes_for_level_multiplier_additional.clear(); - size_t start = 0; - while (true) { - size_t end = o.second.find_first_of(':', start); - if (end == std::string::npos) { - new_options->max_bytes_for_level_multiplier_additional.push_back( - ParseInt(o.second.substr(start))); - break; - } else { - new_options->max_bytes_for_level_multiplier_additional.push_back( - ParseInt(o.second.substr(start, end - start))); - start = end + 1; - } - } - } else if (o.first == "expanded_compaction_factor") { - new_options->expanded_compaction_factor = ParseInt(o.second); - } else if (o.first == "source_compaction_factor") { - new_options->source_compaction_factor = ParseInt(o.second); - } else if (o.first == "max_grandparent_overlap_factor") { - new_options->max_grandparent_overlap_factor = ParseInt(o.second); } else if (o.first == "soft_rate_limit") { new_options->soft_rate_limit = ParseDouble(o.second); } else if (o.first == "hard_rate_limit") { From 187b29938cf6dfce29418924a2bc80b1b9783a06 Mon Sep 17 00:00:00 2001 From: Tomislav Novak Date: Mon, 22 Sep 2014 15:20:03 -0700 Subject: [PATCH 191/829] ForwardIterator: update prev_key_ only if prefix hasn't changed Summary: Since ForwardIterator is on a level below DBIter, the latter may call Next() on it (e.g. in order to skip deletion markers). Since this also updates `prev_key_`, it may prevent the Seek() optimization. For example, assume that there's only one SST file and it contains the following entries: 0101, 0201 (`ValueType::kTypeDeletion`, i.e. a tombstone record), 0201 (`kTypeValue`), 0202. Memtable is empty. `Seek(0102)` will result in `prev_key_` being set to `0201` instead of `0102`, since `DBIter::Seek()` will call `ForwardIterator::Next()` to skip record 0201. Therefore, when `Seek(0102)` is called again, `NeedToSeekImmutable()` will return true. This fix relies on `prefix_extractor_` to detect prefix changes. `prev_key_` is only set to `current_->key()` as long as they have the same prefix. I also made a small change to `NeedToSeekImmutable()` so it no longer returns true when the db is empty (i.e. there's nothing but a memtable). 
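As an illustration, here is a minimal sketch of the repeated-Seek pattern this change speeds up. The helper function, the key literals and the two-byte prefix extractor are assumptions chosen to match the example above, not code from this patch:

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/iterator.h"
    #include "rocksdb/options.h"

    // db is an open rocksdb::DB* whose options use a fixed two-byte prefix
    // extractor and whose single SST file holds 0101, 0201 (a tombstone
    // shadowing a value) and 0202, as in the example above.
    void RepeatedSeek(rocksdb::DB* db) {
      rocksdb::ReadOptions read_options;
      read_options.tailing = true;  // reads go through ForwardIterator
      std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(read_options));

      iter->Seek("0102");
      // DBIter calls ForwardIterator::Next() to step over the tombstoned 0201.
      // Because 0201 has a different prefix than 0102, prev_key_ is no longer
      // advanced and stays at 0102.

      iter->Seek("0102");
      // Seeking to the same target again: NeedToSeekImmutable() can now
      // return false, so the immutable (SST) iterators are not re-seeked.
    }
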
Test Plan: $ TEST_TMPDIR=/dev/shm/rocksdbtest ROCKSDB_TESTS=TailingIterator ./db_test Reviewers: sdong, igor, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23823 --- db/forward_iterator.cc | 45 ++++++++++++++++++++++++++++++++---------- db/forward_iterator.h | 1 + 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 6b78c4037..684045e05 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -125,7 +125,8 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, mutable_iter_(nullptr), current_(nullptr), valid_(false), - is_prev_set_(false) {} + is_prev_set_(false), + is_prev_inclusive_(false) {} ForwardIterator::~ForwardIterator() { Cleanup(); @@ -314,11 +315,12 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } } - if (seek_to_first || immutable_min_heap_.empty()) { + if (seek_to_first) { is_prev_set_ = false; } else { prev_key_.SetKey(internal_key); is_prev_set_ = true; + is_prev_inclusive_ = true; } } else if (current_ && current_ != mutable_iter_) { // current_ is one of immutable iterators, push it back to the heap @@ -343,8 +345,20 @@ void ForwardIterator::Next() { } } else if (current_ != mutable_iter_) { // It is going to advance immutable iterator - prev_key_.SetKey(current_->key()); - is_prev_set_ = true; + + bool update_prev_key = true; + if (is_prev_set_ && prefix_extractor_) { + // advance prev_key_ to current_ only if they share the same prefix + update_prev_key = + prefix_extractor_->Transform(prev_key_.GetKey()).compare( + prefix_extractor_->Transform(current_->key())) == 0; + } + + if (update_prev_key) { + prev_key_.SetKey(current_->key()); + is_prev_set_ = true; + is_prev_inclusive_ = false; + } } current_->Next(); @@ -476,7 +490,14 @@ void ForwardIterator::UpdateCurrent() { } bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { - if (!valid_ || !is_prev_set_) { + // We maintain the interval (prev_key_, immutable_min_heap_.top()->key()) + // such that there are no records with keys within that range in + // immutable_min_heap_. Since immutable structures (SST files and immutable + // memtables) can't change in this version, we don't need to do a seek if + // 'target' belongs to that interval (immutable_min_heap_.top() is already + // at the correct position). + + if (!valid_ || !current_ || !is_prev_set_) { return true; } Slice prev_key = prev_key_.GetKey(); @@ -485,13 +506,17 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { return true; } if (cfd_->internal_comparator().InternalKeyComparator::Compare( - prev_key, target) >= 0) { + prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { return true; } - if (immutable_min_heap_.empty() || - cfd_->internal_comparator().InternalKeyComparator::Compare( - target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() - : current_->key()) > 0) { + + if (immutable_min_heap_.empty() && current_ == mutable_iter_) { + // Nothing to seek on. + return false; + } + if (cfd_->internal_comparator().InternalKeyComparator::Compare( + target, current_ == mutable_iter_ ? 
immutable_min_heap_.top()->key() + : current_->key()) > 0) { return true; } return false; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 653a0ac0c..4d3761ee1 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -101,6 +101,7 @@ class ForwardIterator : public Iterator { IterKey prev_key_; bool is_prev_set_; + bool is_prev_inclusive_; Arena arena_; }; From b3343fdeac6738162763206b7677d285ab7263c0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 2 Oct 2014 09:25:07 +0200 Subject: [PATCH 192/829] resolution for java build problem introduced by 5ec53f3edf62bec1b690ce12fb21a6c52203f3c8 --- java/rocksjni/write_batch.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 0492ea1be..46e7a6fa0 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -206,7 +206,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( options.memtable_factory = factory; rocksdb::MemTable* mem = new rocksdb::MemTable( cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options), options)); + rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options, + rocksdb::ImmutableCFOptions(options)), options)); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); From fcac705f95cccd5fed074591bc472564ea193c43 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 2 Oct 2014 01:01:29 -0700 Subject: [PATCH 193/829] Fixed compile warning on Mac caused by unused variables. Summary: Fixed compile warning caused by unused variables. ./db/compaction_picker.h:118:7: error: private field 'max_grandparent_overlap_factor_' is not used [-Werror,-Wunused-private-field] int max_grandparent_overlap_factor_; ^ ./db/compaction_picker.h:119:7: error: private field 'expanded_compaction_factor_' is not used [-Werror,-Wunused-private-field] int expanded_compaction_factor_; ^ 2 errors generated. 
Test Plan: make db_test --- db/compaction_picker.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 9862bdfea..138b97eb4 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -114,9 +114,6 @@ class CompactionPicker { private: const InternalKeyComparator* const icmp_; - - int max_grandparent_overlap_factor_; - int expanded_compaction_factor_; }; class UniversalCompactionPicker : public CompactionPicker { From 89833e5a859149795bf72dc01eb06f6ba9ef33a8 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 2 Oct 2014 01:05:59 -0700 Subject: [PATCH 194/829] Fixed signed-unsigned comparison warning in db_test.cc Summary: Fixed signed-unsigned comparison warning in db_test.cc db/db_test.cc:8606:3: note: in instantiation of function template specialization 'rocksdb::test::Tester::IsEq' requested here ASSERT_EQ(2, metadata.size()); ^ Test Plan: make db_test --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index a2479d58e..7c2f051d0 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8586,7 +8586,7 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_EQ("0,1", FilesPerLevel()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(1, metadata.size()); + ASSERT_EQ(1U, metadata.size()); ASSERT_LE(metadata[0].size, k128KB + k5KB); // < 128KB + 5KB ASSERT_GE(metadata[0].size, k128KB - k5KB); // > 128B - 5KB @@ -8603,7 +8603,7 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); db_->GetLiveFilesMetaData(&metadata); - ASSERT_EQ(2, metadata.size()); + ASSERT_EQ(2U, metadata.size()); ASSERT_LE(metadata[0].size, k64KB + k5KB); // < 64KB + 5KB ASSERT_GE(metadata[0].size, k64KB - k5KB); // > 64KB - 5KB From d0916f452f7564c2f7dfeeb202353e8b9e4d16cf Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 11:07:45 -0700 Subject: [PATCH 195/829] add major minor micro version to java jars --- Makefile | 11 ++++++----- java/Makefile | 8 ++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index ab920b592..2a1ae6735 100644 --- a/Makefile +++ b/Makefile @@ -164,8 +164,9 @@ endif LIBRARY = ${LIBNAME}.a MEMENVLIBRARY = libmemenv.a -ROCKSDB_MAJOR = 3 -ROCKSDB_MINOR = 4 +ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) default: all @@ -515,12 +516,12 @@ JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ARCH := $(shell getconf LONG_BIT) ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so -ROCKSDB_JAR = rocksdbjni-linux$(ARCH).jar -ROCKSDB_JAR_ALL = rocksdbjni-all.jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar +ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-all.jar ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib -ROCKSDB_JAR = rocksdbjni-osx.jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif diff --git a/java/Makefile b/java/Makefile index 1b854755b..2741f36d3 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,11 +1,15 @@ NATIVE_JAVA_CLASSES = 
org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig +ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) + NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) -ROCKSDB_JAR = rocksdbjni-linux$(ARCH).jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDB_JAR = rocksdbjni-osx.jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar endif clean: From 0908ddcea5ecaeb75ac426e01fe39a0f5d9ff09a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 2 Oct 2014 11:59:22 -0700 Subject: [PATCH 196/829] Don't keep managing two rocksdb version Summary: Before this diff, there are two places with rocksdb versions. After the diff: 1. we only have one source of truth for rocksdb version 2. we have a script that we can use to get the version that we can use in other compilations (java, go, etc). Test Plan: make Reviewers: yhchiang, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24333 --- Makefile | 21 +++++++++++---------- build_tools/build_detect_platform | 7 +++++++ build_tools/version.sh | 12 ++++++++++++ include/rocksdb/version.h | 3 +-- 4 files changed, 31 insertions(+), 12 deletions(-) create mode 100755 build_tools/version.sh diff --git a/Makefile b/Makefile index 4deb8fc5f..c9d12415b 100644 --- a/Makefile +++ b/Makefile @@ -164,9 +164,6 @@ endif LIBRARY = ${LIBNAME}.a MEMENVLIBRARY = libmemenv.a -ROCKSDB_MAJOR = 3 -ROCKSDB_MINOR = 4 - default: all #----------------------------------------------- @@ -178,22 +175,26 @@ ifneq ($(PLATFORM_SHARED_VERSIONED),true) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1) SHARED3 = $(SHARED1) +SHARED4 = $(SHARED1) SHARED = $(SHARED1) else -# Update db.h if you change these. 
SHARED_MAJOR = $(ROCKSDB_MAJOR) SHARED_MINOR = $(ROCKSDB_MINOR) +SHARED_PATCH = $(ROCKSDB_PATCH) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) -SHARED = $(SHARED1) $(SHARED2) $(SHARED3) -$(SHARED1): $(SHARED3) - ln -fs $(SHARED3) $(SHARED1) -$(SHARED2): $(SHARED3) - ln -fs $(SHARED3) $(SHARED2) +SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) +$(SHARED1): $(SHARED4) + ln -fs $(SHARED4) $(SHARED1) +$(SHARED2): $(SHARED4) + ln -fs $(SHARED4) $(SHARED2) +$(SHARED3): $(SHARED4) + ln -fs $(SHARED4) $(SHARED3) endif -$(SHARED3): +$(SHARED4): $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 8479e3127..c026782f6 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -326,6 +326,10 @@ PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" VALGRIND_VER="$VALGRIND_VER" +ROCKSDB_MAJOR=`build_tools/version.sh major` +ROCKSDB_MINOR=`build_tools/version.sh minor` +ROCKSDB_PATCH=`build_tools/version.sh patch` + echo "CC=$CC" >> "$OUTPUT" echo "CXX=$CXX" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" @@ -341,3 +345,6 @@ echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT" echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT" echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT" echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" +echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT" +echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" +echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" diff --git a/build_tools/version.sh b/build_tools/version.sh new file mode 100755 index 000000000..3a619fded --- /dev/null +++ b/build_tools/version.sh @@ -0,0 +1,12 @@ +if [ $# == 0 ]; then + echo "Usage: $0 major|minor|patch" +fi +if [[ $1 == "major" ]]; then + cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}' +fi +if [[ $1 = "minor" ]]; then + cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}' +fi +if [[ $1 = "patch" ]]; then + cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}' +fi diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index d6ccaeda5..285278854 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -4,9 +4,8 @@ // of patent rights can be found in the PATENTS file in the same directory. #pragma once -// Also update Makefile if you change these #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 5 +#define ROCKSDB_MINOR 6 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From d410b39d59b222b1f24d6275c725477b8ded720e Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 2 Oct 2014 13:21:20 +0200 Subject: [PATCH 197/829] BlockBasedTableConfig Filter policy support RocksJava As proposed by yhchiang the filter can now be set in Java for a BlockBasedTableConfig instance. 
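For context, a hedged sketch of the equivalent C++ configuration that this Java binding sets up underneath (the function name and the 10 bits per key are illustrative assumptions; see the rocksjni/table.cc hunk below for the actual JNI plumbing):

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Attach a Bloom filter policy to the block-based table factory; this is
    // roughly what BlockBasedTableConfig.setFilter(new BloomFilter(10))
    // configures on the native side.
    rocksdb::Options OptionsWithBloomFilter() {
      rocksdb::Options options;
      rocksdb::BlockBasedTableOptions table_options;
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }
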
--- java/Makefile | 1 + java/RocksDBSample.java | 3 +- java/org/rocksdb/BlockBasedTableConfig.java | 52 ++++++++++++--------- java/rocksjni/table.cc | 9 ++-- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/java/Makefile b/java/Makefile index b2f3674f0..52c88fd84 100644 --- a/java/Makefile +++ b/java/Makefile @@ -26,6 +26,7 @@ test: java javac org/rocksdb/test/*.java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index bd5a85076..c9a30476a 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -80,9 +80,10 @@ public class RocksDBSample { 10000, 10)); options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); + Filter bloomFilter = new BloomFilter(10); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilterBitsPerKey(10) + .setFilter(bloomFilter) .setCacheNumShardBits(6) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index bdb27d6c2..9a6967a95 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -18,7 +18,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { blockSizeDeviation_ = 10; blockRestartInterval_ = 16; wholeKeyFiltering_ = true; - bitsPerKey_ = 10; + filter_ = null; cacheIndexAndFilterBlocks_ = false; hashIndexAllowCollision_ = true; blockCacheCompressedSize_ = 0; @@ -182,30 +182,30 @@ public class BlockBasedTableConfig extends TableFormatConfig { * * Filter instance can be re-used in multiple options instances. * - * @param Filter policy java instance. + * @param Filter Filter Policy java instance. * @return the reference to the current config. */ - public BlockBasedTableConfig setFilterBitsPerKey(int bitsPerKey) { - bitsPerKey_ = bitsPerKey; + public BlockBasedTableConfig setFilter(Filter filter) { + filter_ = filter; return this; } - + /** * Indicating if we'd put index/filter blocks to the block cache. If not specified, each "table reader" object will pre-load index/filter block during table initialization. - * + * * @return if index and filter blocks should be put in block cache. */ public boolean cacheIndexAndFilterBlocks() { return cacheIndexAndFilterBlocks_; } - + /** * Indicating if we'd put index/filter blocks to the block cache. If not specified, each "table reader" object will pre-load index/filter block during table initialization. - * + * * @param index and filter blocks should be put in block cache. * @return the reference to the current config. */ @@ -214,25 +214,25 @@ public class BlockBasedTableConfig extends TableFormatConfig { cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks; return this; } - + /** * Influence the behavior when kHashSearch is used. 
if false, stores a precise prefix to block range mapping if true, does not store prefix and allows prefix hash collision (less memory consumption) - * + * * @return if hash collisions should be allowed. */ public boolean hashIndexAllowCollision() { return hashIndexAllowCollision_; } - + /** * Influence the behavior when kHashSearch is used. if false, stores a precise prefix to block range mapping if true, does not store prefix and allows prefix hash collision (less memory consumption) - * + * * @param if hash collisions should be allowed. * @return the reference to the current config. */ @@ -241,21 +241,21 @@ public class BlockBasedTableConfig extends TableFormatConfig { hashIndexAllowCollision_ = hashIndexAllowCollision; return this; } - + /** * Size of compressed block cache. If 0, then block_cache_compressed is set * to null. - * + * * @return size of compressed block cache. */ public long blockCacheCompressedSize() { return blockCacheCompressedSize_; } - + /** * Size of compressed block cache. If 0, then block_cache_compressed is set * to null. - * + * * @param size of compressed block cache. * @return the reference to the current config. */ @@ -264,7 +264,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { blockCacheCompressedSize_ = blockCacheCompressedSize; return this; } - + /** * Controls the number of shards for the block compressed cache. * This is applied only if blockCompressedCacheSize is set to non-negative. @@ -276,7 +276,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { public int blockCacheCompressedNumShardBits() { return blockCacheCompressedNumShardBits_; } - + /** * Controls the number of shards for the block compressed cache. * This is applied only if blockCompressedCacheSize is set to non-negative. 
@@ -293,17 +293,23 @@ public class BlockBasedTableConfig extends TableFormatConfig { } @Override protected long newTableFactoryHandle() { + long filterHandle = 0; + if (filter_ != null) { + filterHandle = filter_.nativeHandle_; + } + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, - blockRestartInterval_, wholeKeyFiltering_, bitsPerKey_, - cacheIndexAndFilterBlocks_, hashIndexAllowCollision_, - blockCacheCompressedSize_, blockCacheCompressedNumShardBits_); + blockRestartInterval_, wholeKeyFiltering_, + filterHandle, cacheIndexAndFilterBlocks_, + hashIndexAllowCollision_, blockCacheCompressedSize_, + blockCacheCompressedNumShardBits_); } private native long newTableFactoryHandle( boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, int bitsPerKey, + boolean wholeKeyFiltering, long filterPolicyHandle, boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, long blockCacheCompressedSize, int blockCacheCompressedNumShardBits); @@ -315,7 +321,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { private int blockSizeDeviation_; private int blockRestartInterval_; private boolean wholeKeyFiltering_; - private int bitsPerKey_; + private Filter filter_; private boolean cacheIndexAndFilterBlocks_; private boolean hashIndexAllowCollision_; private long blockCacheCompressedSize_; diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 500cb255e..846526292 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -37,7 +37,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, - jint bits_per_key, jboolean cache_index_and_filter_blocks, + jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits) { rocksdb::BlockBasedTableOptions options; @@ -55,8 +55,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_size_deviation = block_size_deviation; options.block_restart_interval = block_restart_interval; options.whole_key_filtering = whole_key_filtering; - if (bits_per_key > 0) { - options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key)); + if (jfilterPolicy > 0) { + options.filter_policy.reset( + reinterpret_cast(jfilterPolicy)); } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; options.hash_index_allow_collision = hash_index_allow_collision; @@ -69,6 +70,6 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); } } - + return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } From 017354177825355421a5f43d0c97e08c87ff6913 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 2 Oct 2014 13:33:18 +0200 Subject: [PATCH 198/829] FilterTest --- java/org/rocksdb/test/FilterTest.java | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 java/org/rocksdb/test/FilterTest.java diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java new file mode 100644 index 000000000..7475d2c34 --- /dev/null +++ 
b/java/org/rocksdb/test/FilterTest.java @@ -0,0 +1,27 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +public class FilterTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + Options options = new Options(); + // test table config without filter + BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); + options.setTableFormatConfig(blockConfig); + options.dispose(); + // new Bloom filter + options = new Options(); + blockConfig = new BlockBasedTableConfig(); + blockConfig.setFilter(new BloomFilter()); + options.setTableFormatConfig(blockConfig); + System.out.println("Filter test passed"); + } +} From fd2545c80a60756387917683f3862cb1b086b294 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 13:29:47 -0700 Subject: [PATCH 199/829] add maven publication target and instructions --- Makefile | 6 ++++++ java/RELEASE.md | 28 +++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2a1ae6735..ec443df12 100644 --- a/Makefile +++ b/Makefile @@ -560,6 +560,12 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib +rocksdbjavastaticpublish: + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-linux64.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-linux32.pom -Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-osx.pom -Dfile=java/rocksdbjni-3.5.0-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-all.pom -Dfile=java/rocksdbjni-3.5.0-all.jar -Dclassifier=all + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 cd java;$(MAKE) java; diff --git a/java/RELEASE.md b/java/RELEASE.md index cc35dc33c..c54cadffe 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -27,4 +27,30 @@ You can find all native binaries and JARs in the java directory upon completion: ## Maven publication -TODO +Set ~/.m2/settings.xml to contain: + + + + + sonatype-nexus-staging + your-sonatype-jira-username + your-sonatype-jira-password + + + + +From RocksDB's root directory, first build the Java static JARs: + + make jclean clean rocksdbjavastaticrelease + +Then publish the release to Sonatype: + + make rocksdbjavastaticpublish + +This command will [stage the JAR artifacts on the Sonatype staging repository](http://central.sonatype.org/pages/manual-staging-bundle-creation-and-deployment.html). 
To release the staged artifacts. + +1. Go to [https://oss.sonatype.org/#stagingRepositories](https://oss.sonatype.org/#stagingRepositories) and search for "rocksdb" in the upper right hand search box. +2. Select the rocksdb staging repository, and inspect its contents. +3. If all is well, follow [these steps](https://oss.sonatype.org/#stagingRepositories) to close the repository and release it. + +After the release has occurred, the artifacts will be synced to Maven central within 24-48 hours. From deefcf476dcfa7dc172803df5ca7ca8aa7834361 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 13:46:43 -0700 Subject: [PATCH 200/829] make fat jar unclassified to satisfy sonatype --- .gitignore | 1 + Makefile | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 363481755..638f236c7 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ java/include/org_rocksdb_*.h unity.cc java/crossbuild/.vagrant .vagrant/ +java/**.asc diff --git a/Makefile b/Makefile index ec443df12..01bb170a2 100644 --- a/Makefile +++ b/Makefile @@ -517,7 +517,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ARCH := $(shell getconf LONG_BIT) ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar -ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-all.jar +ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib @@ -561,10 +561,10 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjavastaticpublish: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-linux64.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-linux32.pom -Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-osx.pom -Dfile=java/rocksdbjni-3.5.0-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/maven/rocksdbjni-3.5.0-all.pom -Dfile=java/rocksdbjni-3.5.0-all.jar -Dclassifier=all + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file 
-Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0.jar rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 From df08a2d03c94d38da89672d45b28e30182349ac7 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 13:47:07 -0700 Subject: [PATCH 201/829] add single rocksdbjni pom --- java/rocksdbjni-3.5.0.pom | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 java/rocksdbjni-3.5.0.pom diff --git a/java/rocksdbjni-3.5.0.pom b/java/rocksdbjni-3.5.0.pom new file mode 100644 index 000000000..109ae7b11 --- /dev/null +++ b/java/rocksdbjni-3.5.0.pom @@ -0,0 +1,34 @@ + + + 4.0.0 + RocksDB JNI + http://rocksdb.org/ + org.rocksdb + rocksdbjni + 3.5.0 + RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files for Mac OSX. + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html + repo + + + + scm:git:git://github.com/dropwizard/metrics.git + scm:git:git@github.com:dropwizard/metrics.git + http://github.com/dropwizard/metrics/ + HEAD + + + + Facebook + help@facebook.com + America/New_York + + architect + + + + From 2e80124982567cd0c9d1a23f6e2ca6ee2551741d Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 13:57:54 -0700 Subject: [PATCH 202/829] add javadoc and sources targets for sonatype --- Makefile | 7 ++++++- java/Makefile | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 01bb170a2..8341e9772 100644 --- a/Makefile +++ b/Makefile @@ -518,6 +518,8 @@ ARCH := $(shell getconf LONG_BIT) ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar +ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadocs.jar +ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib @@ -554,13 +556,16 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - + cd java/javadocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * + cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjavastaticpublish: + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-javadocs.jar -Dclassifier=javadocs + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-sources.jar -Dclassifier=sources mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-osx.jar -Dclassifier=osx diff --git a/java/Makefile b/java/Makefile index 2741f36d3..dec0480dc 100644 --- a/java/Makefile +++ b/java/Makefile @@ -15,8 +15,12 @@ endif clean: -find . -name "*.class" -exec rm {} \; -find . -name "hs*.log" -exec rm {} \; + rm -rf javadocs/* -java: +javadocs: + mkdir -p javadocs; javadoc -d javadocs -sourcepath . 
-subpackages org + +java: javadocs javac org/rocksdb/util/*.java org/rocksdb/*.java @cp ../HISTORY.md ./HISTORY-CPP.md @rm -f ./HISTORY-CPP.md From 079a612b5ec5edca73dbde9692d1984b18dca98d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 2 Oct 2014 14:03:49 -0700 Subject: [PATCH 203/829] Fix build_tools/version.sh --- build_tools/version.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build_tools/version.sh b/build_tools/version.sh index 3a619fded..afa7ed277 100755 --- a/build_tools/version.sh +++ b/build_tools/version.sh @@ -1,12 +1,14 @@ +#!/bin/sh if [ $# == 0 ]; then echo "Usage: $0 major|minor|patch" + exit 1 fi -if [[ $1 == "major" ]]; then +if [ $1 = "major" ]; then cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}' fi -if [[ $1 = "minor" ]]; then +if [ $1 = "minor" ]; then cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}' fi -if [[ $1 = "patch" ]]; then +if [ $1 = "patch" ]; then cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}' fi From 8322cf000619872e36501681369e54f5dce5a6ec Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:13:09 -0700 Subject: [PATCH 204/829] use javadoc instead of javadocs --- .gitignore | 1 + Makefile | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 638f236c7..0e53ea35b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ unity.cc java/crossbuild/.vagrant .vagrant/ java/**.asc +java/javadocs diff --git a/Makefile b/Makefile index 8341e9772..6fd3fa802 100644 --- a/Makefile +++ b/Makefile @@ -518,7 +518,7 @@ ARCH := $(shell getconf LONG_BIT) ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar -ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadocs.jar +ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar ifeq ($(PLATFORM), OS_MACOSX) @@ -564,7 +564,7 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjavastaticpublish: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-javadocs.jar -Dclassifier=javadocs + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-javadoc.jar -Dclassifier=javadoc mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-sources.jar -Dclassifier=sources mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom 
-Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 From 2d72f7807f432ce12d2aac3f7509b6ad3a359d32 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:26:52 -0700 Subject: [PATCH 205/829] update release docs in java --- java/RELEASE.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/RELEASE.md b/java/RELEASE.md index c54cadffe..6792c0004 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -20,10 +20,12 @@ You can find all native binaries and JARs in the java directory upon completion: librocksdbjni-linux32.so librocksdbjni-linux64.so librocksdbjni-osx.jnilib - rocksdbjni-all.jar - rocksdbjni-linux32.jar - rocksdbjni-linux64.jar - rocksdbjni-osx.jar + rocksdbjni-3.5.0-javadoc.jar + rocksdbjni-3.5.0-linux32.jar + rocksdbjni-3.5.0-linux64.jar + rocksdbjni-3.5.0-osx.jar + rocksdbjni-3.5.0-sources.jar + rocksdbjni-3.5.0.jar ## Maven publication From 2a1add6731dca246f97a4c86a52240aec224cfbd Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:31:14 -0700 Subject: [PATCH 206/829] use proper major/minor/micro version rather than hard coding 3.5.0 --- Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6fd3fa802..5d6b1d7bf 100644 --- a/Makefile +++ b/Makefile @@ -564,12 +564,12 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjavastaticpublish: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-javadoc.jar -Dclassifier=javadoc - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-sources.jar -Dclassifier=sources - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-3.5.0.pom -Dfile=java/rocksdbjni-3.5.0.jar + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar 
-Dclassifier=sources + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 From a213971d8a59d2b0533eed898df1d867bbd8a868 Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Tue, 30 Sep 2014 17:32:37 -0700 Subject: [PATCH 207/829] Don't return (or dereference) dangling pointer --- db/flush_scheduler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc index 636ff5a98..56816159e 100644 --- a/db/flush_scheduler.cc +++ b/db/flush_scheduler.cc @@ -28,6 +28,7 @@ ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { if (cfd->IsDropped()) { if (cfd->Unref()) { delete cfd; + cfd = nullptr; } } else { break; From c832f1644af7f9536c7a3dd09671f3b36b1e494c Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:42:49 -0700 Subject: [PATCH 208/829] add not about updating pom version and rename pom to be unversioned --- Makefile | 12 ++++++------ java/RELEASE.md | 2 ++ java/{rocksdbjni-3.5.0.pom => rocksjni.pom} | 0 3 files changed, 8 insertions(+), 6 deletions(-) rename java/{rocksdbjni-3.5.0.pom => rocksjni.pom} (100%) diff --git a/Makefile b/Makefile index 5d6b1d7bf..d03ceddc1 100644 --- a/Makefile +++ b/Makefile @@ -564,12 +564,12 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib rocksdbjavastaticpublish: - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom 
-Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 diff --git a/java/RELEASE.md b/java/RELEASE.md index 6792c0004..d2028073c 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -41,6 +41,8 @@ Set ~/.m2/settings.xml to contain: +Then update rocksjni.pom's version tag to reflect the release version. 
+ From RocksDB's root directory, first build the Java static JARs: make jclean clean rocksdbjavastaticrelease diff --git a/java/rocksdbjni-3.5.0.pom b/java/rocksjni.pom similarity index 100% rename from java/rocksdbjni-3.5.0.pom rename to java/rocksjni.pom From a1d3f0d2b23b533320ade5344c9afe34f77924e3 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:49:46 -0700 Subject: [PATCH 209/829] don't fail if javadocs diretory doesn't exist --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d03ceddc1..c6b5404da 100644 --- a/Makefile +++ b/Makefile @@ -556,7 +556,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - cd java/javadocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * + mkdir -p java/javadocs;cd java/javadocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: rocksdbjavastatic From 45d526e226dd99a6a25bffbfbbc2ea1a3a63b7e2 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:57:18 -0700 Subject: [PATCH 210/829] singular javadoc directory --- Makefile | 2 +- java/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8abcfee18..5d3f954de 100644 --- a/Makefile +++ b/Makefile @@ -560,7 +560,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - mkdir -p java/javadocs;cd java/javadocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * + cd java/javadoc;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: rocksdbjavastatic diff --git a/java/Makefile b/java/Makefile index 4ee73daf9..a500e599e 100644 --- a/java/Makefile +++ b/java/Makefile @@ -18,7 +18,7 @@ clean: rm -rf javadocs/* javadocs: - mkdir -p javadocs; javadoc -d javadocs -sourcepath . -subpackages org + mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org java: javadocs javac org/rocksdb/util/*.java org/rocksdb/*.java From 99744e0c4b5fe2e20c59908cb7ca8317edf19cb0 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 14:59:20 -0700 Subject: [PATCH 211/829] bump version to 3.6 --- java/rocksjni.pom | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 109ae7b11..554357031 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -6,7 +6,7 @@ http://rocksdb.org/ org.rocksdb rocksdbjni - 3.5.0 + 3.6.0 RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files for Mac OSX. From e869fc6a887c6f176df8062a08e1931daf1db308 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Thu, 2 Oct 2014 15:46:49 -0700 Subject: [PATCH 212/829] remove proper javadoc directory --- java/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/Makefile b/java/Makefile index a500e599e..b2038355c 100644 --- a/java/Makefile +++ b/java/Makefile @@ -15,7 +15,7 @@ endif clean: -find . -name "*.class" -exec rm {} \; -find . 
-name "hs*.log" -exec rm {} \; - rm -rf javadocs/* + rm -rf javadoc/* javadocs: mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org From f4086a88b4e5bafe9978f805a22a0b01e634c342 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 2 Oct 2014 17:02:30 -0700 Subject: [PATCH 213/829] perf_context.get_from_output_files_time is set for MultiGet() and ReadOnly DB too. Summary: perf_context.get_from_output_files_time is now only set writable DB's DB::Get(). Extend it to MultiGet() and read only DB. Test Plan: make all check Fix perf_context_test and extend it to cover MultiGet(), as long as read-only DB. Run it and watch the results Reviewers: ljin, yhchiang, igor Reviewed By: igor Subscribers: rven, leveldb Differential Revision: https://reviews.facebook.net/D24207 --- Makefile | 2 +- db/db_impl.cc | 1 + db/db_impl_readonly.cc | 2 + db/perf_context_test.cc | 195 ++++++++++++++++++++++++++++++++++++---- 4 files changed, 182 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index c9d12415b..ad9082d31 100644 --- a/Makefile +++ b/Makefile @@ -154,7 +154,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. diff --git a/db/db_impl.cc b/db/db_impl.cc index 680a22cb3..85dab9f9e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3602,6 +3602,7 @@ std::vector DBImpl::MultiGet( } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { // Done } else { + PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(options, lkey, value, &s, &merge_context); } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 9faebd8c2..31ebdbedd 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -8,6 +8,7 @@ #include "db/db_impl.h" #include "db/merge_context.h" #include "db/db_iter.h" +#include "util/perf_context_imp.h" namespace rocksdb { @@ -34,6 +35,7 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, LookupKey lkey(key, snapshot); if (super_version->mem->Get(lkey, value, &s, &merge_context)) { } else { + PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, value, &s, &merge_context); } return s; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index a182fb521..9d34409c3 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -6,7 +6,6 @@ #include #include #include -#include "/usr/include/valgrind/callgrind.h" #include "rocksdb/db.h" #include "rocksdb/perf_context.h" @@ -29,7 +28,7 @@ const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test"; namespace rocksdb { -std::shared_ptr OpenDb() { +std::shared_ptr OpenDb(bool read_only = false) { DB* db; Options options; options.create_if_missing = true; @@ -39,12 +38,16 @@ std::shared_ptr OpenDb() { FLAGS_min_write_buffer_number_to_merge; if (FLAGS_use_set_based_memetable) { - auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0); - options.memtable_factory.reset( - NewHashSkipListRepFactory(prefix_extractor)); + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); } - Status s = DB::Open(options, kDbName, &db); + Status s; + if (!read_only) { + s = DB::Open(options, kDbName, &db); + } else { + s = 
DB::OpenForReadOnly(options, kDbName, &db); + } ASSERT_OK(s); return std::shared_ptr(db); } @@ -76,7 +79,8 @@ TEST(PerfContextTest, SeekIntoDeletion) { std::string value; perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(Env::Default()); + timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); ASSERT_TRUE(status.IsNotFound()); @@ -149,11 +153,12 @@ TEST(PerfContextTest, StopWatchNanoOverhead) { TEST(PerfContextTest, StopWatchOverhead) { // profile the timer cost by itself! const int kTotalIterations = 1000000; + uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default()); + StopWatch timer(Env::Default(), nullptr, 0, &elapsed); for (auto& timing : timings) { - timing = timer.ElapsedMicros(); + timing = elapsed; } HistogramImpl histogram; @@ -166,7 +171,7 @@ TEST(PerfContextTest, StopWatchOverhead) { std::cout << histogram.ToString(); } -void ProfileKeyComparison() { +void ProfileQueries(bool enabled_time = false) { DestroyDB(kDbName, Options()); // Start this test with a fresh DB auto db = OpenDb(); @@ -175,11 +180,21 @@ void ProfileKeyComparison() { ReadOptions read_options; HistogramImpl hist_put; + HistogramImpl hist_get; HistogramImpl hist_get_snapshot; HistogramImpl hist_get_memtable; + HistogramImpl hist_get_files; HistogramImpl hist_get_post_process; HistogramImpl hist_num_memtable_checked; + + HistogramImpl hist_mget; + HistogramImpl hist_mget_snapshot; + HistogramImpl hist_mget_memtable; + HistogramImpl hist_mget_files; + HistogramImpl hist_mget_post_process; + HistogramImpl hist_mget_num_memtable_checked; + HistogramImpl hist_write_pre_post; HistogramImpl hist_write_wal_time; HistogramImpl hist_write_memtable_time; @@ -187,8 +202,13 @@ void ProfileKeyComparison() { std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; std::vector keys; + const int kFlushFlag = -1; for (int i = 0; i < FLAGS_total_keys; ++i) { keys.push_back(i); + if (i == FLAGS_total_keys / 2) { + // Issuing a flush in the middle. 
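// kFlushFlag (-1) is pushed into the key list as a sentinel: when the write loop below
// encounters it, it calls db->Flush() instead of issuing a Put(), so part of the data set
// is already persisted to SST files and the later Get()/MultiGet() passes exercise
// get_from_output_files_time in addition to the memtable lookup path.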
+ keys.push_back(kFlushFlag); + } } if (FLAGS_random_key) { @@ -196,27 +216,54 @@ void ProfileKeyComparison() { } for (const int i : keys) { + if (i == kFlushFlag) { + FlushOptions fo; + db->Flush(fo); + continue; + } std::string key = "k" + std::to_string(i); std::string value = "v" + std::to_string(i); + std::vector keys = {Slice(key)}; + std::vector values; + perf_context.Reset(); db->Put(write_options, key, value); hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); hist_write_wal_time.Add(perf_context.write_wal_time); hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); + } + + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + std::vector keys = {Slice(key)}; + std::vector values; perf_context.Reset(); db->Get(read_options, key, &value); hist_get_snapshot.Add(perf_context.get_snapshot_time); hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_get_files.Add(perf_context.get_from_output_files_time); hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->MultiGet(read_options, keys, &values); + hist_mget_snapshot.Add(perf_context.get_snapshot_time); + hist_mget_memtable.Add(perf_context.get_from_memtable_time); + hist_mget_files.Add(perf_context.get_from_output_files_time); + hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_mget_post_process.Add(perf_context.get_post_process_time); + hist_mget.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString(); + << "Get uesr key comparison: \n" << hist_get.ToString() + << "MultiGet uesr key comparison: \n" << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" @@ -224,25 +271,139 @@ void ProfileKeyComparison() { << " Writing Mem Table time: \n" << hist_write_memtable_time.ToString() << "\n"; - std::cout << "Get(): Time to get snapshot: \n" + std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_get_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_get_post_process.ToString() + << "\n"; + + std::cout << "MultiGet(): Time to get snapshot: \n" + << hist_mget_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_mget_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_mget_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_mget_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_mget_post_process.ToString() + << "\n"; + + if (enabled_time) { + ASSERT_GT(hist_get.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); + ASSERT_GT(hist_get_memtable.Average(), 0); + ASSERT_GT(hist_get_files.Average(), 0); + ASSERT_GT(hist_get_post_process.Average(), 0); + ASSERT_GT(hist_num_memtable_checked.Average(), 0); + + ASSERT_GT(hist_mget.Average(), 0); + ASSERT_GT(hist_mget_snapshot.Average(), 0); + 
ASSERT_GT(hist_mget_memtable.Average(), 0); + ASSERT_GT(hist_mget_files.Average(), 0); + ASSERT_GT(hist_mget_post_process.Average(), 0); + ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + } + + db.reset(); + db = OpenDb(true); + + hist_get.Clear(); + hist_get_snapshot.Clear(); + hist_get_memtable.Clear(); + hist_get_files.Clear(); + hist_get_post_process.Clear(); + hist_num_memtable_checked.Clear(); + + hist_mget.Clear(); + hist_mget_snapshot.Clear(); + hist_mget_memtable.Clear(); + hist_mget_files.Clear(); + hist_mget_post_process.Clear(); + hist_mget_num_memtable_checked.Clear(); + + for (const int i : keys) { + std::string key = "k" + std::to_string(i); + std::string value = "v" + std::to_string(i); + + std::vector keys = {Slice(key)}; + std::vector values; + + perf_context.Reset(); + db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_get_files.Add(perf_context.get_from_output_files_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); + hist_get.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->MultiGet(read_options, keys, &values); + hist_mget_snapshot.Add(perf_context.get_snapshot_time); + hist_mget_memtable.Add(perf_context.get_from_memtable_time); + hist_mget_files.Add(perf_context.get_from_output_files_time); + hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_mget_post_process.Add(perf_context.get_post_process_time); + hist_mget.Add(perf_context.user_key_comparison_count); + } + + std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString() + << "ReadOnly MultiGet uesr key comparison: \n" + << hist_mget.ToString(); + + std::cout << "ReadOnly Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() << " Time to get value from memtables: \n" << hist_get_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_get_files.ToString() << "\n" << " Number of memtables checked: \n" << hist_num_memtable_checked.ToString() << "\n" - << " Time to post process: \n" - << hist_get_post_process.ToString() << "\n"; + << " Time to post process: \n" << hist_get_post_process.ToString() + << "\n"; + + std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n" + << hist_mget_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_mget_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_mget_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_mget_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_mget_post_process.ToString() + << "\n"; + + if (enabled_time) { + ASSERT_GT(hist_get.Average(), 0); + ASSERT_GT(hist_get_memtable.Average(), 0); + ASSERT_GT(hist_get_files.Average(), 0); + ASSERT_GT(hist_num_memtable_checked.Average(), 0); + // In read-only mode Get(), no super version operation is needed + ASSERT_EQ(hist_get_post_process.Average(), 0); + ASSERT_EQ(hist_get_snapshot.Average(), 0); + + ASSERT_GT(hist_mget.Average(), 0); + ASSERT_GT(hist_mget_snapshot.Average(), 0); + ASSERT_GT(hist_mget_memtable.Average(), 0); + ASSERT_GT(hist_mget_files.Average(), 0); + ASSERT_GT(hist_mget_post_process.Average(), 0); + ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + } } TEST(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableCount); - ProfileKeyComparison(); + 
ProfileQueries(); SetPerfLevel(kDisable); - ProfileKeyComparison(); + ProfileQueries(); SetPerfLevel(kEnableTime); - ProfileKeyComparison(); + ProfileQueries(true); } // make perf_context_test From 8ea232b9e3163ece812b1d8c2ae3653f9d0a7f13 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 2 Oct 2014 12:00:09 -0700 Subject: [PATCH 214/829] Add number of records dropped in compaction summary Summary: Add two stats to compaction summary: 1. Total input records from previous level 2. Total number of records dropped after compaction Test Plan: See outputs of printing when runnning locally Reviewers: ljin, igor, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24411 --- db/db_impl.cc | 21 +++++++++++++++++++-- db/db_impl.h | 1 + db/internal_stats.cc | 10 +++++++--- db/internal_stats.h | 15 +++++++++++++++ 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 85dab9f9e..5d6eaf197 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2694,7 +2694,10 @@ Status DBImpl::ProcessKeyValueCompaction( Iterator* input, CompactionState* compact, bool is_compaction_v2, + int* num_output_records, LogBuffer* log_buffer) { + assert(num_output_records != nullptr); + size_t combined_idx = 0; Status status; std::string compaction_filter_value; @@ -2965,6 +2968,7 @@ Status DBImpl::ProcessKeyValueCompaction( } compact->current_output()->largest.DecodeFrom(newkey); compact->builder->Add(newkey, value); + (*num_output_records)++, compact->current_output()->largest_seqno = std::max(compact->current_output()->largest_seqno, seqno); @@ -3140,6 +3144,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, mutex_.Unlock(); log_buffer->FlushBufferToLog(); + int num_output_records = 0; const uint64_t start_micros = env_->NowMicros(); unique_ptr input(versions_->MakeInputIterator(compact->compaction)); input->SeekToFirst(); @@ -3168,6 +3173,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, input.get(), compact, false, + &num_output_records, log_buffer); } else { // temp_backup_input always point to the start of the current buffer @@ -3249,6 +3255,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, input.get(), compact, true, + &num_output_records, log_buffer); if (!status.ok()) { @@ -3286,6 +3293,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, input.get(), compact, true, + &num_output_records, log_buffer); compact->CleanupBatchBuffer(); @@ -3309,6 +3317,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, input.get(), compact, true, + &num_output_records, log_buffer); } // checking for compaction filter v2 @@ -3342,17 +3351,24 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } stats.files_out_levelnp1 = num_output_files; + uint64_t num_input_records = 0; + for (int i = 0; i < compact->compaction->num_input_files(0); i++) { stats.bytes_readn += compact->compaction->input(0, i)->fd.GetFileSize(); + stats.num_input_records += compact->compaction->input(0, i)->num_entries; + num_input_records += compact->compaction->input(0, i)->num_entries; } for (int i = 0; i < compact->compaction->num_input_files(1); i++) { stats.bytes_readnp1 += compact->compaction->input(1, i)->fd.GetFileSize(); + num_input_records += compact->compaction->input(1, i)->num_entries; } for (int i = 0; i < num_output_files; i++) { stats.bytes_written += compact->outputs[i].file_size; } + stats.num_dropped_records = + static_cast(num_input_records) - num_output_records; 
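// The local num_input_records counter totals the entries of every input file at level n
// and level n+1; num_dropped_records is that total minus the records this compaction
// actually wrote out, i.e. entries eliminated on the way to the output files
// (for example overwritten or deleted keys).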
RecordCompactionIOStats(); @@ -3375,7 +3391,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " "files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s\n", + "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1) / static_cast(stats.micros), @@ -3387,7 +3403,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / (double)stats.bytes_readn, stats.bytes_written / (double)stats.bytes_readn, - status.ToString().c_str()); + status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records); return status; } diff --git a/db/db_impl.h b/db/db_impl.h index f1a81e00c..622df4293 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -419,6 +419,7 @@ class DBImpl : public DB { Iterator* input, CompactionState* compact, bool is_compaction_v2, + int* num_output_records, LogBuffer* log_buffer); // Call compaction_filter_v2->Filter() on kv-pairs in compact diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c9f9306e2..3f60d72ce 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -30,7 +30,7 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) " "Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) " - "Stall(sec) Stall(cnt) Avg(ms)\n" + "Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n" "--------------------------------------------------------------------" "--------------------------------------------------------------------" "--------------------------------------------------------------------\n", @@ -65,7 +65,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%8.3f " /* Avg(sec) */ "%10.2f " /* Stall(sec) */ "%10" PRIu64 " " /* Stall(cnt) */ - "%7.2f\n" /* Avg(ms) */, + "%7.2f" /* Avg(ms) */ + "%8d " /* input entries */ + "%10d\n" /* number of records reduced */, name.c_str(), num_files, being_compacted, total_file_size / kMB, score, bytes_read / kGB, stats.bytes_readn / kGB, @@ -85,7 +87,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, stall_us / 1000000.0, stalls, - stalls == 0 ? 0 : stall_us / 1000.0 / stalls); + stalls == 0 ? 
0 : stall_us / 1000.0 / stalls, + stats.num_input_records, + stats.num_dropped_records); } diff --git a/db/internal_stats.h b/db/internal_stats.h index 2e04f24e7..18d67de5c 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -123,6 +123,13 @@ class InternalStats { // Files written during compaction between levels N and N+1 int files_out_levelnp1; + // Total incoming entries during compaction between levels N and N+1 + int num_input_records; + + // Accumulated diff number of entries + // (num input entries - num output entires) for compaction levels N and N+1 + int num_dropped_records; + // Number of compactions done int count; @@ -134,6 +141,8 @@ class InternalStats { files_in_leveln(0), files_in_levelnp1(0), files_out_levelnp1(0), + num_input_records(0), + num_dropped_records(0), count(count) {} explicit CompactionStats(const CompactionStats& c) @@ -144,6 +153,8 @@ class InternalStats { files_in_leveln(c.files_in_leveln), files_in_levelnp1(c.files_in_levelnp1), files_out_levelnp1(c.files_out_levelnp1), + num_input_records(c.num_input_records), + num_dropped_records(c.num_dropped_records), count(c.count) {} void Add(const CompactionStats& c) { @@ -154,6 +165,8 @@ class InternalStats { this->files_in_leveln += c.files_in_leveln; this->files_in_levelnp1 += c.files_in_levelnp1; this->files_out_levelnp1 += c.files_out_levelnp1; + this->num_input_records += c.num_input_records; + this->num_dropped_records += c.num_dropped_records; this->count += c.count; } @@ -165,6 +178,8 @@ class InternalStats { this->files_in_leveln -= c.files_in_leveln; this->files_in_levelnp1 -= c.files_in_levelnp1; this->files_out_levelnp1 -= c.files_out_levelnp1; + this->num_input_records -= c.num_input_records; + this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; } }; From 0e516a75da6b59e1c5894955478d44089785328e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 3 Oct 2014 00:10:58 -0700 Subject: [PATCH 215/829] Fix lint errors in java/rocksjni/options.cc Summary: Fix lint errors in java/rocksjni/options.cc Test Plan: make rocksdbjava --- java/rocksjni/options.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 50416ef81..a8be5af8b 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -73,11 +73,13 @@ void Java_org_rocksdb_Options_setBuiltinComparator( JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { switch (builtinComparator){ case 1: - reinterpret_cast(jhandle)->comparator = rocksdb::ReverseBytewiseComparator(); - break; + reinterpret_cast(jhandle)->comparator = + rocksdb::ReverseBytewiseComparator(); + break; default: - reinterpret_cast(jhandle)->comparator = rocksdb::BytewiseComparator(); - break; + reinterpret_cast(jhandle)->comparator = + rocksdb::BytewiseComparator(); + break; } } From 56dfd363fd51aa10c7f1d9d965c8bbbefffa6c30 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 3 Oct 2014 00:25:27 -0700 Subject: [PATCH 216/829] Fix a check in database shutdown or Column family drop during flush. Summary: Fix a check in database shutdown or Column family drop during flush. 
Special thanks to Maurice Barnum who spots the problem :) Test Plan: db_test Reviewers: ljin, igor, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24273 --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 5d6eaf197..7463f749b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1614,7 +1614,7 @@ Status DBImpl::FlushMemTableToOutputFile( Status s = WriteLevel0Table(cfd, mutable_cf_options, mems, edit, &file_number, log_buffer); - if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) { + if (s.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { s = Status::ShutdownInProgress( "Database shutdown or Column family drop during flush"); } From 4eb5a40f7db18500b4f639698eb176eeecb150fc Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 3 Oct 2014 02:10:00 -0700 Subject: [PATCH 217/829] [Java] Fixed link error on library loading on Mac. Summary: Fixed link error on library loading on Mac. Test Plan: make rocksdbjava make jtest --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index ad9082d31..705b7ff97 100644 --- a/Makefile +++ b/Makefile @@ -517,7 +517,6 @@ ROCKSDBJNILIB = librocksdbjni.so ROCKSDB_JAR = rocksdbjni.jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDBJNILIB = librocksdbjni.jnilib JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif From df3373fbf7dcd7af0b0a27779bda0d6f37c05b9b Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 3 Oct 2014 02:14:43 -0700 Subject: [PATCH 218/829] [Java] Fix compile error on DbBenchmark.java Summary: Fix compile error on DbBenchmark.java Test Plan: make rocksdbjava make jdb_bench --- java/org/rocksdb/benchmark/DbBenchmark.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 686d39445..612fdaf28 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -523,8 +523,8 @@ public class DbBenchmark { BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockSize((Long)flags_.get(Flag.block_size)) .setBlockCacheSize((Long)flags_.get(Flag.cache_size)) - .setFilterBitsPerKey((Integer)flags_.get(Flag.bloom_bits)) - .setCacheNumShardBits((Integer)flags_.get(Flag.cache_numshardbits)); + .setCacheNumShardBits( + (Integer)flags_.get(Flag.cache_numshardbits)); options.setTableFormatConfig(table_options); } options.setWriteBufferSize( From a5757ff3c274215a97dee0de705c5b4220b23bc2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 3 Oct 2014 11:50:40 +0200 Subject: [PATCH 219/829] Listing of changes - JavaDoc readability of RocksObject JavaDoc - JavaDoc improvements BlockBasedTableConfig, GenericRateLimiterConfig, RocksDB - JavaDoc improvements MemTableConfig - JavaDoc improvements RocksObject - JavaDoc improvements GenericRateLimiterConfig - JavaDoc improvements ReadOptions - JavaDoc improvements RateLimiterConfig - JavaDoc improvements RestoreOptions - JavaDoc improvements RestoreBackupableDB - JavaDoc improvements BlockBasedTableConfig - JavaDoc improvements Options - JavaDoc improvements BackupableDB and BackupableDBOptions --- java/org/rocksdb/BackupableDB.java | 23 +- java/org/rocksdb/BackupableDBOptions.java | 50 +++-- java/org/rocksdb/BlockBasedTableConfig.java | 23 +- .../org/rocksdb/GenericRateLimiterConfig.java | 34 ++- java/org/rocksdb/MemTableConfig.java | 2 +- 
java/org/rocksdb/Options.java | 204 +++++++++--------- java/org/rocksdb/RateLimiterConfig.java | 9 +- java/org/rocksdb/ReadOptions.java | 5 +- java/org/rocksdb/RestoreBackupableDB.java | 21 +- java/org/rocksdb/RestoreOptions.java | 14 +- java/org/rocksdb/RocksDB.java | 9 +- java/org/rocksdb/RocksObject.java | 77 ++++--- 12 files changed, 275 insertions(+), 196 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 108c4deb5..3ee29b347 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -8,19 +8,19 @@ package org.rocksdb; /** * A subclass of RocksDB which supports backup-related operations. * - * @see BackupableDBOptions + * @see org.rocksdb.BackupableDBOptions */ public class BackupableDB extends RocksDB { /** - * Open a BackupableDB under the specified path. + * Open a {@code BackupableDB} under the specified path. * Note that the backup path should be set properly in the * input BackupableDBOptions. * - * @param opt options for db. - * @param bopt backup related options. - * @param the db path for storing data. The path for storing - * backup should be specified in the BackupableDBOptions. - * @return reference to the opened BackupableDB. + * @param opt {@link org.rocksdb.Options} to set for the database. + * @param bopt {@link org.rocksdb.BackupableDBOptions} to use. + * @param db_path Path to store data to. The path for storing the backup should be + * specified in the {@link org.rocksdb.BackupableDBOptions}. + * @return BackupableDB reference to the opened database. */ public static BackupableDB open( Options opt, BackupableDBOptions bopt, String db_path) @@ -61,10 +61,9 @@ public class BackupableDB extends RocksDB { /** * Close the BackupableDB instance and release resource. * - * Internally, BackupableDB owns the rocksdb::DB pointer to its - * associated RocksDB. The release of that RocksDB pointer is - * handled in the destructor of the c++ rocksdb::BackupableDB and - * should be transparent to Java developers. + * Internally, BackupableDB owns the {@code rocksdb::DB} pointer to its associated + * {@link org.rocksdb.RocksDB}. The release of that RocksDB pointer is handled in the destructor + * of the c++ {@code rocksdb::BackupableDB} and should be transparent to Java developers. */ @Override public synchronized void close() { if (isInitialized()) { @@ -74,7 +73,7 @@ public class BackupableDB extends RocksDB { /** * A protected construction that will be used in the static factory - * method BackupableDB.open(). + * method {@link #open(Options, BackupableDBOptions, String)}. */ protected BackupableDB() { super(); diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java index 2c5047f77..07751a64d 100644 --- a/java/org/rocksdb/BackupableDBOptions.java +++ b/java/org/rocksdb/BackupableDBOptions.java @@ -7,33 +7,41 @@ package org.rocksdb; /** * BackupableDBOptions to control the behavior of a backupable database. - * It will be used during the creation of a BackupableDB. + * It will be used during the creation of a {@link org.rocksdb.BackupableDB}. * * Note that dispose() must be called before an Options instance * become out-of-scope to release the allocated memory in c++. * - * @param path Where to keep the backup files. Has to be different than dbname. - Best to set this to dbname_ + "/backups" - * @param shareTableFiles If share_table_files == true, backup will assume that - * table files with same name have the same contents. 
This enables - * incremental backups and avoids unnecessary data copies. If - * share_table_files == false, each backup will be on its own and will not - * share any data with other backups. default: true - * @param sync If sync == true, we can guarantee you'll get consistent backup - * even on a machine crash/reboot. Backup process is slower with sync - * enabled. If sync == false, we don't guarantee anything on machine reboot. - * However, chances are some of the backups are consistent. Default: true - * @param destroyOldData If true, it will delete whatever backups there are - * already. Default: false - * @param backupLogFiles If false, we won't backup log files. This option can be - * useful for backing up in-memory databases where log file are persisted, - * but table files are in memory. Default: true - * @param backupRateLimit Max bytes that can be transferred in a second during - * backup. If 0 or negative, then go as fast as you can. Default: 0 - * @param restoreRateLimit Max bytes that can be transferred in a second during - * restore. If 0 or negative, then go as fast as you can. Default: 0 + * @see org.rocksdb.BackupableDB */ public class BackupableDBOptions extends RocksObject { + + /** + * BackupableDBOptions constructor + * + * @param path Where to keep the backup files. Has to be different than db name. + * Best to set this to {@code db name_ + "/backups"} + * @param shareTableFiles If {@code share_table_files == true}, backup will assume + * that table files with same name have the same contents. This enables incremental + * backups and avoids unnecessary data copies. If {@code share_table_files == false}, + * each backup will be on its own and will not share any data with other backups. + * Default: true + * @param sync If {@code sync == true}, we can guarantee you'll get consistent backup + * even on a machine crash/reboot. Backup process is slower with sync enabled. + * If {@code sync == false}, we don't guarantee anything on machine reboot. + * However,chances are some of the backups are consistent. + * Default: true + * @param destroyOldData If true, it will delete whatever backups there are already. + * Default: false + * @param backupLogFiles If false, we won't backup log files. This option can be + * useful for backing up in-memory databases where log file are persisted,but table + * files are in memory. + * Default: true + * @param backupRateLimit Max bytes that can be transferred in a second during backup. + * If 0 or negative, then go as fast as you can. Default: 0 + * @param restoreRateLimit Max bytes that can be transferred in a second during restore. + * If 0 or negative, then go as fast as you can. Default: 0 + */ public BackupableDBOptions(String path, boolean shareTableFiles, boolean sync, boolean destroyOldData, boolean backupLogFiles, long backupRateLimit, long restoreRateLimit) { diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index 9a6967a95..2f9f0ac64 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -27,7 +27,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { /** * Disable block cache. If this is set to true, * then no block cache should be used, and the block_cache should - * point to a nullptr object. + * point to a {@code nullptr} object. 
* Default: false * * @param noBlockCache if use block cache @@ -69,7 +69,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * Controls the number of shards for the block cache. * This is applied only if cacheSize is set to non-negative. * - * @param numShardBits the number of shard bits. The resulting + * @param blockCacheNumShardBits the number of shard bits. The resulting * number of shards would be 2 ^ numShardBits. Any negative * number means use default settings." * @return the reference to the current option. @@ -176,13 +176,14 @@ public class BlockBasedTableConfig extends TableFormatConfig { /** * Use the specified filter policy to reduce disk reads. * - * Filter should not be disposed before options instances using this filter is - * disposed. If dispose() function is not called, then filter object will be - * GC'd automatically. + * {@link org.rocksdb.Filter} should not be disposed before options instances + * using this filter is disposed. If {@link Filter#dispose()} function is not + * called, then filter object will be GC'd automatically. * - * Filter instance can be re-used in multiple options instances. + * {@link org.rocksdb.Filter} instance can be re-used in multiple options + * instances. * - * @param Filter Filter Policy java instance. + * @param filter {@link org.rocksdb.Filter} Filter Policy java instance. * @return the reference to the current config. */ public BlockBasedTableConfig setFilter(Filter filter) { @@ -206,7 +207,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { If not specified, each "table reader" object will pre-load index/filter block during table initialization. * - * @param index and filter blocks should be put in block cache. + * @param cacheIndexAndFilterBlocks and filter blocks should be put in block cache. * @return the reference to the current config. */ public BlockBasedTableConfig setCacheIndexAndFilterBlocks( @@ -233,7 +234,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { if true, does not store prefix and allows prefix hash collision (less memory consumption) * - * @param if hash collisions should be allowed. + * @param hashIndexAllowCollision points out if hash collisions should be allowed. * @return the reference to the current config. */ public BlockBasedTableConfig setHashIndexAllowCollision( @@ -256,7 +257,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * Size of compressed block cache. If 0, then block_cache_compressed is set * to null. * - * @param size of compressed block cache. + * @param blockCacheCompressedSize of compressed block cache. * @return the reference to the current config. */ public BlockBasedTableConfig setBlockCacheCompressedSize( @@ -281,7 +282,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * Controls the number of shards for the block compressed cache. * This is applied only if blockCompressedCacheSize is set to non-negative. * - * @param numShardBits the number of shard bits. The resulting + * @param blockCacheCompressedNumShardBits the number of shard bits. The resulting * number of shards would be 2 ^ numShardBits. Any negative * number means use default settings." * @return the reference to the current option. 
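The BlockBasedTableConfig setters documented above are applied to a database through Options.setTableFormatConfig(). A rough usage sketch, pieced together only from the setters referenced in this patch and in DbBenchmark; the class name, path and sizes below are made-up illustrations, not recommended values:

import org.rocksdb.BlockBasedTableConfig;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class TableConfigExample {
  public static void main(String[] args) throws RocksDBException {
    BlockBasedTableConfig tableConfig = new BlockBasedTableConfig();
    tableConfig.setBlockSize(16 * 1024)               // 16 KB data blocks
        .setBlockCacheSize(64 * 1024 * 1024)          // 64 MB uncompressed block cache
        .setCacheNumShardBits(4)                      // 2^4 shards for the block cache
        .setCacheIndexAndFilterBlocks(true);          // keep index/filter blocks in the cache

    Options options = new Options().setCreateIfMissing(true);
    options.setTableFormatConfig(tableConfig);

    RocksDB db = RocksDB.open(options, "/tmp/rocksdbjni-example");
    try {
      db.put("key".getBytes(), "value".getBytes());
    } finally {
      db.close();
      options.dispose();
    }
  }
}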
diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/org/rocksdb/GenericRateLimiterConfig.java index 78b8b37ec..2a2e7b657 100644 --- a/java/org/rocksdb/GenericRateLimiterConfig.java +++ b/java/org/rocksdb/GenericRateLimiterConfig.java @@ -7,18 +7,48 @@ package org.rocksdb; /** * Config for rate limiter, which is used to control write rate of flush and * compaction. + * + * @see RateLimiterConfig */ public class GenericRateLimiterConfig extends RateLimiterConfig { private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); private static final int DEFAULT_FAIRNESS = 10; - + + /** + * GenericRateLimiterConfig constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For example, + * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to + * 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to + * burstier writes while smaller value introduces more CPU overhead. + * The default should work for most cases. + * @param fairness RateLimiter accepts high-pri requests and low-pri requests. + * A low-pri request is usually blocked in favor of hi-pri request. Currently, + * RocksDB assigns low-pri to request from compaction and high-pri to request + * from flush. Low-pri requests can get blocked if flush requests come in + * continuously. This fairness parameter grants low-pri requests permission by + * fairness chance even though high-pri requests exist to avoid starvation. + * You should be good by leaving it at default 10. + */ public GenericRateLimiterConfig(long rateBytesPerSecond, long refillPeriodMicros, int fairness) { rateBytesPerSecond_ = rateBytesPerSecond; refillPeriodMicros_ = refillPeriodMicros; fairness_ = fairness; } - + + /** + * GenericRateLimiterConfig constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to WAL. + */ public GenericRateLimiterConfig(long rateBytesPerSecond) { this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); } diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java index a473c2585..904aa37b5 100644 --- a/java/org/rocksdb/MemTableConfig.java +++ b/java/org/rocksdb/MemTableConfig.java @@ -21,7 +21,7 @@ public abstract class MemTableConfig { * which will create a c++ shared-pointer to the c++ MemTableRepFactory * that associated with the Java MemTableConfig. * - * @see Options.setMemTableFactory() + * @see Options#setMemTableConfig(MemTableConfig) */ abstract protected long newMemTableFactoryHandle(); } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 7ccc74834..642c6c4dd 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -7,10 +7,10 @@ package org.rocksdb; /** * Options to control the behavior of a database. It will be used - * during the creation of a RocksDB (i.e., RocksDB.open()). + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). 
* - * If dispose() function is not called, then it will be GC'd automatically and - * native resources will be released as part of the process. + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. */ public class Options extends RocksObject { static { @@ -30,7 +30,7 @@ public class Options extends RocksObject { * Construct options for opening a RocksDB. * * This constructor will create (by allocating a block of memory) - * an rocksdb::Options in the c++ side. + * an {@code rocksdb::Options} in the c++ side. */ public Options() { super(); @@ -42,13 +42,14 @@ public class Options extends RocksObject { /** * If this value is set to true, then the database will be created - * if it is missing during RocksDB.open(). + * if it is missing during {@code RocksDB.open()}. * Default: false * * @param flag a flag indicating whether to create a database the - * specified database in RocksDB.open() operation is missing. - * @return the instance of the current Options. - * @see RocksDB.open() + * specified database in {@link org.rocksdb.RocksDB#open(Options, String)} operation + * is missing. + * @return the instance of the current Options + * @see org.rocksdb.RocksDB#open(Options, String) */ public Options setCreateIfMissing(boolean flag) { assert(isInitialized()); @@ -59,7 +60,7 @@ public class Options extends RocksObject { /** * Use the specified object to interact with the environment, * e.g. to read/write files, schedule background work, etc. - * Default: RocksEnv.getDefault() + * Default: {@link RocksEnv#getDefault()} */ public Options setEnv(RocksEnv env) { assert(isInitialized()); @@ -79,7 +80,7 @@ public class Options extends RocksObject { * If true, the database will be created if it is missing. * * @return true if the createIfMissing option is set to true. - * @see setCreateIfMissing() + * @see #setCreateIfMissing(boolean) */ public boolean createIfMissing() { assert(isInitialized()); @@ -87,12 +88,12 @@ public class Options extends RocksObject { } /** - * Set BuiltinComparator to be used with RocksDB. + * Set {@link org.rocksdb.Options.BuiltinComparator} to be used with RocksDB. * * Note: Comparator can be set once upon database creation. * * Default: BytewiseComparator. - * @param builtinComparator a BuiltinComparator type. + * @param builtinComparator a {@link org.rocksdb.Options.BuiltinComparator} type. */ public void setBuiltinComparator(BuiltinComparator builtinComparator) { assert(isInitialized()); @@ -106,7 +107,7 @@ public class Options extends RocksObject { * on disk) before converting to a sorted on-disk file. * * Larger values increase performance, especially during bulk loads. - * Up to max_write_buffer_number write buffers may be held in memory + * Up to {@code max_write_buffer_number} write buffers may be held in memory * at the same time, so you may wish to adjust this parameter * to control memory usage. * @@ -116,7 +117,7 @@ public class Options extends RocksObject { * Default: 4MB * @param writeBufferSize the size of write buffer. * @return the instance of the current Options. - * @see RocksDB.open() + * @see org.rocksdb.RocksDB#open(Options, String) */ public Options setWriteBufferSize(long writeBufferSize) { assert(isInitialized()); @@ -128,7 +129,7 @@ public class Options extends RocksObject { * Return size of write buffer size. * * @return size of write buffer. 
- * @see setWriteBufferSize() + * @see #setWriteBufferSize(long) */ public long writeBufferSize() { assert(isInitialized()); @@ -143,7 +144,7 @@ public class Options extends RocksObject { * * @param maxWriteBufferNumber maximum number of write buffers. * @return the instance of the current Options. - * @see RocksDB.open() + * @see org.rocksdb.RocksDB#open(Options, String) */ public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) { assert(isInitialized()); @@ -155,7 +156,7 @@ public class Options extends RocksObject { * Returns maximum number of write buffers. * * @return maximum number of write buffers. - * @see setMaxWriteBufferNumber() + * @see #setMaxWriteBufferNumber(int) */ public int maxWriteBufferNumber() { assert(isInitialized()); @@ -181,9 +182,9 @@ public class Options extends RocksObject { * Default: false * * @param errorIfExists if true, an exception will be thrown - * during RocksDB.open() if the database already exists. + * during {@code RocksDB.open()} if the database already exists. * @return the reference to the current option. - * @see RocksDB.open() + * @see org.rocksdb.RocksDB#open(Options, String) */ public Options setErrorIfExists(boolean errorIfExists) { assert(isInitialized()); @@ -237,8 +238,9 @@ public class Options extends RocksObject { * Number of open files that can be used by the DB. You may need to * increase this if your database has a large working set. Value -1 means * files opened are always kept open. You can estimate number of files based - * on target_file_size_base and target_file_size_multiplier for level-based - * compaction. For universal-style compaction, you can usually set it to -1. + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. * * @return the maximum number of open files. */ @@ -252,8 +254,9 @@ public class Options extends RocksObject { * Number of open files that can be used by the DB. You may need to * increase this if your database has a large working set. Value -1 means * files opened are always kept open. You can estimate number of files based - * on target_file_size_base and target_file_size_multiplier for level-based - * compaction. For universal-style compaction, you can usually set it to -1. + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. * Default: 5000 * * @param maxOpenFiles the maximum number of open files. @@ -271,7 +274,7 @@ public class Options extends RocksObject { * to stable storage. Their contents remain in the OS buffers till the * OS decides to flush them. This option is good for bulk-loading * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffesrs to stable storage. + * sync to the OS to flush all dirty buffers to stable storage. * * @return if true, then data-sync is disabled. */ @@ -286,7 +289,7 @@ public class Options extends RocksObject { * to stable storage. Their contents remain in the OS buffers till the * OS decides to flush them. This option is good for bulk-loading * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffesrs to stable storage. + * sync to the OS to flush all dirty buffers to stable storage. 
* Default: false * * @param disableDataSync a boolean flag to specify whether to @@ -306,7 +309,7 @@ public class Options extends RocksObject { * This parameter should be set to true while storing data to * filesystem like ext3 that can lose files after a reboot. * - * @return true if fsync is used. + * @return boolean value indicating if fsync is used. */ public boolean useFsync() { assert(isInitialized()); @@ -438,7 +441,8 @@ public class Options extends RocksObject { * Default: 1 * * @return the maximum number of concurrent background compaction jobs. - * @see Env.setBackgroundThreads() + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) */ public int maxBackgroundCompactions() { assert(isInitialized()); @@ -451,7 +455,7 @@ public class Options extends RocksObject { it does not use any locks to prevent concurrent updates. * * @return the instance of the current Options. - * @see RocksDB.open() + * @see org.rocksdb.RocksDB#open(Options, String) */ public Options createStatistics() { assert(isInitialized()); @@ -460,11 +464,11 @@ public class Options extends RocksObject { } /** - * Returns statistics object. Calls createStatistics() if - * C++ returns NULL pointer for statistics. + * Returns statistics object. Calls {@link #createStatistics()} if + * C++ returns {@code nullptr} for statistics. * * @return the instance of the statistics object. - * @see createStatistics() + * @see #createStatistics() */ public Statistics statisticsPtr() { assert(isInitialized()); @@ -489,8 +493,9 @@ public class Options extends RocksObject { * compaction jobs. * @return the reference to the current option. * - * @see Env.setBackgroundThreads() - * @see maxBackgroundFlushes() + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundFlushes() */ public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) { assert(isInitialized()); @@ -505,7 +510,8 @@ public class Options extends RocksObject { * Default: 1 * * @return the maximum number of concurrent background flush jobs. - * @see Env.setBackgroundThreads() + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) */ public int maxBackgroundFlushes() { assert(isInitialized()); @@ -519,11 +525,12 @@ public class Options extends RocksObject { * HIGH priority thread pool. For more information, see * Default: 1 * - * @param maxBackgroundFlushes + * @param maxBackgroundFlushes number of max concurrent flush jobs * @return the reference to the current option. * - * @see Env.setBackgroundThreads() - * @see maxBackgroundCompactions() + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) + * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundCompactions() */ public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) { assert(isInitialized()); @@ -713,20 +720,22 @@ public class Options extends RocksObject { /** * WalTtlSeconds() and walSizeLimitMB() affect how archived logs * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - * WAL files will be checked every 10 min and if total size is greater - * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. 
If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. + *
<ol> + * <li>If both set to 0, logs will be deleted asap and will not get into + * the archive.</li> + * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * then WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted.</li> + * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_secondsi / 2 and those that + * are older than WAL_ttl_seconds will be deleted.</li> + * <li>If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.</li> + * </ol>
* * @return the wal-ttl seconds - * @see walSizeLimitMB() + * @see #walSizeLimitMB() */ public long walTtlSeconds() { assert(isInitialized()); @@ -735,23 +744,24 @@ public class Options extends RocksObject { private native long walTtlSeconds(long handle); /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + *
    + *
  1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
  2. + *
  3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, * WAL files will be checked every 10 min and if total size is greater * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * earliest until size_limit is met. All empty files will be deleted.
  4. + *
  5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. + * are older than WAL_ttl_seconds will be deleted.
  6. + *
  7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
  8. * * @param walTtlSeconds the ttl seconds * @return the reference to the current option. - * @see setWalSizeLimitMB() + * @see #setWalSizeLimitMB(long) */ public Options setWalTtlSeconds(long walTtlSeconds) { assert(isInitialized()); @@ -761,22 +771,23 @@ public class Options extends RocksObject { private native void setWalTtlSeconds(long handle, long walTtlSeconds); /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + *
      + *
    1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
    2. + *
    3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, * WAL files will be checked every 10 min and if total size is greater * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. - * + * earliest until size_limit is met. All empty files will be deleted.
    4. + *
    5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds i / 2 and those that + * are older than WAL_ttl_seconds will be deleted.
    6. + *
    7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
    8. + *
    * @return size limit in mega-bytes. - * @see walSizeLimitMB() + * @see #walSizeLimitMB() */ public long walSizeLimitMB() { assert(isInitialized()); @@ -787,21 +798,22 @@ public class Options extends RocksObject { /** * WalTtlSeconds() and walSizeLimitMB() affect how archived logs * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + *
      + *
    1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
    2. + *
    3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, * WAL files will be checked every 10 min and if total size is greater * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * earliest until size_limit is met. All empty files will be deleted.
    4. + *
    5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. + * are older than WAL_ttl_seconds will be deleted.
    6. + *
    7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
    8. * * @param sizeLimitMB size limit in mega-bytes. * @return the reference to the current option. - * @see setWalSizeLimitMB() + * @see #setWalSizeLimitMB(long) */ public Options setWalSizeLimitMB(long sizeLimitMB) { assert(isInitialized()); @@ -857,7 +869,7 @@ public class Options extends RocksObject { * Data being read from file storage may be buffered in the OS * Default: true * - * @param allowOsBufferif true, then OS buffering is allowed. + * @param allowOsBuffer if true, then OS buffering is allowed. * @return the reference to the current option. */ public Options setAllowOsBuffer(boolean allowOsBuffer) { @@ -1122,7 +1134,7 @@ public class Options extends RocksObject { * Memtable format can be set using setTableFormatConfig. * * @return the name of the currently-used memtable factory. - * @see setTableFormatConfig() + * @see #setTableFormatConfig(TableFormatConfig) */ public String memTableFactoryName() { assert(isInitialized()); @@ -1273,7 +1285,7 @@ public class Options extends RocksObject { long handle, int numLevels); /** - * The number of files in leve 0 to trigger compaction from level-0 to + * The number of files in level 0 to trigger compaction from level-0 to * level-1. A value < 0 means that level-0 compaction will not be * triggered by number of files at all. * Default: 4 @@ -1400,7 +1412,7 @@ public class Options extends RocksObject { * * @return the target size of a level-0 file. * - * @see targetFileSizeMultiplier() + * @see #targetFileSizeMultiplier() */ public int targetFileSizeBase() { return targetFileSizeBase(nativeHandle_); @@ -1421,7 +1433,7 @@ public class Options extends RocksObject { * @param targetFileSizeBase the target size of a level-0 file. * @return the reference to the current option. * - * @see setTargetFileSizeMultiplier() + * @see #setTargetFileSizeMultiplier(int) */ public Options setTargetFileSizeBase(int targetFileSizeBase) { setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); @@ -1471,7 +1483,7 @@ public class Options extends RocksObject { * by default 'maxBytesForLevelBase' is 10MB. * * @return the upper-bound of the total size of leve-1 files in bytes. - * @see maxBytesForLevelMultiplier() + * @see #maxBytesForLevelMultiplier() */ public long maxBytesForLevelBase() { return maxBytesForLevelBase(nativeHandle_); @@ -1491,7 +1503,7 @@ public class Options extends RocksObject { * @return maxBytesForLevelBase the upper-bound of the total size of * leve-1 files in bytes. * @return the reference to the current option. - * @see setMaxBytesForLevelMultiplier() + * @see #setMaxBytesForLevelMultiplier(int) */ public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) { setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); @@ -1507,7 +1519,7 @@ public class Options extends RocksObject { * * @return the ratio between the total size of level-(L+1) files and * the total size of level-L files for all L. - * @see maxBytesForLevelBase() + * @see #maxBytesForLevelBase() */ public int maxBytesForLevelMultiplier() { return maxBytesForLevelMultiplier(nativeHandle_); @@ -1522,7 +1534,7 @@ public class Options extends RocksObject { * @param multiplier the ratio between the total size of level-(L+1) * files and the total size of level-L files for all L. * @return the reference to the current option. 
- * @see setMaxBytesForLevelBase() + * @see #setMaxBytesForLevelBase(long) */ public Options setMaxBytesForLevelMultiplier(int multiplier) { setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); @@ -1538,7 +1550,7 @@ public class Options extends RocksObject { * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. * * @return the maximum number of bytes in all compacted files. - * @see sourceCompactionFactor() + * @see #sourceCompactionFactor() */ public int expandedCompactionFactor() { return expandedCompactionFactor(nativeHandle_); @@ -1554,7 +1566,7 @@ public class Options extends RocksObject { * @param expandedCompactionFactor the maximum number of bytes in all * compacted files. * @return the reference to the current option. - * @see setSourceCompactionFactor() + * @see #setSourceCompactionFactor(int) */ public Options setExpandedCompactionFactor(int expandedCompactionFactor) { setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); @@ -1573,7 +1585,7 @@ public class Options extends RocksObject { * a compaction. * * @return the maximum number of bytes in all source files to be compactedo. - * @see expendedCompactionFactor() + * @see #expandedCompactionFactor() */ public int sourceCompactionFactor() { return sourceCompactionFactor(nativeHandle_); @@ -1592,7 +1604,7 @@ public class Options extends RocksObject { * @param sourceCompactionFactor the maximum number of bytes in all * source files to be compacted in a single compaction run. * @return the reference to the current option. - * @see setExpendedCompactionFactor() + * @see #setExpandedCompactionFactor(int) */ public Options setSourceCompactionFactor(int sourceCompactionFactor) { setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); @@ -1979,7 +1991,7 @@ public class Options extends RocksObject { * This value will be used only when a prefix-extractor is specified. * * @return the number of bloom-bits. - * @see useFixedLengthPrefixExtractor() + * @see #useFixedLengthPrefixExtractor(int) */ public int memtablePrefixBloomBits() { return memtablePrefixBloomBits(nativeHandle_); @@ -2037,7 +2049,7 @@ public class Options extends RocksObject { * Default: 0 * * @return the level of locality of bloom-filter probes. - * @see setMemTablePrefixBloomProbes + * @see #setMemtablePrefixBloomProbes(int) */ public int bloomLocality() { return bloomLocality(nativeHandle_); @@ -2149,7 +2161,7 @@ public class Options extends RocksObject { * * Default: 2 * - * @return + * @return min partial merge operands */ public int minPartialMergeOperands() { return minPartialMergeOperands(nativeHandle_); diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java index 22de65921..1b309e6c9 100644 --- a/java/org/rocksdb/RateLimiterConfig.java +++ b/java/org/rocksdb/RateLimiterConfig.java @@ -10,11 +10,12 @@ package org.rocksdb; */ public abstract class RateLimiterConfig { /** - * This function should only be called by Options.setRateLimiter(), - * which will create a c++ shared-pointer to the c++ RateLimiter - * that is associated with the Java RateLimtierConifg. + * This function should only be called by + * {@link org.rocksdb.Options#setRateLimiter(long, long)}, which will + * create a c++ shared-pointer to the c++ {@code RateLimiter} that is associated + * with a Java RateLimiterConfig. 
* - * @see Options.setRateLimiter() + * @see org.rocksdb.Options#setRateLimiter(long, long) */ abstract protected long newRateLimiterHandle(); } diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java index 97c47c7d6..3590a1a87 100644 --- a/java/org/rocksdb/ReadOptions.java +++ b/java/org/rocksdb/ReadOptions.java @@ -64,7 +64,7 @@ public class ReadOptions extends RocksObject { private native boolean fillCache(long handle); /** - * Fill the cache when loading the block-based sst formated db. + * Fill the cache when loading the block-based sst formatted db. * Callers may wish to set this field to false for bulk scans. * Default: true * @@ -86,7 +86,8 @@ public class ReadOptions extends RocksObject { * added data) and is optimized for sequential reads. It will return records * that were inserted into the database after the creation of the iterator. * Default: false - * Not supported in ROCKSDB_LITE mode! + * + * Not supported in {@code ROCKSDB_LITE} mode! * * @return true if tailing iterator is enabled. */ diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index dbde447a0..5bc8dfbec 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -11,9 +11,13 @@ package org.rocksdb; * Note that dispose() must be called before this instance become out-of-scope * to release the allocated memory in c++. * - * @param options Instance of BackupableDBOptions. */ public class RestoreBackupableDB extends RocksObject { + /** + * Constructor + * + * @param options {@link org.rocksdb.BackupableDBOptions} instance + */ public RestoreBackupableDB(BackupableDBOptions options) { super(); nativeHandle_ = newRestoreBackupableDB(options.nativeHandle_); @@ -30,6 +34,12 @@ public class RestoreBackupableDB extends RocksObject { * database will diverge from backups 4 and 5 and the new backup will fail. * If you want to create new backup, you will first have to delete backups 4 * and 5. + * + * @param backupId id pointing to backup + * @param dbDir database directory to restore to + * @param walDir directory where wal files are located + * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance + * @throws RocksDBException */ public void restoreDBFromBackup(long backupId, String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { @@ -39,6 +49,11 @@ public class RestoreBackupableDB extends RocksObject { /** * Restore from the latest backup. + * + * @param dbDir database directory to restore to + * @param walDir directory where wal files are located + * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance + * @throws RocksDBException */ public void restoreDBFromLatestBackup(String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { @@ -49,7 +64,7 @@ public class RestoreBackupableDB extends RocksObject { /** * Deletes old backups, keeping latest numBackupsToKeep alive. * - * @param Number of latest backups to keep + * @param numBackupsToKeep of latest backups to keep */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { purgeOldBackups0(nativeHandle_, numBackupsToKeep); @@ -58,7 +73,7 @@ public class RestoreBackupableDB extends RocksObject { /** * Deletes a specific backup. * - * @param ID of backup to delete. + * @param backupId of backup to delete. 
*/ public void deleteBackup(long backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); diff --git a/java/org/rocksdb/RestoreOptions.java b/java/org/rocksdb/RestoreOptions.java index 77a2b99bc..2325c8f6c 100644 --- a/java/org/rocksdb/RestoreOptions.java +++ b/java/org/rocksdb/RestoreOptions.java @@ -11,13 +11,17 @@ package org.rocksdb; * Note that dispose() must be called before this instance become out-of-scope * to release the allocated memory in c++. * - * @param If true, restore won't overwrite the existing log files in wal_dir. It - * will also move all log files from archive directory to wal_dir. Use this - * option in combination with BackupableDBOptions::backup_log_files = false - * for persisting in-memory databases. - * Default: false */ public class RestoreOptions extends RocksObject { + /** + * Constructor + * + * @param keepLogFiles If true, restore won't overwrite the existing log files in wal_dir. It + * will also move all log files from archive directory to wal_dir. Use this + * option in combination with BackupableDBOptions::backup_log_files = false + * for persisting in-memory databases. + * Default: false + */ public RestoreOptions(boolean keepLogFiles) { super(); nativeHandle_ = newRestoreOptions(keepLogFiles); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index a16586551..3fa2079a8 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -17,7 +17,7 @@ import org.rocksdb.NativeLibraryLoader; * A RocksDB is a persistent ordered map from keys to values. It is safe for * concurrent access from multiple threads without any external synchronization. * All methods of this class could potentially throw RocksDBException, which - * indicates sth wrong at the rocksdb library side and the call failed. + * indicates sth wrong at the RocksDB library side and the call failed. */ public class RocksDB extends RocksObject { public static final int NOT_FOUND = -1; @@ -95,12 +95,11 @@ public class RocksDB extends RocksObject { * set to true. * * @param path the path to the rocksdb. - * @param status an out value indicating the status of the Open(). * @return a rocksdb instance on success, null if the specified rocksdb can * not be opened. * - * @see Options.setCreateIfMissing() - * @see Options.createIfMissing() + * @see Options#setCreateIfMissing(boolean) + * @see org.rocksdb.Options#createIfMissing() */ public static RocksDB open(String path) throws RocksDBException { RocksDB db = new RocksDB(); @@ -280,8 +279,8 @@ public class RocksDB extends RocksObject { /** * Returns a map of keys for which values were found in DB. * - * @param List of keys for which values need to be retrieved. * @param opt Read options. + * @param keys of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * diff --git a/java/org/rocksdb/RocksObject.java b/java/org/rocksdb/RocksObject.java index 353918d2e..828bb4f3c 100644 --- a/java/org/rocksdb/RocksObject.java +++ b/java/org/rocksdb/RocksObject.java @@ -7,16 +7,22 @@ package org.rocksdb; /** * RocksObject is the base-class of all RocksDB classes that has a pointer to - * some c++ rocksdb object. + * some c++ {@code rocksdb} object. * - * RocksObject has dispose() function, which releases its associated c++ resource. + *
      + * RocksObject has {@code dispose()} function, which releases its associated c++ + * resource. + *
      + *
      * This function can be either called manually, or being called automatically - * during the regular Java GC process. However, since Java may wrongly assume a + * during the regular Java GC process. However, since Java may wrongly assume a * RocksObject only contains a long member variable and think it is small in size, - * Java may give RocksObject low priority in the GC process. For this, it is - * suggested to call dispose() manually. However, it is safe to let RocksObject go - * out-of-scope without manually calling dispose() as dispose() will be called - * in the finalizer during the regular GC process. + *
      + *
      Java may give {@code RocksObject} low priority in the GC process. For this, it is + * suggested to call {@code dispose()} manually. However, it is safe to let + * {@code RocksObject} go out-of-scope without manually calling {@code dispose()} + * as {@code dispose()} will be called in the finalizer during the + * regular GC process.
      */ public abstract class RocksObject { protected RocksObject() { @@ -26,16 +32,18 @@ public abstract class RocksObject { /** * Release the c++ object manually pointed by the native handle. - * - * Note that dispose() will also be called during the GC process - * if it was not called before its RocksObject went out-of-scope. + *
      + * Note that {@code dispose()} will also be called during the GC process + * if it was not called before its {@code RocksObject} went out-of-scope. * However, since Java may wrongly wrongly assume those objects are * small in that they seems to only hold a long variable. As a result, * they might have low priority in the GC process. To prevent this, - * it is suggested to call dispose() manually. - * - * Note that once an instance of RocksObject has been disposed, + * it is suggested to call {@code dispose()} manually. + *
      + *
      + * Note that once an instance of {@code RocksObject} has been disposed, * calling its function will lead undefined behavior. + *
      */ public final synchronized void dispose() { if (isOwningNativeHandle() && isInitialized()) { @@ -46,40 +54,41 @@ public abstract class RocksObject { } /** - * The helper function of dispose() which all subclasses of RocksObject - * must implement to release their associated C++ resource. + * The helper function of {@code dispose()} which all subclasses of + * {@code RocksObject} must implement to release their associated + * C++ resource. */ protected abstract void disposeInternal(); /** * Revoke ownership of the native object. - * + *
      * This will prevent the object from attempting to delete the underlying * native object in its finalizer. This must be used when another object * takes over ownership of the native object or both will attempt to delete * the underlying object when garbage collected. - * - * When disOwnNativeHandle() is called, dispose() will simply set nativeHandle_ - * to 0 without releasing its associated C++ resource. As a result, - * incorrectly use this function may cause memory leak, and this function call - * will not affect the return value of isInitialized(). - * - * @see dispose() - * @see isInitialized() + *
      + * When {@code disOwnNativeHandle()} is called, {@code dispose()} will simply set + * {@code nativeHandle_} to 0 without releasing its associated C++ resource. + * As a result, incorrectly use this function may cause memory leak, and this + * function call will not affect the return value of {@code isInitialized()}. + *
      + * @see #dispose() + * @see #isInitialized() */ protected void disOwnNativeHandle() { owningHandle_ = false; } /** - * Returns true if the current RocksObject is responsable to release its - * native handle. + * Returns true if the current {@code RocksObject} is responsible to release + * its native handle. * - * @return true if the current RocksObject is responsible to release its - * native handle. + * @return true if the current {@code RocksObject} is responsible to release + * its native handle. * - * @see disOwnNativeHandle() - * @see dispose() + * @see #disOwnNativeHandle() + * @see #dispose() */ protected boolean isOwningNativeHandle() { return owningHandle_; @@ -90,14 +99,14 @@ public abstract class RocksObject { * * @return true if the associated native handle has been initialized. * - * @see dispose() + * @see #dispose() */ protected boolean isInitialized() { return (nativeHandle_ != 0); } /** - * Simply calls dispose() and release its c++ resource if it has not + * Simply calls {@code dispose()} and release its c++ resource if it has not * yet released. */ @Override protected void finalize() { @@ -110,8 +119,8 @@ public abstract class RocksObject { protected long nativeHandle_; /** - * A flag indicating whether the current RocksObject is responsible to - * release the c++ object stored in its nativeHandle_. + * A flag indicating whether the current {@code RocksObject} is responsible to + * release the c++ object stored in its {@code nativeHandle_}. */ private boolean owningHandle_; } From da8ff9ff896ee4af400c4083c9f78eaf86576457 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 2 Oct 2014 21:31:19 +0200 Subject: [PATCH 220/829] Fixed Findbugs issues - BackupableDB missing call to super.finalize(major) - WriteBatchTest inefficient String usage(minor) - RocksDB local dead variable store(medium) --- java/org/rocksdb/BackupableDB.java | 1 + java/org/rocksdb/RocksDB.java | 2 -- java/org/rocksdb/WriteBatchTest.java | 28 ++++++++++++++-------------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 108c4deb5..1c8e3dc53 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -82,6 +82,7 @@ public class BackupableDB extends RocksDB { @Override protected void finalize() { close(); + super.finalize(); } protected native void open(long rocksDBHandle, long backupDBOptionsHandle); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index a16586551..8985bc3b5 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -103,8 +103,6 @@ public class RocksDB extends RocksObject { * @see Options.createIfMissing() */ public static RocksDB open(String path) throws RocksDBException { - RocksDB db = new RocksDB(); - // This allows to use the rocksjni default Options instead of // the c++ one. 
Options options = new Options(); diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java index 03a866313..770cd85b8 100644 --- a/java/org/rocksdb/WriteBatchTest.java +++ b/java/org/rocksdb/WriteBatchTest.java @@ -53,9 +53,9 @@ public class WriteBatchTest { WriteBatchInternal.setSequence(batch, 100); assert(100 == WriteBatchInternal.sequence(batch)); assert(3 == batch.count()); - assert(new String("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100") + assert(("Put(baz, boo)@102" + + "Delete(box)@101" + + "Put(foo, bar)@100") .equals(new String(getContents(batch), "US-ASCII"))); } catch (UnsupportedEncodingException e) { System.err.println(e); @@ -79,16 +79,16 @@ public class WriteBatchTest { b2.clear(); b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); WriteBatchInternal.append(b1, b2); - assert(new String("Put(a, va)@200" + - "Put(b, vb)@201") + assert(("Put(a, va)@200" + + "Put(b, vb)@201") .equals(new String(getContents(b1), "US-ASCII"))); assert(2 == b1.count()); b2.remove("foo".getBytes("US-ASCII")); WriteBatchInternal.append(b1, b2); - assert(new String("Put(a, va)@200" + - "Put(b, vb)@202" + - "Put(b, vb)@201" + - "Delete(foo)@203") + assert(("Put(a, va)@200" + + "Put(b, vb)@202" + + "Put(b, vb)@201" + + "Delete(foo)@203") .equals(new String(getContents(b1), "US-ASCII"))); assert(4 == b1.count()); } catch (UnsupportedEncodingException e) { @@ -108,11 +108,11 @@ public class WriteBatchTest { batch.putLogData("blob2".getBytes("US-ASCII")); batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); assert(5 == batch.count()); - assert(new String("Merge(foo, bar)@4" + - "Put(k1, v1)@0" + - "Delete(k2)@3" + - "Put(k2, v2)@1" + - "Put(k3, v3)@2") + assert(("Merge(foo, bar)@4" + + "Put(k1, v1)@0" + + "Delete(k2)@3" + + "Put(k2, v2)@1" + + "Put(k3, v3)@2") .equals(new String(getContents(batch), "US-ASCII"))); } catch (UnsupportedEncodingException e) { System.err.println(e); From 05204bb1199bb736495bdcd8a5898514fc5c2dca Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 3 Oct 2014 21:43:47 +0200 Subject: [PATCH 221/829] Lint changes --- java/org/rocksdb/BackupableDB.java | 4 ++-- java/org/rocksdb/CompactionStyle.java | 2 +- java/org/rocksdb/CompressionType.java | 2 +- java/org/rocksdb/GenericRateLimiterConfig.java | 4 ++-- java/org/rocksdb/MemTableConfig.java | 2 +- java/org/rocksdb/Options.java | 4 ++-- java/org/rocksdb/RocksDB.java | 8 ++++---- java/org/rocksdb/StatisticsCollector.java | 10 +++++----- java/org/rocksdb/StatisticsCollectorCallback.java | 8 ++++---- java/org/rocksdb/StatsCollectorInput.java | 8 ++++---- 10 files changed, 26 insertions(+), 26 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 3ee29b347..f8669fff4 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -47,10 +47,10 @@ public class BackupableDB extends RocksDB { public void createNewBackup(boolean flushBeforeBackup) { createNewBackup(nativeHandle_, flushBeforeBackup); } - + /** * Deletes old backups, keeping latest numBackupsToKeep alive. - * + * * @param numBackupsToKeep Number of latest backups to keep. 
*/ public void purgeOldBackups(int numBackupsToKeep) { diff --git a/java/org/rocksdb/CompactionStyle.java b/java/org/rocksdb/CompactionStyle.java index 5c41dfdd2..ade48358e 100644 --- a/java/org/rocksdb/CompactionStyle.java +++ b/java/org/rocksdb/CompactionStyle.java @@ -9,7 +9,7 @@ public enum CompactionStyle { LEVEL((byte) 0), UNIVERSAL((byte) 1), FIFO((byte) 2); - + private final byte value_; private CompactionStyle(byte value) { diff --git a/java/org/rocksdb/CompressionType.java b/java/org/rocksdb/CompressionType.java index c5d6253a9..f29eccb9b 100644 --- a/java/org/rocksdb/CompressionType.java +++ b/java/org/rocksdb/CompressionType.java @@ -12,7 +12,7 @@ public enum CompressionType { BZLIB2_COMPRESSION((byte) 3), LZ4_COMPRESSION((byte) 4), LZ4HC_COMPRESSION((byte) 5); - + private final byte value_; private CompressionType(byte value) { diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/org/rocksdb/GenericRateLimiterConfig.java index 2a2e7b657..5023822a6 100644 --- a/java/org/rocksdb/GenericRateLimiterConfig.java +++ b/java/org/rocksdb/GenericRateLimiterConfig.java @@ -52,12 +52,12 @@ public class GenericRateLimiterConfig extends RateLimiterConfig { public GenericRateLimiterConfig(long rateBytesPerSecond) { this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); } - + @Override protected long newRateLimiterHandle() { return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_, fairness_); } - + private native long newRateLimiterHandle(long rateBytesPerSecond, long refillPeriodMicros, int fairness); private final long rateBytesPerSecond_; diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java index 904aa37b5..a69b1008f 100644 --- a/java/org/rocksdb/MemTableConfig.java +++ b/java/org/rocksdb/MemTableConfig.java @@ -21,7 +21,7 @@ public abstract class MemTableConfig { * which will create a c++ shared-pointer to the c++ MemTableRepFactory * that associated with the Java MemTableConfig. * - * @see Options#setMemTableConfig(MemTableConfig) + * @see Options#setMemTableConfig(MemTableConfig) */ abstract protected long newMemTableFactoryHandle(); } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 642c6c4dd..b0989363b 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -20,7 +20,7 @@ public class Options extends RocksObject { static final int DEFAULT_NUM_SHARD_BITS = -1; /** - * Builtin RocksDB comparators + * Builtin RocksDB comparators */ public enum BuiltinComparator { BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR; @@ -1115,7 +1115,7 @@ public class Options extends RocksObject { setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } - + /** * Use to control write rate of flush and compaction. Flush has higher * priority than compaction. Rate limiting is disabled if nullptr. diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 3fa2079a8..facd2914b 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -323,16 +323,16 @@ public class RocksDB extends RocksObject { throws RocksDBException { remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } - + /** * DB implementations can export properties about their state via this method. If "property" is a valid property understood by this DB implementation, fills "*value" with its current value and returns true. Otherwise returns false. 
- - + + Valid property names include: - + "rocksdb.num-files-at-level" - return the number of files at level , where is an ASCII representation of a level number (e.g. "0"). "rocksdb.stats" - returns a multi-line string that describes statistics diff --git a/java/org/rocksdb/StatisticsCollector.java b/java/org/rocksdb/StatisticsCollector.java index 29815c46d..965637697 100644 --- a/java/org/rocksdb/StatisticsCollector.java +++ b/java/org/rocksdb/StatisticsCollector.java @@ -29,9 +29,9 @@ public class StatisticsCollector { /** * Constructor for statistics collector. - * + * * @param statsCollectorInputList List of statistics collector input. - * @param statsCollectionIntervalInMilliSeconds Statistics collection time + * @param statsCollectionIntervalInMilliSeconds Statistics collection time * period (specified in milliseconds). */ public StatisticsCollector(List statsCollectorInputList, @@ -48,7 +48,7 @@ public class StatisticsCollector { /** * Shuts down statistics collector. - * + * * @param shutdownTimeout Time in milli-seconds to wait for shutdown before * killing the collection process. */ @@ -70,13 +70,13 @@ public class StatisticsCollector { try { if(Thread.currentThread().isInterrupted()) { break; - } + } for(StatsCollectorInput statsCollectorInput : _statsCollectorInputList) { Statistics statistics = statsCollectorInput.getStatistics(); StatisticsCollectorCallback statsCallback = statsCollectorInput.getCallback(); - + // Collect ticker data for(TickerType ticker : TickerType.values()) { long tickerValue = statistics.getTickerCount(ticker); diff --git a/java/org/rocksdb/StatisticsCollectorCallback.java b/java/org/rocksdb/StatisticsCollectorCallback.java index a955ec216..b8d7a24ec 100644 --- a/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/java/org/rocksdb/StatisticsCollectorCallback.java @@ -7,13 +7,13 @@ package org.rocksdb; /** * Callback interface provided to StatisticsCollector. - * + * * Thread safety: - * StatisticsCollector doesn't make any guarantees about thread safety. + * StatisticsCollector doesn't make any guarantees about thread safety. * If the same reference of StatisticsCollectorCallback is passed to multiple - * StatisticsCollector references, then its the responsibility of the + * StatisticsCollector references, then its the responsibility of the * user to make StatisticsCollectorCallback's implementation thread-safe. - * + * * @param tickerType * @param tickerCount */ diff --git a/java/org/rocksdb/StatsCollectorInput.java b/java/org/rocksdb/StatsCollectorInput.java index a1aa928d3..890977cdf 100644 --- a/java/org/rocksdb/StatsCollectorInput.java +++ b/java/org/rocksdb/StatsCollectorInput.java @@ -12,10 +12,10 @@ package org.rocksdb; public class StatsCollectorInput { private final Statistics _statistics; private final StatisticsCollectorCallback _statsCallback; - + /** * Constructor for StatsCollectorInput. - * + * * @param statistics Reference of DB statistics. * @param statsCallback Reference of statistics callback interface. */ @@ -24,11 +24,11 @@ public class StatsCollectorInput { _statistics = statistics; _statsCallback = statsCallback; } - + public Statistics getStatistics() { return _statistics; } - + public StatisticsCollectorCallback getCallback() { return _statsCallback; } From 69d4c5123e812a2c88dd94abc2bf1a39b0a04dbd Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 4 Oct 2014 11:17:06 +0200 Subject: [PATCH 222/829] Cross-platform fix version.sh version.sh now works also on linux properly. 
--- build_tools/version.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build_tools/version.sh b/build_tools/version.sh index afa7ed277..c5a8595fb 100755 --- a/build_tools/version.sh +++ b/build_tools/version.sh @@ -1,14 +1,14 @@ #!/bin/sh -if [ $# == 0 ]; then +if [ "$#" = "0" ]; then echo "Usage: $0 major|minor|patch" exit 1 fi -if [ $1 = "major" ]; then +if [ "$1" = "major" ]; then cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}' fi -if [ $1 = "minor" ]; then +if [ "$1" = "minor" ]; then cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}' fi -if [ $1 = "patch" ]; then +if [ "$1" = "patch" ]; then cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}' fi From 22c64be439a9e1082e1400093731f74d439d348a Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 5 Oct 2014 10:46:55 +0200 Subject: [PATCH 223/829] Cross platform fix for Java benchmark shell script. --- java/jdb_bench.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh index dba7dbd31..92ee6e3db 100755 --- a/java/jdb_bench.sh +++ b/java/jdb_bench.sh @@ -1 +1,7 @@ -java -server -d64 -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@ +PLATFORM=64 +if [ `getconf LONG_BIT` != "64" ] +then + PLATFORM=32 +fi +echo "Running benchmark in $PLATFORM-Bit mode." +java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@ From 1c7c764761899845d8755e8a8ba88aefcef4c3a9 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 5 Oct 2014 21:51:09 +0200 Subject: [PATCH 224/829] Replaced obsolete comparator with builtin variant. 
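For context on the change above: the hand-rolled ReverseComparator in the test is replaced by the library-provided ReverseBytewiseComparator(), which returns a built-in singleton comparator. As a minimal sketch (not part of this patch) of how caller code picks up the same built-in ordering, assuming a fresh, illustrative database path and otherwise default options:

    #include <cassert>
    #include "rocksdb/comparator.h"
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // Built-in singleton; the caller does not own or delete it.
      options.comparator = rocksdb::ReverseBytewiseComparator();

      rocksdb::DB* db = nullptr;
      // "/tmp/reverse_cmp_example" is an illustrative path, not from the patch.
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/reverse_cmp_example", &db);
      assert(s.ok());

      db->Put(rocksdb::WriteOptions(), "a", "1");
      db->Put(rocksdb::WriteOptions(), "b", "2");

      // Reverse bytewise ordering: iteration starts at the largest key.
      rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
      it->SeekToFirst();
      assert(it->Valid() && it->key() == rocksdb::Slice("b"));

      delete it;
      delete db;
      return 0;
    }

The same comparator pointer is what write_batch_with_index_test.cc now passes to its dummy column-family handle in the diff below, so the test exercises the library-provided ordering instead of a locally defined one.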
--- .../write_batch_with_index_test.cc | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index ad8c110c1..1152c7b88 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -235,27 +235,9 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { } } -class ReverseComparator : public Comparator { - public: - ReverseComparator() {} - - virtual const char* Name() const override { - return "rocksdb.ReverseComparator"; - } - - virtual int Compare(const Slice& a, const Slice& b) const override { - return 0 - BytewiseComparator()->Compare(a, b); - } - - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const {} - virtual void FindShortSuccessor(std::string* key) const {} -}; - TEST(WriteBatchWithIndexTest, TestComparatorForCF) { - ReverseComparator reverse_cmp; ColumnFamilyHandleImplDummy cf1(6, nullptr); - ColumnFamilyHandleImplDummy reverse_cf(66, &reverse_cmp); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); WriteBatchWithIndex batch(BytewiseComparator(), 20); From 6b2c1d962052cf1b5e6ebd24d74db22bf827ba05 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 6 Oct 2014 08:20:56 -0700 Subject: [PATCH 225/829] make publish jni jars depend on release jni jars --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5d3f954de..9877eca04 100644 --- a/Makefile +++ b/Makefile @@ -567,7 +567,7 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib -rocksdbjavastaticpublish: +rocksdbjavastaticpublish: rocksdbjavastaticrelease mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 From d44871e80f6aef63b5fa5f50155d8fc1f47a6b09 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 6 Oct 2014 08:23:31 -0700 Subject: [PATCH 226/829] fix java doc directory in git ignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0e53ea35b..4ed05c5e3 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,4 @@ unity.cc java/crossbuild/.vagrant .vagrant/ java/**.asc -java/javadocs +java/javadoc From 1e5a52815be9b4c84e3c3e2b6c3ab615bd4a907f Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Mon, 6 Oct 2014 08:24:51 -0700 Subject: [PATCH 
227/829] update release readme --- java/RELEASE.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/java/RELEASE.md b/java/RELEASE.md index d2028073c..16feae6ee 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -45,11 +45,7 @@ Then update rocksjni.pom's version tag to reflect the release version. From RocksDB's root directory, first build the Java static JARs: - make jclean clean rocksdbjavastaticrelease - -Then publish the release to Sonatype: - - make rocksdbjavastaticpublish + make jclean clean rocksdbjavastaticpublish This command will [stage the JAR artifacts on the Sonatype staging repository](http://central.sonatype.org/pages/manual-staging-bundle-creation-and-deployment.html). To release the staged artifacts. From b87db071523a20af9ccee157ebe520e76089e24f Mon Sep 17 00:00:00 2001 From: Nik Bougalis Date: Sat, 4 Oct 2014 14:38:37 -0700 Subject: [PATCH 228/829] Avoid dereferencing a null field --- utilities/document/document_db.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index b19618533..6540c2d8c 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -385,13 +385,13 @@ class SimpleSortedIndex : public Index { override { auto value = document.Get(field_); if (value == nullptr) { - // null if (!EncodeJSONPrimitive(JSONDocument(JSONDocument::kNull), key)) { assert(false); } - } - if (!EncodeJSONPrimitive(*value, key)) { - assert(false); + } else { + if (!EncodeJSONPrimitive(*value, key)) { + assert(false); + } } } virtual const Comparator* GetComparator() const override { From d6169954bd7861d4ee951bcf4c9e6892e5fa64b2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 7 Oct 2014 17:40:19 +0200 Subject: [PATCH 229/829] Removed code which prevents `arc lint` from working properly. 
--- linters/lint_engine/FacebookFbcodeLintEngine.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 6765c33d1..131b34efb 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -44,10 +44,6 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { if (!$this->getCommitHookMode()) { $cpp_linters = array(); $google_linter = new ArcanistCpplintLinter(); - $google_linter->setConfig(array( - 'lint.cpplint.prefix' => '', - 'lint.cpplint.bin' => 'cpplint', - )); $cpp_linters[] = $linters[] = $google_linter; $cpp_linters[] = $linters[] = new FbcodeCppLinter(); $cpp_linters[] = $linters[] = new PfffCppLinter(); From 63eade401a880e531d86f325c9553cdecd01853c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 7 Oct 2014 09:47:16 -0700 Subject: [PATCH 230/829] Fix error introduced by merge --- utilities/write_batch_with_index/write_batch_with_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index f5bab2637..481ec6867 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -10,6 +10,7 @@ #include "util/arena.h" namespace rocksdb { + class ReadableWriteBatch : public WriteBatch { public: explicit ReadableWriteBatch(size_t reserved_bytes = 0) @@ -120,7 +121,6 @@ class WBWIIteratorImpl : public WBWIIterator { } } }; -} // namespace struct WriteBatchWithIndex::Rep { Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) From 25f6a852e41a6dfa80b37b738bbb029a69b77c76 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 7 Oct 2014 10:40:45 -0700 Subject: [PATCH 231/829] add db_test for changing memtable size Summary: The test only covers changing write_buffer_size. Other changable parameters such bloom bits/probes are not obvious how to test. 
Suggestions are welcome Test Plan: db_test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24429 --- db/db_test.cc | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index d402a3578..c67c45786 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8538,6 +8538,57 @@ TEST(DBTest, DisableDataSyncTest) { } } +TEST(DBTest, DynamicMemtableOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k5KB = 5 * 1024; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.max_background_compactions = 4; + options.max_mem_compaction_level = 0; + options.write_buffer_size = k64KB; + options.max_write_buffer_number = 2; + // Don't trigger compact/slowdown/stop + options.level0_file_num_compaction_trigger = 1024; + options.level0_slowdown_writes_trigger = 1024; + options.level0_stop_writes_trigger = 1024; + DestroyAndReopen(&options); + + auto gen_l0_kb = [this](int size) { + Random rnd(301); + std::vector values; + for (int i = 0; i < size; i++) { + values.push_back(RandomString(&rnd, 1024)); + ASSERT_OK(Put(Key(i), values[i])); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + gen_l0_kb(64); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_TRUE(SizeAtLevel(0) < k64KB + k5KB); + ASSERT_TRUE(SizeAtLevel(0) > k64KB - k5KB); + + // Clean up L0 + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Increase buffer size + ASSERT_TRUE(dbfull()->SetOptions({ + {"write_buffer_size", "131072"}, + })); + + // The existing memtable is still 64KB in size, after it becomes immutable, + // the next memtable will be 128KB in size. 
Write 256KB total, we should + // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data + gen_l0_kb(256); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + ASSERT_TRUE(SizeAtLevel(0) < k128KB + k64KB + 2 * k5KB); + ASSERT_TRUE(SizeAtLevel(0) > k128KB + k64KB - 2 * k5KB); +} + TEST(DBTest, DynamicCompactionOptions) { const uint64_t k64KB = 1 << 16; const uint64_t k128KB = 1 << 17; From f78b832e5d9df08a9809983dee5942d1a5f8ad0d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 7 Oct 2014 10:40:57 -0700 Subject: [PATCH 232/829] Log RocksDB version Summary: This will be much easier than reviewing git sha's we currently have in our LOGs Test Plan: none Reviewers: sdong, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24591 --- db/db_impl.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 7463f749b..5abfb4ac2 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -49,6 +49,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/version.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -75,7 +76,7 @@ namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); -void DumpLeveldbBuildVersion(Logger * log); +void DumpRocksDBBuildVersion(Logger * log); struct DBImpl::WriteContext { autovector superversions_to_free_; @@ -364,7 +365,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( versions_->GetColumnFamilySet(), &flush_scheduler_)); - DumpLeveldbBuildVersion(db_options_.info_log.get()); + DumpRocksDBBuildVersion(db_options_.info_log.get()); DumpDBFileSummary(db_options_, dbname_); db_options_.Dump(db_options_.info_log.get()); @@ -4883,9 +4884,12 @@ Status DestroyDB(const std::string& dbname, const Options& options) { // // A global method that can dump out the build version -void DumpLeveldbBuildVersion(Logger * log) { +void DumpRocksDBBuildVersion(Logger * log) { #if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_vesion, so we don't generate util/build_version.cc + // if we compile with Xcode, we don't run build_detect_vesion, so we don't + // generate util/build_version.cc + Log(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, + ROCKSDB_PATCH); Log(log, "Git sha %s", rocksdb_build_git_sha); Log(log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); From 4f272408c5171f3355210ad3735065f185790c98 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 7 Oct 2014 20:43:04 +0200 Subject: [PATCH 233/829] RocksJava Makefile includes incorrect paths to version.h --- java/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/Makefile b/java/Makefile index b2038355c..7da76f3f7 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,8 +1,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter 
org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig -ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) -ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) From 88edfd90ae296785a6ba158e0d3f1e742d6b76b7 Mon Sep 17 00:00:00 2001 From: Tomislav Novak Date: Tue, 23 Sep 2014 15:52:28 -0700 Subject: [PATCH 234/829] SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997 --- db/db_bench.cc | 46 ++++++++++++- include/rocksdb/memtablerep.h | 11 ++++ util/skiplistrep.cc | 120 ++++++++++++++++++++++++++++++++-- 3 files changed, 168 insertions(+), 9 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index bbd807c2c..f04ab8144 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -86,7 +86,8 @@ DEFINE_string(benchmarks, "xxhash," "compress," "uncompress," - "acquireload,", + "acquireload," + "fillseekseq,", "Comma-separated list of operations to run in the specified order" "Actual benchmarks:\n" @@ -129,6 +130,8 @@ DEFINE_string(benchmarks, "\tcrc32c -- repeated crc32c of 4K of data\n" "\txxhash -- repeated xxHash of 4K of data\n" "\tacquireload -- load N*1000 times\n" + "\tfillseekseq -- write N values in sequential key, then read " + "them by seeking to each key\n" "Meta operations:\n" "\tcompact -- Compact the entire DB\n" "\tstats -- Print DB stats\n" @@ -165,6 +168,9 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." 
DEFINE_int32(value_size, 100, "Size of each value"); +DEFINE_int32(seekseq_next, 0, "How many times to call Next() after Seek() in " + "fillseekseq"); + DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); DEFINE_int64(batch_size, 1, "Batch size"); @@ -565,6 +571,9 @@ DEFINE_string(merge_operator, "", "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" " utilities/merge_operators.h"); +DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try " + "linear search first for this many steps from the previous " + "position"); static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit); @@ -1326,6 +1335,8 @@ class Benchmark { method = &Benchmark::MergeRandom; } else if (name == Slice("randomwithverify")) { method = &Benchmark::RandomWithVerify; + } else if (name == Slice("fillseekseq")) { + method = &Benchmark::WriteSeqSeekSeq; } else if (name == Slice("compact")) { method = &Benchmark::Compact; } else if (name == Slice("crc32c")) { @@ -1717,7 +1728,8 @@ class Benchmark { FLAGS_hash_bucket_count)); break; case kSkipList: - // no need to do anything + options.memtable_factory.reset(new SkipListFactory( + FLAGS_skip_list_lookahead)); break; case kHashLinkedList: options.memtable_factory.reset(NewHashLinkListRepFactory( @@ -2791,6 +2803,36 @@ class Benchmark { thread->stats.AddMessage(msg); } + void WriteSeqSeekSeq(ThreadState* thread) { + writes_ = FLAGS_num; + DoWrite(thread, SEQUENTIAL); + // exclude writes from the ops/sec calculation + thread->stats.Start(thread->tid); + + DB* db = SelectDB(thread); + std::unique_ptr iter( + db->NewIterator(ReadOptions(FLAGS_verify_checksum, true))); + + Slice key = AllocateKey(); + for (int64_t i = 0; i < FLAGS_num; ++i) { + GenerateKeyFromInt(i, FLAGS_num, &key); + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + + for (int j = 0; j < FLAGS_seekseq_next && i+1 < FLAGS_num; ++j) { + iter->Next(); + GenerateKeyFromInt(++i, FLAGS_num, &key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + } + + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + } + } + void Compact(ThreadState* thread) { DB* db = SelectDB(thread); db->CompactRange(nullptr, nullptr); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index b7fc39c81..8c2d7201b 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -186,12 +186,23 @@ class MemTableRepFactory { }; // This uses a skip list to store keys. It is the default. +// +// Parameters: +// lookahead: If non-zero, each iterator's seek operation will start the +// search from the previously visited record (doing at most 'lookahead' +// steps). This is an optimization for the access pattern including many +// seeks with consecutive keys. 
class SkipListFactory : public MemTableRepFactory { public: + explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Arena*, const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } + + private: + const size_t lookahead_; }; #ifndef ROCKSDB_LITE diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index a3c940d0e..1322f6c9a 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -12,9 +12,16 @@ namespace rocksdb { namespace { class SkipListRep : public MemTableRep { SkipList skip_list_; + const MemTableRep::KeyComparator& cmp_; + const SliceTransform* transform_; + const size_t lookahead_; + + friend class LookaheadIterator; public: - explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) - : MemTableRep(arena), skip_list_(compare, arena) { + explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, const size_t lookahead) + : MemTableRep(arena), skip_list_(compare, arena), cmp_(compare), + transform_(transform), lookahead_(lookahead) { } // Insert key into the list. @@ -106,11 +113,110 @@ public: std::string tmp_; // For passing to EncodeKey }; + // Iterator over the contents of a skip list which also keeps track of the + // previously visited node. In Seek(), it examines a few nodes after it + // first, falling back to O(log n) search from the head of the list only if + // the target key hasn't been found. + class LookaheadIterator : public MemTableRep::Iterator { + public: + explicit LookaheadIterator(const SkipListRep& rep) : + rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} + + virtual ~LookaheadIterator() override {} + + virtual bool Valid() const override { + return iter_.Valid(); + } + + virtual const char *key() const override { + assert(Valid()); + return iter_.key(); + } + + virtual void Next() override { + assert(Valid()); + + bool advance_prev = true; + if (prev_.Valid()) { + auto k1 = rep_.UserKey(prev_.key()); + auto k2 = rep_.UserKey(iter_.key()); + + if (k1.compare(k2) == 0) { + // same user key, don't move prev_ + advance_prev = false; + } else if (rep_.transform_) { + // only advance prev_ if it has the same prefix as iter_ + auto t1 = rep_.transform_->Transform(k1); + auto t2 = rep_.transform_->Transform(k2); + advance_prev = t1.compare(t2) == 0; + } + } + + if (advance_prev) { + prev_ = iter_; + } + iter_.Next(); + } + + virtual void Prev() override { + assert(Valid()); + iter_.Prev(); + prev_ = iter_; + } + + virtual void Seek(const Slice& internal_key, const char *memtable_key) + override { + const char *encoded_key = + (memtable_key != nullptr) ? 
+ memtable_key : EncodeKey(&tmp_, internal_key); + + if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { + // prev_.key() is smaller or equal to our target key; do a quick + // linear search (at most lookahead_ steps) starting from prev_ + iter_ = prev_; + + size_t cur = 0; + while (cur++ <= rep_.lookahead_ && iter_.Valid()) { + if (rep_.cmp_(encoded_key, iter_.key()) <= 0) { + return; + } + Next(); + } + } + + iter_.Seek(encoded_key); + prev_ = iter_; + } + + virtual void SeekToFirst() override { + iter_.SeekToFirst(); + prev_ = iter_; + } + + virtual void SeekToLast() override { + iter_.SeekToLast(); + prev_ = iter_; + } + + protected: + std::string tmp_; // For passing to EncodeKey + + private: + const SkipListRep& rep_; + SkipList::Iterator iter_; + SkipList::Iterator prev_; + }; + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { - if (arena == nullptr) { - return new SkipListRep::Iterator(&skip_list_); + if (lookahead_ > 0) { + void *mem = + arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) + : operator new(sizeof(SkipListRep::LookaheadIterator)); + return new (mem) SkipListRep::LookaheadIterator(*this); } else { - auto mem = arena->AllocateAligned(sizeof(SkipListRep::Iterator)); + void *mem = + arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator)) + : operator new(sizeof(SkipListRep::Iterator)); return new (mem) SkipListRep::Iterator(&skip_list_); } } @@ -119,8 +225,8 @@ public: MemTableRep* SkipListFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform*, Logger* logger) { - return new SkipListRep(compare, arena); + const SliceTransform* transform, Logger* logger) { + return new SkipListRep(compare, arena, transform, lookahead_); } } // namespace rocksdb From 1d525891bdf06b7dbc0d5f18e3981f2af9d686ef Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 7 Oct 2014 11:59:30 -0700 Subject: [PATCH 235/829] Update HISTORY for 3.6 --- HISTORY.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 41c49cc1a..7451a8dc8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,6 @@ # Rocksdb Change Log -## Unreleased (will be released with 3.6) +## 3.6.0 (10/7/2014) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy @@ -13,8 +13,6 @@ * Change target_file_size_base type to uint64_t from int. * Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. ------ Past Releases ----- - ## 3.5.0 (9/3/2014) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it. 
From 5e43155b3fc35c280c773831d9cb20f4109f93d0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 6 Oct 2014 23:34:27 +0200 Subject: [PATCH 236/829] RocksJava should support not only BlockBased Bloomfilter --- java/org/rocksdb/BloomFilter.java | 102 +++++++++++++++++++++++--- java/org/rocksdb/test/FilterTest.java | 4 + java/rocksjni/filter.cc | 12 +-- 3 files changed, 103 insertions(+), 15 deletions(-) diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java index 9c4913a8c..21915ed66 100644 --- a/java/org/rocksdb/BloomFilter.java +++ b/java/org/rocksdb/BloomFilter.java @@ -6,32 +6,114 @@ package org.rocksdb; /** - * This class creates a new filter policy that uses a bloom filter - * with approximately the specified number of bits per key. - * A good value for bitsPerKey is 10, which yields a filter - * with ~ 1% false positive rate. - * - * Default value of bits per key is 10. + * BloomFilter */ public class BloomFilter extends Filter { + private static final int DEFAULT_BITS_PER_KEY = 10; + private static final boolean DEFAULT_MODE = true; private final int bitsPerKey_; + private final boolean useBlockBasedMode_; + /** + * Bloom filter policy that uses a bloom filter with approximately + * the specified number of bits per key. + * + *
      + * bits_per_key: bits per key in bloom filter. A good value for bits_per_key + * is 10, which yields a filter with ~ 1% false positive rate. + *
      default bits_per_key: 10
      + *
      + *
      use_block_based_builder: use block based filter rather than full filter. + * If you want to builder full filter, it needs to be set to false. + *
      + *
      default mode: block based filter
      + *
      + * Callers must delete the result after any database that is using the + * result has been closed.
      + *
      + * Note: if you are using a custom comparator that ignores some parts + * of the keys being compared, you must not use this {@code BloomFilter} + * and must provide your own FilterPolicy that also ignores the + * corresponding parts of the keys. For example, if the comparator + * ignores trailing spaces, it would be incorrect to use a + * FilterPolicy (like {@code BloomFilter}) that does not ignore + * trailing spaces in keys.
      + */ public BloomFilter() { - this(DEFAULT_BITS_PER_KEY); + this(DEFAULT_BITS_PER_KEY, DEFAULT_MODE); } + /** + * Bloom filter policy that uses a bloom filter with approximately + * the specified number of bits per key. + * + *
      + * bits_per_key: bits per key in bloom filter. A good value for bits_per_key + * is 10, which yields a filter with ~ 1% false positive rate. + *
      + *
      use_block_based_builder: use block based filter rather than full filter. + * If you want to builder full filter, it needs to be set to false. + *
      + *
      default mode: block based filter
      + *
      + * Callers must delete the result after any database that is using the + * result has been closed.
      + *
      + * Note: if you are using a custom comparator that ignores some parts + * of the keys being compared, you must not use this {@code BloomFilter} + * and must provide your own FilterPolicy that also ignores the + * corresponding parts of the keys. For example, if the comparator + * ignores trailing spaces, it would be incorrect to use a + * FilterPolicy (like {@code BloomFilter}) that does not ignore + * trailing spaces in keys.
      + * + * @param bitsPerKey number of bits to use + */ public BloomFilter(int bitsPerKey) { + this(bitsPerKey, DEFAULT_MODE); + } + + /** + * Bloom filter policy that uses a bloom filter with approximately + * the specified number of bits per key. + * + *
      + * bits_per_key: bits per key in bloom filter. A good value for bits_per_key + * is 10, which yields a filter with ~ 1% false positive rate. + *
      default bits_per_key: 10
      + *
      + *
      use_block_based_builder: use block based filter rather than full filter. + * If you want to builder full filter, it needs to be set to false. + *
      + *
      default mode: block based filter
      + *
      + * Callers must delete the result after any database that is using the + * result has been closed.
      + *
      + * Note: if you are using a custom comparator that ignores some parts + * of the keys being compared, you must not use this {@code BloomFilter} + * and must provide your own FilterPolicy that also ignores the + * corresponding parts of the keys. For example, if the comparator + * ignores trailing spaces, it would be incorrect to use a + * FilterPolicy (like {@code BloomFilter}) that does not ignore + * trailing spaces in keys.
+   *
+   * @param bitsPerKey number of bits to use
+   * @param useBlockBasedMode use block based mode or full filter mode
+   */
+  public BloomFilter(int bitsPerKey, boolean useBlockBasedMode) {
     super();
     bitsPerKey_ = bitsPerKey;
-
+    useBlockBasedMode_ = useBlockBasedMode;
     createNewFilter();
   }
 
   @Override
   protected void createNewFilter() {
-    createNewFilter0(bitsPerKey_);
+    createNewBloomFilter(bitsPerKey_, useBlockBasedMode_);
   }
 
-  private native void createNewFilter0(int bitsKeyKey);
+  private native void createNewBloomFilter(int bitsKeyKey,
+      boolean useBlockBasedMode);
 }
diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java
index 7475d2c34..00214d033 100644
--- a/java/org/rocksdb/test/FilterTest.java
+++ b/java/org/rocksdb/test/FilterTest.java
@@ -22,6 +22,10 @@ public class FilterTest {
     blockConfig = new BlockBasedTableConfig();
     blockConfig.setFilter(new BloomFilter());
     options.setTableFormatConfig(blockConfig);
+    blockConfig.setFilter(new BloomFilter(10));
+    options.setTableFormatConfig(blockConfig);
+    blockConfig.setFilter(new BloomFilter(10, false));
+    options.setTableFormatConfig(blockConfig);
     System.out.println("Filter test passed");
   }
 }
diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc
index 572b4a66d..1b5d368b6 100644
--- a/java/rocksjni/filter.cc
+++ b/java/rocksjni/filter.cc
@@ -18,12 +18,14 @@
 /*
  * Class:     org_rocksdb_BloomFilter
- * Method:    createNewFilter0
- * Signature: (I)V
+ * Method:    createBloomFilter
+ * Signature: (IZ)V
  */
-void Java_org_rocksdb_BloomFilter_createNewFilter0(
-    JNIEnv* env, jobject jobj, jint bits_per_key) {
-  const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key);
+void Java_org_rocksdb_BloomFilter_createNewBloomFilter(
+    JNIEnv* env, jobject jobj, jint bits_per_key,
+    jboolean use_block_base_builder) {
+  const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key,
+      use_block_base_builder);
   rocksdb::FilterJni::setHandle(env, jobj, fp);
 }

From ced61295749dcb0d3b03b937b84ad968c491d9c1 Mon Sep 17 00:00:00 2001
From: fyrz
Date: Tue, 7 Oct 2014 09:09:33 +0200
Subject: [PATCH 237/829] Improved JavaDoc

---
 java/org/rocksdb/BloomFilter.java | 58 ++++++++-----------------------
 1 file changed, 14 insertions(+), 44 deletions(-)

diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java
index 21915ed66..6772d2f54 100644
--- a/java/org/rocksdb/BloomFilter.java
+++ b/java/org/rocksdb/BloomFilter.java
@@ -6,7 +6,17 @@
 package org.rocksdb;
 
 /**
- * BloomFilter
+ * Bloom filter policy that uses a bloom filter with approximately
+ * the specified number of bits per key.
+ *
+ * Note: if you are using a custom comparator that ignores some parts
+ * of the keys being compared, you must not use this {@code BloomFilter}
+ * and must provide your own FilterPolicy that also ignores the
+ * corresponding parts of the keys. For example, if the comparator
+ * ignores trailing spaces, it would be incorrect to use a
+ * FilterPolicy (like {@code BloomFilter}) that does not ignore
+ * trailing spaces in keys.
  */
 public class BloomFilter extends Filter {
@@ -16,57 +26,26 @@ public class BloomFilter extends Filter {
   private final boolean useBlockBasedMode_;
 
   /**
-   * Bloom filter policy that uses a bloom filter with approximately
-   * the specified number of bits per key.
+   * BloomFilter constructor
    *
-   * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
-   * is 10, which yields a filter with ~ 1% false positive rate.
-   * default bits_per_key: 10
-   *
-   * use_block_based_builder: use block based filter rather than full filter.
-   * If you want to builder full filter, it needs to be set to false.
-   * default mode: block based filter
-   *
    * Callers must delete the result after any database that is using the
    * result has been closed.
-   *
-   * Note: if you are using a custom comparator that ignores some parts
-   * of the keys being compared, you must not use this {@code BloomFilter}
-   * and must provide your own FilterPolicy that also ignores the
-   * corresponding parts of the keys. For example, if the comparator
-   * ignores trailing spaces, it would be incorrect to use a
-   * FilterPolicy (like {@code BloomFilter}) that does not ignore
-   * trailing spaces in keys.
    */
   public BloomFilter() {
     this(DEFAULT_BITS_PER_KEY, DEFAULT_MODE);
   }
 
   /**
-   * Bloom filter policy that uses a bloom filter with approximately
-   * the specified number of bits per key.
+   * BloomFilter constructor
    *
    * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
    * is 10, which yields a filter with ~ 1% false positive rate.
    *
-   * use_block_based_builder: use block based filter rather than full filter.
-   * If you want to builder full filter, it needs to be set to false.
-   *
-   * default mode: block based filter
-   *
    * Callers must delete the result after any database that is using the
    * result has been closed.
-   *
-   * Note: if you are using a custom comparator that ignores some parts
-   * of the keys being compared, you must not use this {@code BloomFilter}
-   * and must provide your own FilterPolicy that also ignores the
-   * corresponding parts of the keys. For example, if the comparator
-   * ignores trailing spaces, it would be incorrect to use a
-   * FilterPolicy (like {@code BloomFilter}) that does not ignore
-   * trailing spaces in keys.
    *
    * @param bitsPerKey number of bits to use
    */
@@ -75,8 +54,7 @@ public class BloomFilter extends Filter {
   }
 
   /**
-   * Bloom filter policy that uses a bloom filter with approximately
-   * the specified number of bits per key.
+   * BloomFilter constructor
    *
    * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
@@ -90,14 +68,6 @@ public class BloomFilter extends Filter {
    *
    * Callers must delete the result after any database that is using the
    * result has been closed.
-   *
-   * Note: if you are using a custom comparator that ignores some parts
-   * of the keys being compared, you must not use this {@code BloomFilter}
-   * and must provide your own FilterPolicy that also ignores the
-   * corresponding parts of the keys. For example, if the comparator
-   * ignores trailing spaces, it would be incorrect to use a
-   * FilterPolicy (like {@code BloomFilter}) that does not ignore
-   * trailing spaces in keys.
      * * @param bitsPerKey number of bits to use * @param useBlockBasedMode use block based mode or full filter mode From b7d3d6ebc58d3fd2576e09c47b72d8d1ca763bb0 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 9 Oct 2014 20:07:12 -0700 Subject: [PATCH 238/829] db_bench: set thread pool size according to max_background_flushes Summary: option max_background_flushes doesn't make sense if thread pool size is not set accordingly. Set the thread pool size as what we do for max_background_compactions. Test Plan: Run db_bench with max_background_flushes > 1 Reviewers: yhchiang, igor, rven, ljin Reviewed By: ljin Subscribers: MarkCallaghan, leveldb Differential Revision: https://reviews.facebook.net/D24717 --- db/db_bench.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/db/db_bench.cc b/db/db_bench.cc index f04ab8144..6d611ae1c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2904,6 +2904,9 @@ int main(int argc, char** argv) { // The number of background threads should be at least as much the // max number of concurrent compactions. FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + FLAGS_env->SetBackgroundThreads(FLAGS_max_background_flushes, + rocksdb::Env::Priority::HIGH); + // Choose a location for the test database if none given with --db= if (FLAGS_db.empty()) { std::string default_db_path; From f18b4a4847a309f4134d01b873a3715999f114b2 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 10 Oct 2014 09:55:40 -0700 Subject: [PATCH 239/829] minor update to benchmark script Summary: Try to match some parameters from Dhruba's benchmarks on github Test Plan: ran it Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D24687 --- tools/benchmark.sh | 19 ++++++++++--------- tools/run_flash_bench.sh | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/benchmark.sh b/tools/benchmark.sh index cde545801..431999340 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -28,7 +28,7 @@ fi num_read_threads=${NUM_READ_THREADS:-16} writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) -cache_size=$((16 * G)) +cache_size=$((1 * G)) duration=${DURATION:-0} num_keys=${NUM_KEYS:-$((1 * G))} @@ -45,13 +45,14 @@ const_params=" --block_size=4096 \ --cache_size=$cache_size \ --cache_numshardbits=6 \ - --compression_type=snappy \ + --compression_type=zlib \ + --min_level_to_compress=2 \ --compression_ratio=0.5 \ \ --hard_rate_limit=2 \ --rate_limit_delay_max_milliseconds=1000000 \ --write_buffer_size=$((128 * M)) \ - --max_write_buffer_number=2 \ + --max_write_buffer_number=3 \ --target_file_size_base=$((128 * M)) \ --max_bytes_for_level_base=$((1 * G)) \ \ @@ -71,9 +72,9 @@ const_params=" --open_files=$((20 * K))" l0_config=" - --level0_file_num_compaction_trigger=8 \ - --level0_slowdown_writes_trigger=16 \ - --level0_stop_writes_trigger=24" + --level0_file_num_compaction_trigger=4 \ + --level0_slowdown_writes_trigger=8 \ + --level0_stop_writes_trigger=12" if [ $duration -gt 0 ]; then const_params="$const_params --duration=$duration" @@ -82,9 +83,9 @@ fi params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1" params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16" params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \ - --level0_file_num_compaction_trigger=$((100 * M)) \ - --level0_slowdown_writes_trigger=$((100 * M)) \ - 
--level0_stop_writes_trigger=$((100 * M))" + --level0_file_num_compaction_trigger=$((10 * M)) \ + --level0_slowdown_writes_trigger=$((10 * M)) \ + --level0_stop_writes_trigger=$((10 * M))" function run_bulkload { echo "Bulk loading $num_keys random keys into database..." diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh index be7d1631f..affebe27c 100755 --- a/tools/run_flash_bench.sh +++ b/tools/run_flash_bench.sh @@ -10,7 +10,7 @@ G=$((1024 * M)) n=$((1 * G)) wps=$((80 * K)) -duration=$((6 * 60 * 60)) +duration=$((12 * 60 * 60)) num_read_threads=24 # Update these parameters before execution !!! From cd0d581ff5154ea4d99c278599805ee6ed83745d Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 10 Oct 2014 10:00:12 -0700 Subject: [PATCH 240/829] convert Options from string Summary: Allow accepting Options as a string of key/value pairs Test Plan: unit test Reviewers: yhchiang, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D24597 --- HISTORY.md | 6 + include/rocksdb/options.h | 5 - util/options_helper.cc | 101 +++++++++++++- util/options_test.cc | 234 +++++++++++++++++++------------- utilities/options/convenience.h | 40 ++++++ 5 files changed, 284 insertions(+), 102 deletions(-) create mode 100644 utilities/options/convenience.h diff --git a/HISTORY.md b/HISTORY.md index 7451a8dc8..06660a0e8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ # Rocksdb Change Log +## Unreleased + +### Public API changes +* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() + + ## 3.6.0 (10/7/2014) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 467c7bb1e..d9a82fd5a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1012,11 +1012,6 @@ extern Options GetOptions(size_t total_write_buffer_limit, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); -bool GetOptionsFromStrings( - const Options& base_options, - const std::unordered_map& options_map, - Options* new_options); - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/util/options_helper.cc b/util/options_helper.cc index 2a61c8b69..67726dc8f 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -4,6 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. 
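For illustration, a rough sketch of how the four conversion helpers introduced by this patch might be called from application code, assuming the utilities/options/convenience.h header added here and the key/value syntax accepted by the StringToMap parser below (';'-separated pairs, optional trailing ';', whitespace around '=' trimmed); option names and values are only examples:

  #include <cassert>
  #include <string>

  #include "rocksdb/options.h"
  #include "utilities/options/convenience.h"  // header added by this patch

  int main() {
    rocksdb::ColumnFamilyOptions base_cf_opt;  // start from defaults
    rocksdb::ColumnFamilyOptions new_cf_opt;
    // Keys are ColumnFamilyOptions field names; an unknown key or an
    // unparsable value makes the call return false.
    bool ok = rocksdb::GetColumnFamilyOptionsFromString(
        base_cf_opt, "write_buffer_size=67108864; max_write_buffer_number=3;",
        &new_cf_opt);
    assert(ok);

    rocksdb::DBOptions base_db_opt;
    rocksdb::DBOptions new_db_opt;
    ok = rocksdb::GetDBOptionsFromString(
        base_db_opt, "create_if_missing=true;max_background_compactions=4",
        &new_db_opt);
    assert(ok);
    return ok ? 0 : 1;
  }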
#include +#include #include #include "rocksdb/options.h" #include "util/options_helper.h" @@ -161,13 +162,61 @@ bool GetMutableOptionsFromStrings( return true; } -bool GetOptionsFromStrings( - const Options& base_options, - const std::unordered_map& options_map, - Options* new_options) { +namespace { + +std::string trim(const std::string& str) { + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start <= end) { + ++start; + } + while (isspace(str[end]) != 0 && start <= end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +bool StringToMap(const std::string& opts_str, + std::unordered_map* opts_map) { + assert(opts_map); + // Example: + // opts_str = "write_buffer_size=1024;max_write_buffer_number=2" + size_t pos = 0; + + std::string opts = trim(opts_str); + while (pos < opts.size()) { + size_t eq_pos = opts.find('=', pos); + if (eq_pos == std::string::npos) { + return false; + } + std::string key = trim(opts.substr(pos, eq_pos - pos)); + + size_t sc_pos = opts.find(';', eq_pos + 1); + if (sc_pos == std::string::npos) { + (*opts_map)[key] = trim(opts.substr(eq_pos + 1)); + // It either ends with a trailing semi-colon or the last key-value pair + break; + } else { + (*opts_map)[key] = trim(opts.substr(eq_pos + 1, sc_pos - eq_pos - 1)); + } + pos = sc_pos + 1; + } + + return true; +} + +} // anonymous namespace + +bool GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options) { assert(new_options); *new_options = base_options; - for (const auto& o : options_map) { + for (const auto& o : opts_map) { try { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { @@ -247,7 +296,36 @@ bool GetOptionsFromStrings( new_options->bloom_locality = ParseUint32(o.second); } else if (o.first == "min_partial_merge_operands") { new_options->min_partial_merge_operands = ParseUint32(o.second); - } else if (o.first == "create_if_missing") { + } else { + return false; + } + } catch (std::exception) { + return false; + } + } + return true; +} + +bool GetColumnFamilyOptionsFromString( + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options) { + std::unordered_map opts_map; + if (!StringToMap(opts_str, &opts_map)) { + return false; + } + return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options); +} + +bool GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : opts_map) { + try { + if (o.first == "create_if_missing") { new_options->create_if_missing = ParseBoolean(o.first, o.second); } else if (o.first == "create_missing_column_families") { new_options->create_missing_column_families = @@ -325,4 +403,15 @@ bool GetOptionsFromStrings( return true; } +bool GetDBOptionsFromString( + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { + std::unordered_map opts_map; + if (!StringToMap(opts_str, &opts_map)) { + return false; + } + return GetDBOptionsFromMap(base_options, opts_map, new_options); +} + } // namespace rocksdb diff --git a/util/options_test.cc b/util/options_test.cc index 1e26c343d..6f6745aa0 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -17,6 +17,7 @@ #include "rocksdb/options.h" 
#include "util/testharness.h" +#include "utilities/options/convenience.h" using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); @@ -77,8 +78,8 @@ TEST(OptionsTest, LooseCondition) { PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } -TEST(OptionsTest, GetOptionsFromStringsTest) { - std::unordered_map options_map = { +TEST(OptionsTest, GetOptionsFromMapTest) { + std::unordered_map cf_options_map = { {"write_buffer_size", "1"}, {"max_write_buffer_number", "2"}, {"min_write_buffer_number_to_merge", "3"}, @@ -120,7 +121,10 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, - {"min_partial_merge_operands", "31"}, + {"min_partial_merge_operands", "31"} + }; + + std::unordered_map db_options_map = { {"create_if_missing", "false"}, {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, @@ -154,98 +158,146 @@ TEST(OptionsTest, GetOptionsFromStringsTest) { {"bytes_per_sync", "47"}, }; - Options base_opt; - Options new_opt; - ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); - ASSERT_EQ(new_opt.write_buffer_size, 1U); - ASSERT_EQ(new_opt.max_write_buffer_number, 2); - ASSERT_EQ(new_opt.min_write_buffer_number_to_merge, 3); - ASSERT_EQ(new_opt.compression, kSnappyCompression); - ASSERT_EQ(new_opt.compression_per_level.size(), 6U); - ASSERT_EQ(new_opt.compression_per_level[0], kNoCompression); - ASSERT_EQ(new_opt.compression_per_level[1], kSnappyCompression); - ASSERT_EQ(new_opt.compression_per_level[2], kZlibCompression); - ASSERT_EQ(new_opt.compression_per_level[3], kBZip2Compression); - ASSERT_EQ(new_opt.compression_per_level[4], kLZ4Compression); - ASSERT_EQ(new_opt.compression_per_level[5], kLZ4HCCompression); - ASSERT_EQ(new_opt.compression_opts.window_bits, 4); - ASSERT_EQ(new_opt.compression_opts.level, 5); - ASSERT_EQ(new_opt.compression_opts.strategy, 6); - ASSERT_EQ(new_opt.num_levels, 7); - ASSERT_EQ(new_opt.level0_file_num_compaction_trigger, 8); - ASSERT_EQ(new_opt.level0_slowdown_writes_trigger, 9); - ASSERT_EQ(new_opt.level0_stop_writes_trigger, 10); - ASSERT_EQ(new_opt.max_mem_compaction_level, 11); - ASSERT_EQ(new_opt.target_file_size_base, static_cast(12)); - ASSERT_EQ(new_opt.target_file_size_multiplier, 13); - ASSERT_EQ(new_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier, 15); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional.size(), 3U); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[0], 16); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[1], 17); - ASSERT_EQ(new_opt.max_bytes_for_level_multiplier_additional[2], 18); - ASSERT_EQ(new_opt.expanded_compaction_factor, 19); - ASSERT_EQ(new_opt.source_compaction_factor, 20); - ASSERT_EQ(new_opt.max_grandparent_overlap_factor, 21); - ASSERT_EQ(new_opt.soft_rate_limit, 1.1); - ASSERT_EQ(new_opt.hard_rate_limit, 2.1); - ASSERT_EQ(new_opt.arena_block_size, 22U); - ASSERT_EQ(new_opt.disable_auto_compactions, true); - ASSERT_EQ(new_opt.purge_redundant_kvs_while_flush, true); - ASSERT_EQ(new_opt.compaction_style, kCompactionStyleLevel); - ASSERT_EQ(new_opt.verify_checksums_in_compaction, false); - ASSERT_EQ(new_opt.compaction_options_fifo.max_table_files_size, + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ASSERT_TRUE(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); + 
ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level.size(), 6U); + ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.num_levels, 7); + ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_cf_opt.max_mem_compaction_level, 11); + ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); + ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18); + ASSERT_EQ(new_cf_opt.expanded_compaction_factor, 19); + ASSERT_EQ(new_cf_opt.source_compaction_factor, 20); + ASSERT_EQ(new_cf_opt.max_grandparent_overlap_factor, 21); + ASSERT_EQ(new_cf_opt.soft_rate_limit, 1.1); + ASSERT_EQ(new_cf_opt.hard_rate_limit, 2.1); + ASSERT_EQ(new_cf_opt.arena_block_size, 22U); + ASSERT_EQ(new_cf_opt.disable_auto_compactions, true); + ASSERT_EQ(new_cf_opt.purge_redundant_kvs_while_flush, true); + ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_cf_opt.verify_checksums_in_compaction, false); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, static_cast(23)); - ASSERT_EQ(new_opt.filter_deletes, false); - ASSERT_EQ(new_opt.max_sequential_skip_in_iterations, + ASSERT_EQ(new_cf_opt.filter_deletes, false); + ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, static_cast(24)); - ASSERT_EQ(new_opt.inplace_update_support, true); - ASSERT_EQ(new_opt.inplace_update_num_locks, 25U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_bits, 26U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_probes, 27U); - ASSERT_EQ(new_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); - ASSERT_EQ(new_opt.bloom_locality, 29U); - ASSERT_EQ(new_opt.max_successive_merges, 30U); - ASSERT_EQ(new_opt.min_partial_merge_operands, 31U); - ASSERT_EQ(new_opt.create_if_missing, false); - ASSERT_EQ(new_opt.create_missing_column_families, true); - ASSERT_EQ(new_opt.error_if_exists, false); - ASSERT_EQ(new_opt.paranoid_checks, true); - ASSERT_EQ(new_opt.max_open_files, 32); - ASSERT_EQ(new_opt.max_total_wal_size, static_cast(33)); - ASSERT_EQ(new_opt.disableDataSync, false); - ASSERT_EQ(new_opt.use_fsync, true); - ASSERT_EQ(new_opt.db_log_dir, "/db_log_dir"); - ASSERT_EQ(new_opt.wal_dir, "/wal_dir"); - ASSERT_EQ(new_opt.delete_obsolete_files_period_micros, + ASSERT_EQ(new_cf_opt.inplace_update_support, true); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U); + 
ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 26U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_probes, 27U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); + ASSERT_EQ(new_cf_opt.bloom_locality, 29U); + ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.min_partial_merge_operands, 31U); + + cf_options_map["write_buffer_size"] = "hello"; + ASSERT_TRUE(!GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["write_buffer_size"] = "1"; + ASSERT_TRUE(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["unknown_option"] = "1"; + ASSERT_TRUE(!GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + ASSERT_TRUE(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); + ASSERT_EQ(new_db_opt.disableDataSync, false); + ASSERT_EQ(new_db_opt.use_fsync, true); + ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros, static_cast(34)); - ASSERT_EQ(new_opt.max_background_compactions, 35); - ASSERT_EQ(new_opt.max_background_flushes, 36); - ASSERT_EQ(new_opt.max_log_file_size, 37U); - ASSERT_EQ(new_opt.log_file_time_to_roll, 38U); - ASSERT_EQ(new_opt.keep_log_file_num, 39U); - ASSERT_EQ(new_opt.max_manifest_file_size, static_cast(40)); - ASSERT_EQ(new_opt.table_cache_numshardbits, 41); - ASSERT_EQ(new_opt.table_cache_remove_scan_count_limit, 42); - ASSERT_EQ(new_opt.WAL_ttl_seconds, static_cast(43)); - ASSERT_EQ(new_opt.WAL_size_limit_MB, static_cast(44)); - ASSERT_EQ(new_opt.manifest_preallocation_size, 45U); - ASSERT_EQ(new_opt.allow_os_buffer, false); - ASSERT_EQ(new_opt.allow_mmap_reads, true); - ASSERT_EQ(new_opt.allow_mmap_writes, false); - ASSERT_EQ(new_opt.is_fd_close_on_exec, true); - ASSERT_EQ(new_opt.skip_log_error_on_recovery, false); - ASSERT_EQ(new_opt.stats_dump_period_sec, 46U); - ASSERT_EQ(new_opt.advise_random_on_open, true); - ASSERT_EQ(new_opt.use_adaptive_mutex, false); - ASSERT_EQ(new_opt.bytes_per_sync, static_cast(47)); + ASSERT_EQ(new_db_opt.max_background_compactions, 35); + ASSERT_EQ(new_db_opt.max_background_flushes, 36); + ASSERT_EQ(new_db_opt.max_log_file_size, 37U); + ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_db_opt.table_cache_remove_scan_count_limit, 42); + ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U); + ASSERT_EQ(new_db_opt.allow_os_buffer, false); + ASSERT_EQ(new_db_opt.allow_mmap_reads, true); + ASSERT_EQ(new_db_opt.allow_mmap_writes, false); + ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_db_opt.bytes_per_sync, 
static_cast(47)); +} - options_map["write_buffer_size"] = "hello"; - ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); - options_map["write_buffer_size"] = "1"; - ASSERT_TRUE(GetOptionsFromStrings(base_opt, options_map, &new_opt)); - options_map["unknown_option"] = "1"; - ASSERT_TRUE(!GetOptionsFromStrings(base_opt, options_map, &new_opt)); +TEST(OptionsTest, GetOptionsFromStringTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=5", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 5); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=6;", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 6); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 7); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 8); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 9); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 11); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); + // Wrong name "max_write_buffer_number_" + ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", + &new_cf_opt)); + // Wrong key/value pair + ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + // Error Paring value + ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + // Missing option name + ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13; =100;", &new_cf_opt)); } } // namespace rocksdb diff --git a/utilities/options/convenience.h b/utilities/options/convenience.h new file mode 100644 index 000000000..5d7b6d116 --- /dev/null +++ b/utilities/options/convenience.h @@ -0,0 +1,40 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +// Take a map of option name and option value, apply them into the +// base_options, and return the new options as a result +bool GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options); + +bool GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options); + +// Take a string representation of option names and values, apply them into the +// base_options, and return the new options as a result. 
The string has the +// following format: +// "write_buffer_size=1024;max_write_buffer_number=2" +bool GetColumnFamilyOptionsFromString( + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); + +bool GetDBOptionsFromString( + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + +} // namespace rocksdb From 5a7618634042d9766e75d030986ab2f5584b44b9 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 10 Oct 2014 14:10:16 -0700 Subject: [PATCH 241/829] Fixed compile error on Mac: default arguments for lambda expressions Summary: Fixed the following compile error on Mac. db/db_test.cc:8618:52: error: C++11 forbids default arguments for lambda expressions [-Werror,-Wlambda-extensions] auto gen_l0_kb = [this](int start, int size, int stride = 1) { ^ ~ 1 error generated. Test Plan: db_test --- db/db_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index c67c45786..f516a488f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8615,7 +8615,7 @@ TEST(DBTest, DynamicCompactionOptions) { options.max_bytes_for_level_multiplier = 4; DestroyAndReopen(&options); - auto gen_l0_kb = [this](int start, int size, int stride = 1) { + auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); std::vector values; for (int i = 0; i < size; i++) { @@ -8627,11 +8627,11 @@ TEST(DBTest, DynamicCompactionOptions) { // Write 3 files that have the same key range, trigger compaction and // result in one L1 file - gen_l0_kb(0, 128); + gen_l0_kb(0, 128, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 128); + gen_l0_kb(0, 128, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 128); + gen_l0_kb(0, 128, 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,1", FilesPerLevel()); std::vector metadata; @@ -8646,9 +8646,9 @@ TEST(DBTest, DynamicCompactionOptions) { {"target_file_size_base", "65536"} })); - gen_l0_kb(0, 128); + gen_l0_kb(0, 128, 1); ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 128); + gen_l0_kb(0, 128, 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); From 3ead857a0d54ef4674eb7a19fd85fe18300b4ce3 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 10 Oct 2014 14:19:51 -0700 Subject: [PATCH 242/829] Fixed Mac compile error in util/options_test.cc Summary: Fixed the following error in Mac: ./util/testharness.h:93:19: error: comparison of integers of different signs: 'const unsigned long' and 'const int' [-Werror,-Wsign-compare] BINARY_OP(IsEq, ==) ~~~~~~~~~~~~~~~~^~~ ./util/testharness.h:86:14: note: expanded from macro 'BINARY_OP' if (! 
(x op y)) { \ ^ util/options_test.cc:269:3: note: in instantiation of function template specialization 'rocksdb::test::Tester::IsEq' requested here ASSERT_EQ(new_cf_opt.write_buffer_size, 5); ^ Test Plan: options_test --- util/options_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/util/options_test.cc b/util/options_test.cc index 6f6745aa0..f1258b8ed 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -266,24 +266,24 @@ TEST(OptionsTest, GetOptionsFromStringTest) { ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=5", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 5); + ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=6;", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 6); + ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, " write_buffer_size = 7 ", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 7); + ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, " write_buffer_size = 8 ; ", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 8); + ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 9); + ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=11; max_write_buffer_number = 12 ;", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 11); + ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); // Wrong name "max_write_buffer_number_" ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, From f441b273ae69124455e0ce2341c09d87ca94aed3 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 10 Oct 2014 13:31:28 -0700 Subject: [PATCH 243/829] WriteBatchWithIndex to support an option to overwrite rows when operating the same key Summary: With a new option, when accepting a new key, WriteBatchWithIndex will find an existing index of the same key, and replace the content of it. Test Plan: Add a unit test case. 
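As a minimal sketch, the overwrite_key mode added by this patch could be exercised roughly as follows (constructor signature as declared in write_batch_with_index.h below; keys, values and the function name are illustrative):

  #include <memory>

  #include "rocksdb/comparator.h"
  #include "rocksdb/utilities/write_batch_with_index.h"

  void OverwriteKeyExample() {
    // overwrite_key = true: a later update to a key replaces its index entry
    // instead of adding a second entry for the same key.
    rocksdb::WriteBatchWithIndex batch(rocksdb::BytewiseComparator(),
                                       0 /* reserved_bytes */,
                                       true /* overwrite_key */);
    batch.Put("key1", "v1");
    batch.Put("key1", "v2");  // replaces the index entry from the first Put
    batch.Delete("key2");

    std::unique_ptr<rocksdb::WBWIIterator> iter(batch.NewIterator());
    iter->Seek("key1");
    // With overwrite_key=true the iterator sees a single entry for "key1",
    // carrying the latest value "v2"; with the default (false), both updates
    // would appear as separate entries in the order of update time.
  }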
Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24753 --- HISTORY.md | 1 + .../utilities/write_batch_with_index.h | 10 +- .../write_batch_with_index.cc | 145 ++++++++++++------ .../write_batch_with_index_test.cc | 76 +++++++++ 4 files changed, 182 insertions(+), 50 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 06660a0e8..b72bce080 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ ### Public API changes * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() +* Remove WriteBatchWithIndex.Delete() overloads using SliceParts ## 3.6.0 (10/7/2014) diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 85c80850f..4aafacaf5 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -19,7 +19,6 @@ namespace rocksdb { class ColumnFamilyHandle; -struct SliceParts; class Comparator; enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord }; @@ -62,9 +61,12 @@ class WriteBatchWithIndex { // interface, or we can't find a column family from the column family handle // passed in, backup_index_comparator will be used for the column family. // reserved_bytes: reserved bytes in underlying WriteBatch + // overwrite_key: if true, overwrite the key in the index when inserting + // the same key as previously, so iterator will never + // show two entries with the same key. explicit WriteBatchWithIndex( const Comparator* backup_index_comparator = BytewiseComparator(), - size_t reserved_bytes = 0); + size_t reserved_bytes = 0, bool overwrite_key = false); virtual ~WriteBatchWithIndex(); WriteBatch* GetWriteBatch(); @@ -84,10 +86,6 @@ class WriteBatchWithIndex { virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key); virtual void Delete(const Slice& key); - virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key); - - virtual void Delete(const SliceParts& key); - // Create an iterator of a column family. User can call iterator.Seek() to // search to the next entry of or after a key. Keys will be iterated in the // order given by index_comparator. 
For multiple updates on the same key, diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 481ec6867..8cc5686f6 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -45,6 +45,9 @@ class WriteBatchEntryComparator { int operator()(const WriteBatchIndexEntry* entry1, const WriteBatchIndexEntry* entry2) const; + int CompareKey(uint32_t column_family, const Slice& key1, + const Slice& key2) const; + void SetComparatorForCF(uint32_t column_family_id, const Comparator* comparator) { cf_comparator_map_[column_family_id] = comparator; @@ -89,6 +92,10 @@ class WBWIIteratorImpl : public WBWIIterator { virtual Status status() const override { return status_; } + const WriteBatchIndexEntry* GetRawEntry() const { + return skip_list_iter_.key(); + } + private: uint32_t column_family_id_; WriteBatchEntrySkipList::Iterator skip_list_iter_; @@ -123,32 +130,90 @@ class WBWIIteratorImpl : public WBWIIterator { }; struct WriteBatchWithIndex::Rep { - Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) + Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, + bool overwrite_key = false) : write_batch(reserved_bytes), comparator(index_comparator, &write_batch), - skip_list(comparator, &arena) {} + skip_list(comparator, &arena), + overwrite_key(overwrite_key), + last_entry_offset(0) {} ReadableWriteBatch write_batch; WriteBatchEntryComparator comparator; Arena arena; WriteBatchEntrySkipList skip_list; + bool overwrite_key; + size_t last_entry_offset; + + // Remember current offset of internal write batch, which is used as + // the starting offset of the next record. + void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); } + + // In overwrite mode, find the existing entry for the same key and update it + // to point to the current entry. + // Return true if the key is found and updated. + bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key); + bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key); + + // Add the recent entry to the update. + // In overwrite mode, if key already exists in the index, update it. + void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key); + void AddOrUpdateIndex(const Slice& key); + + // Allocate an index entry pointing to the last entry in the write batch and + // put it to skip list. 
+ void AddNewEntry(uint32_t column_family_id); +}; + +bool WriteBatchWithIndex::Rep::UpdateExistingEntry( + ColumnFamilyHandle* column_family, const Slice& key) { + uint32_t cf_id = GetColumnFamilyID(column_family); + return UpdateExistingEntryWithCfId(cf_id, key); +} + +bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId( + uint32_t column_family_id, const Slice& key) { + if (!overwrite_key) { + return false; + } - WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { + WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch); + iter.Seek(key); + if (!iter.Valid()) { + return false; + } + if (comparator.CompareKey(column_family_id, key, iter.Entry().key) != 0) { + return false; + } + WriteBatchIndexEntry* non_const_entry = + const_cast(iter.GetRawEntry()); + non_const_entry->offset = last_entry_offset; + return true; +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex( + ColumnFamilyHandle* column_family, const Slice& key) { + if (!UpdateExistingEntry(column_family, key)) { uint32_t cf_id = GetColumnFamilyID(column_family); const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); if (cf_cmp != nullptr) { comparator.SetComparatorForCF(cf_id, cf_cmp); } + AddNewEntry(cf_id); + } +} - return GetEntryWithCfId(cf_id); +void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key) { + if (!UpdateExistingEntryWithCfId(0, key)) { + AddNewEntry(0); } +} - WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { +void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) { auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); - auto* index_entry = new (mem) - WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); - return index_entry; + auto* index_entry = + new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id); + skip_list.Insert(index_entry); } -}; Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, @@ -191,8 +256,9 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, } WriteBatchWithIndex::WriteBatchWithIndex( - const Comparator* default_index_comparator, size_t reserved_bytes) - : rep(new Rep(default_index_comparator, reserved_bytes)) {} + const Comparator* default_index_comparator, size_t reserved_bytes, + bool overwrite_key) + : rep(new Rep(default_index_comparator, reserved_bytes, overwrite_key)) {} WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; } @@ -210,28 +276,28 @@ WBWIIterator* WriteBatchWithIndex::NewIterator( void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Put(column_family, key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Put(const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntryWithCfId(0); + rep->SetLastEntryOffset(); rep->write_batch.Put(key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Merge(column_family, key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntryWithCfId(0); + 
rep->SetLastEntryOffset(); rep->write_batch.Merge(key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } void WriteBatchWithIndex::PutLogData(const Slice& blob) { @@ -240,28 +306,15 @@ void WriteBatchWithIndex::PutLogData(const Slice& blob) { void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, const Slice& key) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Delete(column_family, key); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Delete(const Slice& key) { - auto* index_entry = rep->GetEntryWithCfId(0); + rep->SetLastEntryOffset(); rep->write_batch.Delete(key); - rep->skip_list.Insert(index_entry); -} - -void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) { - auto* index_entry = rep->GetEntry(column_family); - rep->write_batch.Delete(column_family, key); - rep->skip_list.Insert(index_entry); -} - -void WriteBatchWithIndex::Delete(const SliceParts& key) { - auto* index_entry = rep->GetEntryWithCfId(0); - rep->write_batch.Delete(key); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } int WriteBatchEntryComparator::operator()( @@ -298,14 +351,7 @@ int WriteBatchEntryComparator::operator()( key2 = *(entry2->search_key); } - int cmp; - auto comparator_for_cf = cf_comparator_map_.find(entry1->column_family); - if (comparator_for_cf != cf_comparator_map_.end()) { - cmp = comparator_for_cf->second->Compare(key1, key2); - } else { - cmp = default_comparator_->Compare(key1, key2); - } - + int cmp = CompareKey(entry1->column_family, key1, key2); if (cmp != 0) { return cmp; } else if (entry1->offset > entry2->offset) { @@ -316,4 +362,15 @@ int WriteBatchEntryComparator::operator()( return 0; } +int WriteBatchEntryComparator::CompareKey(uint32_t column_family, + const Slice& key1, + const Slice& key2) const { + auto comparator_for_cf = cf_comparator_map_.find(column_family); + if (comparator_for_cf != cf_comparator_map_.end()) { + return comparator_for_cf->second->Compare(key1, key2); + } else { + return default_comparator_->Compare(key1, key2); + } +} + } // namespace rocksdb diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 1152c7b88..b3dbdaa68 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -318,6 +318,82 @@ TEST(WriteBatchWithIndexTest, TestComparatorForCF) { } } +TEST(WriteBatchWithIndexTest, TestOverwriteKey) { + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + batch.Put(&cf1, "ddd", ""); + batch.Merge(&cf1, "ddd", ""); + batch.Delete(&cf1, "ddd"); + batch.Put(&cf2, "aaa", ""); + batch.Delete(&cf2, "aaa"); + batch.Put(&cf2, "aaa", "aaa"); + batch.Put(&cf2, "eee", "eee"); + batch.Put(&cf1, "ccc", ""); + batch.Put(&reverse_cf, "a11", ""); + batch.Delete(&cf1, "ccc"); + batch.Put(&reverse_cf, "a33", "a33"); + batch.Put(&reverse_cf, "a11", "a11"); + batch.Delete(&reverse_cf, "a33"); + + { + std::unique_ptr iter(batch.NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + 
ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&cf2)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } +} + } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From 4f65fbd1976b826476c9419727c9557eb9df1b50 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 10 Oct 2014 16:11:40 -0700 Subject: [PATCH 244/829] WriteBatchWithIndex's iterator to support SeekToFirst(), SeekToLast() and Prev() Summary: Support SeekToFirst(), SeekToLast() and Prev() in WBWIIterator, returned by WriteBatchWithIndex::NewIterator(). Test Plan: Write unit test cases to cover the case. 
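A rough usage sketch for the navigation methods this patch adds to WBWIIterator (SeekToFirst/SeekToLast/Prev, alongside the existing Seek/Next); the function and variable names are illustrative:

  #include <memory>

  #include "rocksdb/utilities/write_batch_with_index.h"

  void ScanBatchBothWays(rocksdb::WriteBatchWithIndex* batch) {
    std::unique_ptr<rocksdb::WBWIIterator> iter(batch->NewIterator());

    // Forward scan over every indexed update in the batch.
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      const rocksdb::WriteEntry& entry = iter->Entry();
      // entry.type is kPutRecord, kMergeRecord or kDeleteRecord;
      // entry.key / entry.value reference data in the underlying WriteBatch.
      (void)entry;
    }

    // Backward scan using the newly added SeekToLast() and Prev().
    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
      // ...
    }
  }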
Reviewers: ljin, igor Reviewed By: igor Subscribers: rven, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D24765 --- .../utilities/write_batch_with_index.h | 28 +++-- .../write_batch_with_index.cc | 24 +++++ .../write_batch_with_index_test.cc | 102 +++++++++++++++--- 3 files changed, 127 insertions(+), 27 deletions(-) diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 4aafacaf5..f31c86ea1 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -38,10 +38,16 @@ class WBWIIterator { virtual bool Valid() const = 0; + virtual void SeekToFirst() = 0; + + virtual void SeekToLast() = 0; + virtual void Seek(const Slice& key) = 0; virtual void Next() = 0; + virtual void Prev() = 0; + virtual const WriteEntry& Entry() const = 0; virtual Status status() const = 0; @@ -71,29 +77,29 @@ class WriteBatchWithIndex { WriteBatch* GetWriteBatch(); - virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value); + void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); - virtual void Put(const Slice& key, const Slice& value); + void Put(const Slice& key, const Slice& value); - virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value); + void Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); - virtual void Merge(const Slice& key, const Slice& value); + void Merge(const Slice& key, const Slice& value); - virtual void PutLogData(const Slice& blob); + void PutLogData(const Slice& blob); - virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key); - virtual void Delete(const Slice& key); + void Delete(ColumnFamilyHandle* column_family, const Slice& key); + void Delete(const Slice& key); // Create an iterator of a column family. User can call iterator.Seek() to // search to the next entry of or after a key. Keys will be iterated in the // order given by index_comparator. For multiple updates on the same key, // each update will be returned as a separate entry, in the order of update // time. - virtual WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); // Create an iterator of the default column family. 
- virtual WBWIIterator* NewIterator(); + WBWIIterator* NewIterator(); private: struct Rep; diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 8cc5686f6..0b460cd15 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -76,6 +76,25 @@ class WBWIIteratorImpl : public WBWIIterator { virtual bool Valid() const override { return valid_; } + virtual void SeekToFirst() { + valid_ = true; + WriteBatchIndexEntry search_entry(nullptr, column_family_id_); + skip_list_iter_.Seek(&search_entry); + ReadEntry(); + } + + virtual void SeekToLast() { + valid_ = true; + WriteBatchIndexEntry search_entry(nullptr, column_family_id_ + 1); + skip_list_iter_.Seek(&search_entry); + if (!skip_list_iter_.Valid()) { + skip_list_iter_.SeekToLast(); + } else { + skip_list_iter_.Prev(); + } + ReadEntry(); + } + virtual void Seek(const Slice& key) override { valid_ = true; WriteBatchIndexEntry search_entry(&key, column_family_id_); @@ -88,6 +107,11 @@ class WBWIIteratorImpl : public WBWIIterator { ReadEntry(); } + virtual void Prev() override { + skip_list_iter_.Prev(); + ReadEntry(); + } + virtual const WriteEntry& Entry() const override { return current_; } virtual Status status() const override { return status_; } diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index b3dbdaa68..d34380fd7 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -120,18 +120,39 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { // Iterator all keys { std::unique_ptr iter(batch.NewIterator(&data)); - iter->Seek(""); - for (auto pair : data_map) { - for (auto v : pair.second) { + for (int seek_to_first : {0, 1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : data_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + iter->SeekToLast(); + for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); auto& write_entry = iter->Entry(); - ASSERT_EQ(pair.first, write_entry.key.ToString()); - ASSERT_EQ(v->type, write_entry.type); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ((*v)->type, write_entry.type); if (write_entry.type != kDeleteRecord) { - ASSERT_EQ(v->value, write_entry.value.ToString()); + ASSERT_EQ((*v)->value, write_entry.value.ToString()); } - iter->Next(); + iter->Prev(); } } ASSERT_TRUE(!iter->Valid()); @@ -140,18 +161,40 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { // Iterator all indexes { std::unique_ptr iter(batch.NewIterator(&index)); - iter->Seek(""); - for (auto pair : index_map) { - for (auto v : pair.second) { + for (int seek_to_first : {0, 1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : index_map) { + for (auto v : pair.second) { + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + ASSERT_EQ(v->value, write_entry.key.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + iter->SeekToLast(); + for (auto pair = index_map.rbegin(); pair != index_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); auto& write_entry = iter->Entry(); - ASSERT_EQ(pair.first, write_entry.key.ToString()); - if (v->type != kDeleteRecord) { - ASSERT_EQ(v->key, write_entry.value.ToString()); - ASSERT_EQ(v->value, write_entry.key.ToString()); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + if ((*v)->type != kDeleteRecord) { + ASSERT_EQ((*v)->key, write_entry.value.ToString()); + ASSERT_EQ((*v)->value, write_entry.key.ToString()); } - iter->Next(); + iter->Prev(); } } ASSERT_TRUE(!iter->Valid()); @@ -357,7 +400,21 @@ TEST(WriteBatchWithIndexTest, TestOverwriteKey) { { std::unique_ptr iter(batch.NewIterator(&cf2)); - iter->Seek(""); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToFirst(); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("aaa", iter->Entry().key.ToString()); @@ -391,6 +448,19 @@ TEST(WriteBatchWithIndexTest, TestOverwriteKey) { iter->Next(); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Prev(); + + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); } } From 833357402cf1b79c9e31ecff74c6bd3c9dbddff0 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 10 Oct 2014 16:21:34 -0700 Subject: [PATCH 245/829] WriteBatchWithIndex supports an iterator that merge its change with a base iterator. Summary: Add an iterator that combines base_iterator of type Iterator* with delta iterator of type WBWIIterator*. Test Plan: nothing yet. 
work in progress Reviewers: ljin, igor Reviewed By: igor Subscribers: rven, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D24741 --- .../utilities/write_batch_with_index.h | 6 + .../write_batch_with_index.cc | 286 +++++++++++++++++ .../write_batch_with_index_test.cc | 299 ++++++++++++++++++ 3 files changed, 591 insertions(+) diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index f31c86ea1..ee5ec198e 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -12,6 +12,7 @@ #pragma once #include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/write_batch.h" @@ -101,6 +102,11 @@ class WriteBatchWithIndex { // Create an iterator of the default column family. WBWIIterator* NewIterator(); + // Will create a new Iterator that will use WBWIIterator as a delta and + // base_iterator as base + Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, + Iterator* base_iterator); + private: struct Rep; Rep* rep; diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 0b460cd15..adfa5b324 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -4,13 +4,289 @@ // of patent rights can be found in the PATENTS file in the same directory. #include "rocksdb/utilities/write_batch_with_index.h" + +#include + #include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" #include "db/column_family.h" #include "db/skiplist.h" #include "util/arena.h" namespace rocksdb { +// when direction == forward +// * current_at_base_ <=> base_iterator > delta_iterator +// when direction == backwards +// * current_at_base_ <=> base_iterator < delta_iterator +// always: +// * equal_keys_ <=> base_iterator == delta_iterator +class BaseDeltaIterator : public Iterator { + public: + BaseDeltaIterator(Iterator* base_iterator, WBWIIterator* delta_iterator, + const Comparator* comparator) + : forward_(true), + current_at_base_(true), + equal_keys_(false), + status_(Status::OK()), + base_iterator_(base_iterator), + delta_iterator_(delta_iterator), + comparator_(comparator) {} + + virtual ~BaseDeltaIterator() {} + + bool Valid() const override { + return current_at_base_ ? 
BaseValid() : DeltaValid(); + } + + void SeekToFirst() override { + forward_ = true; + base_iterator_->SeekToFirst(); + delta_iterator_->SeekToFirst(); + UpdateCurrent(); + } + + void SeekToLast() override { + forward_ = false; + base_iterator_->SeekToLast(); + delta_iterator_->SeekToLast(); + UpdateCurrent(); + } + + void Seek(const Slice& key) override { + forward_ = true; + base_iterator_->Seek(key); + delta_iterator_->Seek(key); + UpdateCurrent(); + } + + void Next() override { + if (!Valid()) { + status_ = Status::NotSupported("Next() on invalid iterator"); + } + + if (!forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = true; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToFirst(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToFirst(); + } else if (current_at_base_) { + // Change delta from larger than base to smaller + AdvanceDelta(); + } else { + // Change base from larger than delta to smaller + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (Compare() == 0) { + equal_keys_ = true; + } + } + } + Advance(); + } + + void Prev() override { + if (!Valid()) { + status_ = Status::NotSupported("Prev() on invalid iterator"); + } + + if (forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = false; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToLast(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToLast(); + } else if (current_at_base_) { + // Change delta from less advanced than base to more advanced + AdvanceDelta(); + } else { + // Change base from less advanced than delta to more advanced + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (Compare() == 0) { + equal_keys_ = true; + } + } + } + + Advance(); + } + + Slice key() const override { + return current_at_base_ ? base_iterator_->key() + : delta_iterator_->Entry().key; + } + + Slice value() const override { + return current_at_base_ ? 
base_iterator_->value() + : delta_iterator_->Entry().value; + } + + Status status() const { + if (!status_.ok()) { + return status_; + } + if (!base_iterator_->status().ok()) { + return base_iterator_->status(); + } + return delta_iterator_->status(); + } + + private: + // -1 -- delta less advanced than base + // 0 -- delta == base + // 1 -- delta more advanced than base + int Compare() const { + assert(delta_iterator_->Valid() && base_iterator_->Valid()); + int cmp = comparator_->Compare(delta_iterator_->Entry().key, + base_iterator_->key()); + if (forward_) { + return cmp; + } else { + return -cmp; + } + } + bool IsDeltaDelete() { + assert(DeltaValid()); + return delta_iterator_->Entry().type == kDeleteRecord; + } + void AssertInvariants() { +#ifndef NDEBUG + if (!Valid()) { + return; + } + if (!BaseValid()) { + assert(!current_at_base_ && delta_iterator_->Valid()); + return; + } + if (!DeltaValid()) { + assert(current_at_base_ && base_iterator_->Valid()); + return; + } + // we don't support those yet + assert(delta_iterator_->Entry().type != kMergeRecord && + delta_iterator_->Entry().type != kLogDataRecord); + int compare = comparator_->Compare(delta_iterator_->Entry().key, + base_iterator_->key()); + if (forward_) { + // current_at_base -> compare < 0 + assert(!current_at_base_ || compare < 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare >= 0); + } else { + // current_at_base -> compare > 0 + assert(!current_at_base_ || compare > 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare <= 0); + } + // equal_keys_ <=> compare == 0 + assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0)); +#endif + } + + void Advance() { + if (equal_keys_) { + assert(BaseValid() && DeltaValid()); + AdvanceBase(); + AdvanceDelta(); + } else { + if (current_at_base_) { + assert(BaseValid()); + AdvanceBase(); + } else { + assert(DeltaValid()); + AdvanceDelta(); + } + } + UpdateCurrent(); + } + + void AdvanceDelta() { + if (forward_) { + delta_iterator_->Next(); + } else { + delta_iterator_->Prev(); + } + } + void AdvanceBase() { + if (forward_) { + base_iterator_->Next(); + } else { + base_iterator_->Prev(); + } + } + bool BaseValid() const { return base_iterator_->Valid(); } + bool DeltaValid() const { return delta_iterator_->Valid(); } + void UpdateCurrent() { + while (true) { + equal_keys_ = false; + if (!BaseValid()) { + // Base has finished. + if (!DeltaValid()) { + // Finished + return; + } + if (IsDeltaDelete()) { + AdvanceDelta(); + } else { + current_at_base_ = false; + return; + } + } else if (!DeltaValid()) { + // Delta has finished. + current_at_base_ = true; + return; + } else { + int compare = Compare(); + if (compare <= 0) { // delta bigger or equal + if (compare == 0) { + equal_keys_ = true; + } + if (!IsDeltaDelete()) { + current_at_base_ = false; + return; + } + // Delta is less advanced and is delete. 
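+          // A delete record in the delta hides any base entry with the same
+          // user key: the delta entry is skipped here, and when the two keys
+          // compared equal the base iterator is stepped as well, so the
+          // deleted key never surfaces in the merged view.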
+ AdvanceDelta(); + if (equal_keys_) { + AdvanceBase(); + } + } else { + current_at_base_ = true; + return; + } + } + } + + AssertInvariants(); + } + + bool forward_; + bool current_at_base_; + bool equal_keys_; + Status status_; + std::unique_ptr base_iterator_; + std::unique_ptr delta_iterator_; + const Comparator* comparator_; // not owned +}; + class ReadableWriteBatch : public WriteBatch { public: explicit ReadableWriteBatch(size_t reserved_bytes = 0) @@ -298,6 +574,16 @@ WBWIIterator* WriteBatchWithIndex::NewIterator( &(rep->skip_list), &rep->write_batch); } +Iterator* WriteBatchWithIndex::NewIteratorWithBase( + ColumnFamilyHandle* column_family, Iterator* base_iterator) { + if (rep->overwrite_key == false) { + assert(false); + return nullptr; + } + return new BaseDeltaIterator(base_iterator, NewIterator(column_family), + GetColumnFamilyUserComparator(column_family)); +} + void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { rep->SetLastEntryOffset(); diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index d34380fd7..32b45e339 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -464,6 +464,305 @@ TEST(WriteBatchWithIndexTest, TestOverwriteKey) { } } +namespace { +typedef std::map KVMap; + +class KVIter : public Iterator { + public: + explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); } + virtual void Next() { ++iter_; } + virtual void Prev() { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + + private: + const KVMap* const map_; + KVMap::const_iterator iter_; +}; + +void AssertIter(Iterator* iter, const std::string& key, + const std::string& value) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(value, iter->value().ToString()); +} + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} +} // namespace + +TEST(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { + std::vector source_strings = {"a", "b", "c", "d", "e", + "f", "g", "h", "i", "j"}; + for (int rand_seed = 301; rand_seed < 366; rand_seed++) { + Random rnd(rand_seed); + + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + KVMap map; + KVMap merged_map; + for (auto key : source_strings) { + std::string value = key + key; + int type = rnd.Uniform(6); + switch (type) { + case 0: + // only base has it + map[key] = value; + merged_map[key] = value; + break; + case 1: + // only delta has it + batch.Put(&cf1, key, value); + map[key] = value; + merged_map[key] = value; + break; + case 2: + // both 
has it. Delta should win + batch.Put(&cf1, key, value); + map[key] = "wrong_value"; + merged_map[key] = value; + break; + case 3: + // both has it. Delta is delete + batch.Delete(&cf1, key); + map[key] = "wrong_value"; + break; + case 4: + // only delta has it. Delta is delete + batch.Delete(&cf1, key); + map[key] = "wrong_value"; + break; + default: + // Neither iterator has it. + break; + } + } + + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + std::unique_ptr result_iter(new KVIter(&merged_map)); + + bool is_valid = false; + for (int i = 0; i < 128; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd.Uniform(5); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd.Uniform(source_strings.size()); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + default: + assert(type == 4); + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } + } +} + +TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + { + KVMap map; + map["a"] = "aa"; + map["c"] = "cc"; + map["e"] = "ee"; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "e", "ee"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "e", "ee"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "c", "cc"); + + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } + + batch.Put(&cf1, "a", "aa"); + batch.Delete(&cf1, "b"); + batch.Put(&cf1, "c", "cc"); + batch.Put(&cf1, "d", "dd"); + batch.Delete(&cf1, "e"); + + { + KVMap map; + map["b"] = ""; + map["cc"] = "cccc"; + map["f"] = "ff"; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + AssertIter(iter.get(), "f", "ff"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "f", "ff"); + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(!iter->Valid()); + + iter->Seek("c"); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("cb"); + AssertIter(iter.get(), "cc", "cccc"); + + iter->Seek("cc"); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("e"); + AssertIter(iter.get(), "f", "ff"); + + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + + iter->Next(); + AssertIter(iter.get(), "f", "ff"); + } + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("aa"); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("ca"); + AssertIter(iter.get(), "d", "dd"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + } +} } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From 70294c911412c4b8a6484c593b7f0974454d6c93 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 8 Oct 2014 22:45:12 +0200 Subject: [PATCH 246/829] JavaDoc improvements on RocksJava Added some more documentation improvements and readability improvements. --- java/org/rocksdb/CompactionStyle.java | 30 +++++++++++ java/org/rocksdb/CompressionType.java | 13 +++++ java/org/rocksdb/RocksEnv.java | 54 ++++++++++--------- java/org/rocksdb/RocksIterator.java | 51 ++++++++++-------- java/org/rocksdb/StatisticsCollector.java | 8 +-- .../rocksdb/StatisticsCollectorCallback.java | 4 +- 6 files changed, 107 insertions(+), 53 deletions(-) diff --git a/java/org/rocksdb/CompactionStyle.java b/java/org/rocksdb/CompactionStyle.java index ade48358e..76064395c 100644 --- a/java/org/rocksdb/CompactionStyle.java +++ b/java/org/rocksdb/CompactionStyle.java @@ -5,6 +5,31 @@ package org.rocksdb; +/** + * Enum CompactionStyle + * + * RocksDB supports different styles of compaction. Available + * compaction styles can be chosen using this enumeration. + * + *
      + * 1. LEVEL - Level based Compaction style
      + * 2. UNIVERSAL - Universal Compaction Style is a
      + *    compaction style, targeting the use cases requiring lower write
      + *    amplification, trading off read amplification and space
      + *    amplification.
      + * 3. FIFO - FIFO compaction style is the simplest
      + *    compaction strategy. It is suited for keeping event log data with
      + *    very low overhead (query log for example). It periodically deletes
      + *    the old data, so it's basically a TTL compaction style.
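      + *
      + * A minimal selection sketch from Java (assumes an Options instance named
      + * options, as used in RocksDBSample):
      + *   options.setCompactionStyle(CompactionStyle.UNIVERSAL);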
      + * + * @see + * Universal Compaction + * @see + * FIFO Compaction + */ public enum CompactionStyle { LEVEL((byte) 0), UNIVERSAL((byte) 1), @@ -16,6 +41,11 @@ public enum CompactionStyle { value_ = value; } + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ public byte getValue() { return value_; } diff --git a/java/org/rocksdb/CompressionType.java b/java/org/rocksdb/CompressionType.java index f29eccb9b..9c158ccf4 100644 --- a/java/org/rocksdb/CompressionType.java +++ b/java/org/rocksdb/CompressionType.java @@ -5,6 +5,14 @@ package org.rocksdb; +/** + * Enum CompressionType + * + *
      DB contents are stored in a set of blocks, each of which holds a + * sequence of key,value pairs. Each block may be compressed before + * being stored in a file. The following enum describes which + * compression method (if any) is used to compress a block.
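      + *
      + * A minimal selection sketch (assumes the corresponding native compression
      + * library, here snappy, can be loaded at runtime):
      + *   options.setCompressionType(CompressionType.SNAPPY_COMPRESSION);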
      + */ public enum CompressionType { NO_COMPRESSION((byte) 0), SNAPPY_COMPRESSION((byte) 1), @@ -19,6 +27,11 @@ public enum CompressionType { value_ = value; } + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ public byte getValue() { return value_; } diff --git a/java/org/rocksdb/RocksEnv.java b/java/org/rocksdb/RocksEnv.java index ce73ba654..21a4b2777 100644 --- a/java/org/rocksdb/RocksEnv.java +++ b/java/org/rocksdb/RocksEnv.java @@ -6,11 +6,11 @@ package org.rocksdb; /** - * A RocksEnv is an interface used by the rocksdb implementation to access - * operating system functionality like the filesystem etc. + *
      A RocksEnv is an interface used by the rocksdb implementation to access + * operating system functionality like the filesystem etc.
      * - * All Env implementations are safe for concurrent access from - * multiple threads without any external synchronization. + *
      All Env implementations are safe for concurrent access from + * multiple threads without any external synchronization.
      */ public class RocksEnv extends RocksObject { public static final int FLUSH_POOL = 0; @@ -22,35 +22,36 @@ public class RocksEnv extends RocksObject { private static native long getDefaultEnvInternal(); /** - * Returns the default environment suitable for the current operating - * system. + *
      Returns the default environment suitable for the current operating + * system.
      * - * The result of getDefault() is a singleton whose ownership belongs - * to rocksdb c++. As a result, the returned RocksEnv will not + *
      The result of {@see #getDefault()} is a singleton whose ownership + * belongs to rocksdb c++. As a result, the returned RocksEnv will not * have the ownership of its c++ resource, and calling its dispose() - * will be no-op. + * will be no-op.
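      + *
      + * Sketch of the intended use (the returned instance is shared and is not
      + * disposed explicitly):
      + *   RocksEnv defaultEnv = RocksEnv.getDefault();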
      */ public static RocksEnv getDefault() { return default_env_; } /** - * Sets the number of background worker threads of the flush pool - * for this environment. - * default number: 1 + *
      Sets the number of background worker threads of the flush pool + * for this environment.
      + *
      Default number: 1
      */ public RocksEnv setBackgroundThreads(int num) { return setBackgroundThreads(num, FLUSH_POOL); } /** - * Sets the number of background worker threads of the specified thread - * pool for this environment. + *
      Sets the number of background worker threads of the specified thread + * pool for this environment.
      * * @param num the number of threads * @param poolID the id to specified a thread pool. Should be either * FLUSH_POOL or COMPACTION_POOL. - * Default number: 1 + * + *
      Default number: 1
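      + *
      + * Illustrative call, using the FLUSH_POOL constant defined on this class:
      + *   RocksEnv.getDefault().setBackgroundThreads(2, RocksEnv.FLUSH_POOL);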
      */ public RocksEnv setBackgroundThreads(int num, int poolID) { setBackgroundThreads(nativeHandle_, num, poolID); @@ -60,8 +61,8 @@ public class RocksEnv extends RocksObject { long handle, int num, int priority); /** - * Returns the length of the queue associated with the specified - * thread pool. + *
      Returns the length of the queue associated with the specified + * thread pool.
      * * @param poolID the id to specified a thread pool. Should be either * FLUSH_POOL or COMPACTION_POOL. @@ -72,11 +73,13 @@ public class RocksEnv extends RocksObject { private native int getThreadPoolQueueLen(long handle, int poolID); /** - * Package-private constructor that uses the specified native handle - * to construct a RocksEnv. Note that the ownership of the input handle + *
      Package-private constructor that uses the specified native handle + * to construct a RocksEnv.
      + * + *
      Note that the ownership of the input handle * belongs to the caller, and the newly created RocksEnv will not take - * the ownership of the input handle. As a result, calling dispose() - * of the created RocksEnv will be no-op. + * the ownership of the input handle. As a result, calling + * {@see #dispose()} of the created RocksEnv will be no-op.
      */ RocksEnv(long handle) { super(); @@ -85,8 +88,9 @@ public class RocksEnv extends RocksObject { } /** - * The helper function of dispose() which all subclasses of RocksObject - * must implement to release their associated C++ resource. + * The helper function of {@link #dispose()} which all subclasses of + * {@link RocksObject} must implement to release their associated C++ + * resource. */ protected void disposeInternal() { disposeInternal(nativeHandle_); @@ -94,9 +98,9 @@ public class RocksEnv extends RocksObject { private native void disposeInternal(long handle); /** - * The static default RocksEnv. The ownership of its native handle + *
      The static default RocksEnv. The ownership of its native handle * belongs to rocksdb c++ and is not able to be released on the Java - * side. + * side.
      */ static RocksEnv default_env_; } diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index 9ef2e8c24..2adff26cc 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -6,15 +6,17 @@ package org.rocksdb; /** - * An iterator yields a sequence of key/value pairs from a source. - * The following class defines the interface. Multiple implementations + *
      An iterator yields a sequence of key/value pairs from a source. + * The following class defines the interface. Multiple implementations * are provided by this library. In particular, iterators are provided - * to access the contents of a Table or a DB. + * to access the contents of a Table or a DB.
      * - * Multiple threads can invoke const methods on an RocksIterator without + *
      Multiple threads can invoke const methods on an RocksIterator without * external synchronization, but if any of the threads may call a * non-const method, all threads accessing the same RocksIterator must use - * external synchronization. + * external synchronization.
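      + *
      + * A minimal full scan, sketched (assumes an already opened RocksDB instance
      + * named db exposing a newIterator() factory):
      + *   RocksIterator it = db.newIterator();
      + *   for (it.seekToFirst(); it.isValid(); it.next()) {
      + *     byte[] key = it.key();
      + *     byte[] value = it.value();
      + *   }
      + *   it.dispose();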
      + * + * @see org.rocksdb.RocksObject */ public class RocksIterator extends RocksObject { public RocksIterator(long nativeHandle) { @@ -25,6 +27,7 @@ public class RocksIterator extends RocksObject { /** * An iterator is either positioned at a key/value pair, or * not valid. This method returns true iff the iterator is valid. + * * @return true if iterator is valid. */ public boolean isValid() { @@ -43,7 +46,7 @@ public class RocksIterator extends RocksObject { /** * Position at the last key in the source. The iterator is - * Valid() after this call iff the source is not empty. + * valid after this call iff the source is not empty. */ public void seekToLast() { assert(isInitialized()); @@ -51,9 +54,10 @@ public class RocksIterator extends RocksObject { } /** - * Moves to the next entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the last entry in the source. - * REQUIRES: Valid() + *
      Moves to the next entry in the source. After this call, Valid() is + * true iff the iterator was not positioned at the last entry in the source.
      + * + *
      REQUIRES: {@link #isValid()}
      */ public void next() { assert(isInitialized()); @@ -61,9 +65,10 @@ public class RocksIterator extends RocksObject { } /** - * Moves to the previous entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the first entry in source. - * REQUIRES: Valid() + *
      Moves to the previous entry in the source. After this call, Valid() is + * true iff the iterator was not positioned at the first entry in source.
      + * + *
      REQUIRES: {@link #isValid()}
      */ public void prev() { assert(isInitialized()); @@ -71,10 +76,12 @@ public class RocksIterator extends RocksObject { } /** - * Return the key for the current entry. The underlying storage for + *
      Return the key for the current entry. The underlying storage for * the returned slice is valid only until the next modification of - * the iterator. - * REQUIRES: Valid() + * the iterator.
      + * + *
      REQUIRES: {@link #isValid()}
      + * * @return key for the current entry. */ public byte[] key() { @@ -83,10 +90,11 @@ public class RocksIterator extends RocksObject { } /** - * Return the value for the current entry. The underlying storage for + *
      Return the value for the current entry. The underlying storage for * the returned slice is valid only until the next modification of - * the iterator. - * REQUIRES: !AtEnd() && !AtStart() + * the iterator.
      + * + *
      REQUIRES: !AtEnd() && !AtStart()
      * @return value for the current entry. */ public byte[] value() { @@ -95,9 +103,9 @@ public class RocksIterator extends RocksObject { } /** - * Position at the first key in the source that at or past target - * The iterator is Valid() after this call iff the source contains - * an entry that comes at or past target. + *
      Position at the first key in the source that is at or past target. + * The iterator is valid after this call iff the source contains + * an entry that comes at or past target.
      */ public void seek(byte[] target) { assert(isInitialized()); @@ -109,6 +117,7 @@ public class RocksIterator extends RocksObject { * If non-blocking IO is requested and this operation cannot be * satisfied without doing some IO, then this returns Status::Incomplete(). * + * @throws org.rocksdb.RocksDBException */ public void status() throws RocksDBException { assert(isInitialized()); diff --git a/java/org/rocksdb/StatisticsCollector.java b/java/org/rocksdb/StatisticsCollector.java index 965637697..524756a6c 100644 --- a/java/org/rocksdb/StatisticsCollector.java +++ b/java/org/rocksdb/StatisticsCollector.java @@ -13,13 +13,13 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; /** - * Helper class to collect DB statistics periodically at a period specified in + *
      Helper class to collect DB statistics periodically at a period specified in * constructor. Callback function (provided in constructor) is called with - * every statistics collection. + * every statistics collection.
      * - * Caller should call start() to start statistics collection. Shutdown() should + *
      Caller should call start() to start statistics collection. Shutdown() should * be called to stop stats collection and should be called before statistics ( - * provided in constructor) reference has been disposed. + * provided in constructor) reference has been disposed.
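      + *
      + * Sketch, with names borrowed from StatisticsCollectorTest later in this
      + * series (assumes statistics were enabled on the Options):
      + *   StatisticsCollector collector = new StatisticsCollector(
      + *       Collections.singletonList(new StatsCollectorInput(stats, callback)), 100);
      + *   collector.start();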
      */ public class StatisticsCollector { private final List _statsCollectorInputList; diff --git a/java/org/rocksdb/StatisticsCollectorCallback.java b/java/org/rocksdb/StatisticsCollectorCallback.java index b8d7a24ec..2ce92c5ee 100644 --- a/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/java/org/rocksdb/StatisticsCollectorCallback.java @@ -14,9 +14,7 @@ package org.rocksdb; * StatisticsCollector references, then its the responsibility of the * user to make StatisticsCollectorCallback's implementation thread-safe. * - * @param tickerType - * @param tickerCount -*/ + */ public interface StatisticsCollectorCallback { /** * Callback function to get ticker values. From 16d2ebdbcf59438093b4493e7dd3212bc04f51fd Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 8 Oct 2014 22:50:54 +0200 Subject: [PATCH 247/829] Minor adjustment to prevent two warnings --- java/org/rocksdb/RocksEnv.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/RocksEnv.java b/java/org/rocksdb/RocksEnv.java index 21a4b2777..a9b01ab72 100644 --- a/java/org/rocksdb/RocksEnv.java +++ b/java/org/rocksdb/RocksEnv.java @@ -25,7 +25,7 @@ public class RocksEnv extends RocksObject { *
      Returns the default environment suitable for the current operating * system.
      * - *
      The result of {@see #getDefault()} is a singleton whose ownership + *
      The result of {@code getDefault()} is a singleton whose ownership * belongs to rocksdb c++. As a result, the returned RocksEnv will not * have the ownership of its c++ resource, and calling its dispose() * will be no-op.
      @@ -79,7 +79,7 @@ public class RocksEnv extends RocksObject { *
      Note that the ownership of the input handle * belongs to the caller, and the newly created RocksEnv will not take * the ownership of the input handle. As a result, calling - * {@see #dispose()} of the created RocksEnv will be no-op.
      + * {@code dispose()} of the created RocksEnv will be no-op.
      */ RocksEnv(long handle) { super(); From 4f5a6872541ec1aaf04e1b6f34ce2b774fa2e83e Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 9 Oct 2014 23:16:41 +0200 Subject: [PATCH 248/829] 32-Bit RocksJava resolution for jlong overflows Summary: This pull request solves the jlong overflow problem on 32-Bit machines as described in https://github.com/facebook/rocksdb/issues/278: 1. There is a new org.rocksdb.test.PlatformRandomHelper to assist in getting random values. For 32 Bit the getLong method is overriden by xpromaches code above. For 64 Bit it behaves as is. 2. The detection should be cross-platform (Windows is supported though it is not ported completely yet). 3. Every JNI method which sets jlong values must check if the value fits into size_t. If it overflows size_t a InvalidArgument Status object will be returned. If its ok a OK Status will be returned. 4. Setters which have this check will throw a RocksDBException if its no OK Status. Additionally some other parts of code were corrected using the wrong type casts. Test Plan: make rocksdbjava make jtest Differential Revision: https://reviews.facebook.net/D24531 --- java/RocksDBSample.java | 85 +++++++++--------- .../rocksdb/HashLinkedListMemTableConfig.java | 6 +- .../rocksdb/HashSkipListMemTableConfig.java | 6 +- java/org/rocksdb/MemTableConfig.java | 3 +- java/org/rocksdb/Options.java | 56 ++++++++---- java/org/rocksdb/benchmark/DbBenchmark.java | 2 +- java/org/rocksdb/test/OptionsTest.java | 87 +++++++++++++------ .../rocksdb/test/PlatformRandomHelper.java | 54 ++++++++++++ .../rocksdb/test/StatisticsCollectorTest.java | 4 +- java/rocksjni/memtablejni.cc | 31 +++++-- java/rocksjni/options.cc | 81 ++++++++++++----- java/rocksjni/portal.h | 12 ++- java/rocksjni/ratelimiterjni.cc | 4 +- java/rocksjni/restorejni.cc | 9 +- java/rocksjni/rocksjni.cc | 19 ++-- java/rocksjni/write_batch.cc | 2 +- 16 files changed, 322 insertions(+), 139 deletions(-) create mode 100644 java/org/rocksdb/test/PlatformRandomHelper.java diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index c9a30476a..9eff06037 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -35,13 +35,18 @@ public class RocksDBSample { assert(db == null); } - options.setCreateIfMissing(true) - .createStatistics() - .setWriteBufferSize(8 * SizeUnit.KB) - .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) - .setCompressionType(CompressionType.SNAPPY_COMPRESSION) - .setCompactionStyle(CompactionStyle.UNIVERSAL); + try { + options.setCreateIfMissing(true) + .createStatistics() + .setWriteBufferSize(8 * SizeUnit.KB) + .setMaxWriteBufferNumber(3) + .setMaxBackgroundCompactions(10) + .setCompressionType(CompressionType.SNAPPY_COMPRESSION) + .setCompactionStyle(CompactionStyle.UNIVERSAL); + } catch (RocksDBException e) { + assert(false); + } + Statistics stats = options.statisticsPtr(); assert(options.createIfMissing() == true); @@ -50,36 +55,38 @@ public class RocksDBSample { assert(options.maxBackgroundCompactions() == 10); assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); - - assert(options.memTableFactoryName().equals("SkipListFactory")); - options.setMemTableConfig( - new HashSkipListMemTableConfig() - .setHeight(4) - .setBranchingFactor(4) - .setBucketCount(2000000)); - assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); - - options.setMemTableConfig( - new HashLinkedListMemTableConfig() - .setBucketCount(100000)); - 
assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); - - options.setMemTableConfig( - new VectorMemTableConfig().setReservedSize(10000)); - assert(options.memTableFactoryName().equals("VectorRepFactory")); - - options.setMemTableConfig(new SkipListMemTableConfig()); - assert(options.memTableFactoryName().equals("SkipListFactory")); - - options.setTableFormatConfig(new PlainTableConfig()); - // Plain-Table requires mmap read - options.setAllowMmapReads(true); - assert(options.tableFactoryName().equals("PlainTable")); - - options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, - 10000, 10)); - options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); - + try { + assert(options.memTableFactoryName().equals("SkipListFactory")); + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setHeight(4) + .setBranchingFactor(4) + .setBucketCount(2000000)); + assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); + + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(100000)); + assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); + + options.setMemTableConfig( + new VectorMemTableConfig().setReservedSize(10000)); + assert(options.memTableFactoryName().equals("VectorRepFactory")); + + options.setMemTableConfig(new SkipListMemTableConfig()); + assert(options.memTableFactoryName().equals("SkipListFactory")); + + options.setTableFormatConfig(new PlainTableConfig()); + // Plain-Table requires mmap read + options.setAllowMmapReads(true); + assert(options.tableFactoryName().equals("PlainTable")); + + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, + 10000, 10)); + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); + } catch (RocksDBException e) { + assert(false); + } Filter bloomFilter = new BloomFilter(10); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) @@ -91,7 +98,7 @@ public class RocksDBSample { .setHashIndexAllowCollision(false) .setBlockCacheCompressedSize(64 * SizeUnit.KB) .setBlockCacheCompressedNumShardBits(10); - + assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); assert(table_options.cacheNumShardBits() == 6); assert(table_options.blockSizeDeviation() == 5); @@ -100,7 +107,7 @@ public class RocksDBSample { assert(table_options.hashIndexAllowCollision() == false); assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); assert(table_options.blockCacheCompressedNumShardBits() == 10); - + options.setTableFormatConfig(table_options); assert(options.tableFactoryName().equals("BlockBasedTable")); diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java index 24fcd8b52..381a16f49 100644 --- a/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ b/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -42,11 +42,13 @@ public class HashLinkedListMemTableConfig extends MemTableConfig { return bucketCount_; } - @Override protected long newMemTableFactoryHandle() { + @Override protected long newMemTableFactoryHandle() + throws RocksDBException { return newMemTableFactoryHandle(bucketCount_); } - private native long newMemTableFactoryHandle(long bucketCount); + private native long newMemTableFactoryHandle(long bucketCount) + throws RocksDBException; private long bucketCount_; } diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java index 
74fb0dba2..100f16c82 100644 --- a/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -83,13 +83,15 @@ public class HashSkipListMemTableConfig extends MemTableConfig { return branchingFactor_; } - @Override protected long newMemTableFactoryHandle() { + @Override protected long newMemTableFactoryHandle() + throws RocksDBException { return newMemTableFactoryHandle( bucketCount_, height_, branchingFactor_); } private native long newMemTableFactoryHandle( - long bucketCount, int height, int branchingFactor); + long bucketCount, int height, int branchingFactor) + throws RocksDBException; private long bucketCount_; private int branchingFactor_; diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java index a69b1008f..deb74f185 100644 --- a/java/org/rocksdb/MemTableConfig.java +++ b/java/org/rocksdb/MemTableConfig.java @@ -23,5 +23,6 @@ public abstract class MemTableConfig { * * @see Options#setMemTableConfig(MemTableConfig) */ - abstract protected long newMemTableFactoryHandle(); + abstract protected long newMemTableFactoryHandle() + throws RocksDBException; } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index b0989363b..1ad8f9489 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -118,8 +118,10 @@ public class Options extends RocksObject { * @param writeBufferSize the size of write buffer. * @return the instance of the current Options. * @see org.rocksdb.RocksDB#open(Options, String) + * @throws RocksDBException */ - public Options setWriteBufferSize(long writeBufferSize) { + public Options setWriteBufferSize(long writeBufferSize) + throws RocksDBException { assert(isInitialized()); setWriteBufferSize(nativeHandle_, writeBufferSize); return this; @@ -561,13 +563,16 @@ public class Options extends RocksObject { * * @param maxLogFileSize the maximum size of a info log file. * @return the reference to the current option. + * @throws RocksDBException */ - public Options setMaxLogFileSize(long maxLogFileSize) { + public Options setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException { assert(isInitialized()); setMaxLogFileSize(nativeHandle_, maxLogFileSize); return this; } - private native void setMaxLogFileSize(long handle, long maxLogFileSize); + private native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws RocksDBException; /** * Returns the time interval for the info log file to roll (in seconds). @@ -591,14 +596,16 @@ public class Options extends RocksObject { * * @param logFileTimeToRoll the time interval in seconds. * @return the reference to the current option. + * @throws RocksDBException */ - public Options setLogFileTimeToRoll(long logFileTimeToRoll) { + public Options setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException{ assert(isInitialized()); setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); return this; } private native void setLogFileTimeToRoll( - long handle, long logFileTimeToRoll); + long handle, long logFileTimeToRoll) throws RocksDBException; /** * Returns the maximum number of info log files to be kept. @@ -618,13 +625,16 @@ public class Options extends RocksObject { * * @param keepLogFileNum the maximum number of info log files to be kept. * @return the reference to the current option. 
+ * @throws RocksDBException */ - public Options setKeepLogFileNum(long keepLogFileNum) { + public Options setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException{ assert(isInitialized()); setKeepLogFileNum(nativeHandle_, keepLogFileNum); return this; } - private native void setKeepLogFileNum(long handle, long keepLogFileNum); + private native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws RocksDBException; /** * Manifest file is rolled over on reaching this limit. @@ -844,14 +854,16 @@ public class Options extends RocksObject { * * @param size the size in byte * @return the reference to the current option. + * @throws RocksDBException */ - public Options setManifestPreallocationSize(long size) { + public Options setManifestPreallocationSize(long size) + throws RocksDBException { assert(isInitialized()); setManifestPreallocationSize(nativeHandle_, size); return this; } private native void setManifestPreallocationSize( - long handle, long size); + long handle, long size) throws RocksDBException; /** * Data being read from file storage may be buffered in the OS @@ -1110,8 +1122,10 @@ public class Options extends RocksObject { * * @param config the mem-table config. * @return the instance of the current Options. + * @throws RocksDBException */ - public Options setMemTableConfig(MemTableConfig config) { + public Options setMemTableConfig(MemTableConfig config) + throws RocksDBException { setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } @@ -1123,6 +1137,7 @@ public class Options extends RocksObject { * * @param config rate limiter config. * @return the instance of the current Options. + * @throws RocksDBException */ public Options setRateLimiterConfig(RateLimiterConfig config) { setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); @@ -1768,13 +1783,15 @@ public class Options extends RocksObject { * * @param arenaBlockSize the size of an arena block * @return the reference to the current option. + * @throws RocksDBException */ - public Options setArenaBlockSize(long arenaBlockSize) { + public Options setArenaBlockSize(long arenaBlockSize) + throws RocksDBException { setArenaBlockSize(nativeHandle_, arenaBlockSize); return this; } private native void setArenaBlockSize( - long handle, long arenaBlockSize); + long handle, long arenaBlockSize) throws RocksDBException; /** * Disable automatic compactions. Manual compactions can still @@ -1977,13 +1994,15 @@ public class Options extends RocksObject { * @param inplaceUpdateNumLocks the number of locks used for * inplace updates. * @return the reference to the current option. + * @throws RocksDBException */ - public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) { + public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException { setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); return this; } private native void setInplaceUpdateNumLocks( - long handle, long inplaceUpdateNumLocks); + long handle, long inplaceUpdateNumLocks) throws RocksDBException; /** * Returns the number of bits used in the prefix bloom filter. @@ -2108,13 +2127,15 @@ public class Options extends RocksObject { * * @param maxSuccessiveMerges the maximum number of successive merges. * @return the reference to the current option. 
+ * @throws RocksDBException */ - public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) { + public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException { setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); return this; } private native void setMaxSuccessiveMerges( - long handle, long maxSuccessiveMerges); + long handle, long maxSuccessiveMerges) throws RocksDBException; /** * The minimum number of write buffers that will be merged together @@ -2204,7 +2225,8 @@ public class Options extends RocksObject { private native void disposeInternal(long handle); private native void setCreateIfMissing(long handle, boolean flag); private native boolean createIfMissing(long handle); - private native void setWriteBufferSize(long handle, long writeBufferSize); + private native void setWriteBufferSize(long handle, long writeBufferSize) + throws RocksDBException; private native long writeBufferSize(long handle); private native void setMaxWriteBufferNumber( long handle, int maxWriteBufferNumber); diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 612fdaf28..d3d9f8c58 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -489,7 +489,7 @@ public class DbBenchmark { options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal)); } - private void prepareOptions(Options options) { + private void prepareOptions(Options options) throws RocksDBException { if (!useExisting_) { options.setCreateIfMissing(true); } else { diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index d3abb48cd..9f14b40d9 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -7,15 +7,19 @@ package org.rocksdb.test; import java.util.Random; import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; import org.rocksdb.Options; +import org.rocksdb.test.PlatformRandomHelper; public class OptionsTest { + static { RocksDB.loadLibrary(); } public static void main(String[] args) { Options opt = new Options(); - Random rand = new Random(); + Random rand = PlatformRandomHelper. 
+ getPlatformSpecificRandomFactory(); { // CreateIfMissing test boolean boolValue = rand.nextBoolean(); opt.setCreateIfMissing(boolValue); @@ -83,21 +87,34 @@ public class OptionsTest { } { // MaxLogFileSize test - long longValue = rand.nextLong(); - opt.setMaxLogFileSize(longValue); - assert(opt.maxLogFileSize() == longValue); + try { + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assert(opt.maxLogFileSize() == longValue); + } catch (RocksDBException e) { + System.out.println(e.getMessage()); + assert(false); + } } { // LogFileTimeToRoll test - long longValue = rand.nextLong(); - opt.setLogFileTimeToRoll(longValue); - assert(opt.logFileTimeToRoll() == longValue); + try { + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assert(opt.logFileTimeToRoll() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // KeepLogFileNum test - long longValue = rand.nextLong(); - opt.setKeepLogFileNum(longValue); - assert(opt.keepLogFileNum() == longValue); + try { + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assert(opt.keepLogFileNum() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // MaxManifestFileSize test @@ -125,9 +142,13 @@ public class OptionsTest { } { // ManifestPreallocationSize test - long longValue = rand.nextLong(); - opt.setManifestPreallocationSize(longValue); - assert(opt.manifestPreallocationSize() == longValue); + try { + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assert(opt.manifestPreallocationSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // AllowOsBuffer test @@ -185,9 +206,13 @@ public class OptionsTest { } { // WriteBufferSize test - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert(opt.writeBufferSize() == longValue); + try { + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert(opt.writeBufferSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // MaxWriteBufferNumber test @@ -293,9 +318,13 @@ public class OptionsTest { } { // ArenaBlockSize test - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert(opt.arenaBlockSize() == longValue); + try { + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert(opt.arenaBlockSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // DisableAutoCompactions test @@ -335,9 +364,13 @@ public class OptionsTest { } { // InplaceUpdateNumLocks test - long longValue = rand.nextLong(); - opt.setInplaceUpdateNumLocks(longValue); - assert(opt.inplaceUpdateNumLocks() == longValue); + try { + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert(opt.inplaceUpdateNumLocks() == longValue); + } catch (RocksDBException e) { + assert(false); + } } { // MemtablePrefixBloomBits test @@ -359,9 +392,13 @@ public class OptionsTest { } { // MaxSuccessiveMerges test - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert(opt.maxSuccessiveMerges() == longValue); + try { + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert(opt.maxSuccessiveMerges() == longValue); + } catch (RocksDBException e){ + assert(false); + } } { // MinPartialMergeOperands test diff --git a/java/org/rocksdb/test/PlatformRandomHelper.java b/java/org/rocksdb/test/PlatformRandomHelper.java new file mode 100644 index 000000000..b0ef8d8a6 --- /dev/null 
+++ b/java/org/rocksdb/test/PlatformRandomHelper.java @@ -0,0 +1,54 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import java.util.Random; + +/** + * Helper class to get the appropriate Random class instance dependent + * on the current platform architecture (32bit vs 64bit) + */ +public class PlatformRandomHelper { + /** + * Determine if OS is 32-Bit/64-Bit + */ + public static boolean isOs64Bit(){ + boolean is64Bit = false; + if (System.getProperty("os.name").contains("Windows")) { + is64Bit = (System.getenv("ProgramFiles(x86)") != null); + } else { + is64Bit = (System.getProperty("os.arch").indexOf("64") != -1); + } + return is64Bit; + } + + /** + * Factory to get a platform specific Random instance + */ + public static Random getPlatformSpecificRandomFactory(){ + if (isOs64Bit()) { + return new Random(); + } + return new Random32Bit(); + } + + /** + * Random32Bit is a class which overrides {@code nextLong} to + * provide random numbers which fit in size_t. This workaround + * is necessary because there is no unsigned_int < Java 8 + */ + private static class Random32Bit extends Random { + @Override + public long nextLong(){ + return this.nextInt(Integer.MAX_VALUE); + } + } + + /** + * Utility class constructor + */ + private PlatformRandomHelper() { } +} diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java index e497d14df..edbf693e4 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/org/rocksdb/test/StatisticsCollectorTest.java @@ -14,7 +14,7 @@ public class StatisticsCollectorTest { RocksDB.loadLibrary(); } - public static void main(String[] args) + public static void main(String[] args) throws InterruptedException, RocksDBException { Options opt = new Options().createStatistics().setCreateIfMissing(true); Statistics stats = opt.statisticsPtr(); @@ -23,7 +23,7 @@ public class StatisticsCollectorTest { StatsCallbackMock callback = new StatsCallbackMock(); StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); - + StatisticsCollector statsCollector = new StatisticsCollector( Collections.singletonList(statsInput), 100); statsCollector.start(); diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index 9b0dc252c..4be03d491 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -20,10 +20,15 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count, jint jheight, jint jbranching_factor) { - return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - rocksdb::jlong_to_size_t(jbucket_count), - static_cast(jheight), - static_cast(jbranching_factor))); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); + if (s.ok()) { + return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( + static_cast(jbucket_count), + static_cast(jheight), + static_cast(jbranching_factor))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } /* @@ -33,8 +38,13 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count) { - return 
reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - rocksdb::jlong_to_size_t(jbucket_count))); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); + if (s.ok()) { + return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( + static_cast(jbucket_count))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } /* @@ -44,8 +54,13 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( */ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jreserved_size) { - return reinterpret_cast(new rocksdb::VectorRepFactory( - rocksdb::jlong_to_size_t(jreserved_size))); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jreserved_size); + if (s.ok()) { + return reinterpret_cast(new rocksdb::VectorRepFactory( + static_cast(jreserved_size))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } /* diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index a8be5af8b..8d3cb37e0 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -71,7 +71,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing( */ void Java_org_rocksdb_Options_setBuiltinComparator( JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { - switch (builtinComparator){ + switch (builtinComparator) { case 1: reinterpret_cast(jhandle)->comparator = rocksdb::ReverseBytewiseComparator(); @@ -90,11 +90,15 @@ void Java_org_rocksdb_Options_setBuiltinComparator( */ void Java_org_rocksdb_Options_setWriteBufferSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { - reinterpret_cast(jhandle)->write_buffer_size = - rocksdb::jlong_to_size_t(jwrite_buffer_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + if (s.ok()) { + reinterpret_cast(jhandle)->write_buffer_size = + jwrite_buffer_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } - /* * Class: org_rocksdb_Options * Method: writeBufferSize @@ -382,8 +386,13 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( */ void Java_org_rocksdb_Options_setMaxLogFileSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { - reinterpret_cast(jhandle)->max_log_file_size = - rocksdb::jlong_to_size_t(max_log_file_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + if (s.ok()) { + reinterpret_cast(jhandle)->max_log_file_size = + max_log_file_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -403,8 +412,14 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll( */ void Java_org_rocksdb_Options_setLogFileTimeToRoll( JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { - reinterpret_cast(jhandle)->log_file_time_to_roll = - rocksdb::jlong_to_size_t(log_file_time_to_roll); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + log_file_time_to_roll); + if (s.ok()) { + reinterpret_cast(jhandle)->log_file_time_to_roll = + log_file_time_to_roll; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -424,8 +439,13 @@ jlong Java_org_rocksdb_Options_keepLogFileNum( */ void Java_org_rocksdb_Options_setKeepLogFileNum( JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { - reinterpret_cast(jhandle)->keep_log_file_num = - rocksdb::jlong_to_size_t(keep_log_file_num); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->keep_log_file_num = + keep_log_file_num; + } else { + 
rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -542,7 +562,7 @@ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( rocksdb::NewFixedPrefixTransform( - rocksdb::jlong_to_size_t(jprefix_length))); + static_cast(jprefix_length))); } /* @@ -605,8 +625,13 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( */ void Java_org_rocksdb_Options_setManifestPreallocationSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { - reinterpret_cast(jhandle)->manifest_preallocation_size = - rocksdb::jlong_to_size_t(preallocation_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + if (s.ok()) { + reinterpret_cast(jhandle)->manifest_preallocation_size = + preallocation_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1256,8 +1281,13 @@ jlong Java_org_rocksdb_Options_arenaBlockSize( */ void Java_org_rocksdb_Options_setArenaBlockSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { - reinterpret_cast(jhandle)->arena_block_size = - rocksdb::jlong_to_size_t(jarena_block_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + if (s.ok()) { + reinterpret_cast(jhandle)->arena_block_size = + jarena_block_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1420,9 +1450,14 @@ jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( JNIEnv* env, jobject jobj, jlong jhandle, jlong jinplace_update_num_locks) { - reinterpret_cast( - jhandle)->inplace_update_num_locks = - rocksdb::jlong_to_size_t(jinplace_update_num_locks); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jinplace_update_num_locks); + if (s.ok()) { + reinterpret_cast(jhandle)->inplace_update_num_locks = + jinplace_update_num_locks; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1512,8 +1547,14 @@ jlong Java_org_rocksdb_Options_maxSuccessiveMerges( void Java_org_rocksdb_Options_setMaxSuccessiveMerges( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_successive_merges) { - reinterpret_cast(jhandle)->max_successive_merges = - rocksdb::jlong_to_size_t(jmax_successive_merges); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jmax_successive_merges); + if (s.ok()) { + reinterpret_cast(jhandle)->max_successive_merges = + jmax_successive_merges; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 4c7a8b9b9..374d20b0b 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -14,14 +14,18 @@ #include #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" namespace rocksdb { -inline size_t jlong_to_size_t(const jlong& jvalue) { - return static_cast(jvalue) <= - static_cast(std::numeric_limits::max()) ? 
- static_cast(jvalue) : std::numeric_limits::max(); +// detect if jlong overflows size_t +inline Status check_if_jlong_fits_size_t(const jlong& jvalue) { + Status s = Status::OK(); + if (static_cast(jvalue) > std::numeric_limits::max()) { + s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); + } + return s; } // The portal class for org.rocksdb.RocksDB diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc index 5413978a0..ab6160e0d 100644 --- a/java/rocksjni/ratelimiterjni.cc +++ b/java/rocksjni/ratelimiterjni.cc @@ -18,7 +18,7 @@ jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle( JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second, jlong jrefill_period_micros, jint jfairness) { return reinterpret_cast(rocksdb::NewGenericRateLimiter( - rocksdb::jlong_to_size_t(jrate_bytes_per_second), - rocksdb::jlong_to_size_t(jrefill_period_micros), + static_cast(jrate_bytes_per_second), + static_cast(jrefill_period_micros), static_cast(jfairness))); } diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 942e707e6..bd1734010 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "include/org_rocksdb_RestoreOptions.h" @@ -72,7 +71,7 @@ void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromBackup0(JNIEnv* env, env->ReleaseStringUTFChars(jdb_dir, cdb_dir); env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -97,7 +96,7 @@ void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromLatestBackup0( env->ReleaseStringUTFChars(jdb_dir, cdb_dir); env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -112,7 +111,7 @@ void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env, auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->PurgeOldBackups(jnum_backups_to_keep); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -127,7 +126,7 @@ void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env, auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->DeleteBackup(jbackup_id); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index f1b9cc758..bb3f1a845 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -246,7 +246,7 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, jkey_list, rocksdb::ListJni::getIteratorMethod(env)); // iterate over keys and convert java byte array to slice - while(env->CallBooleanMethod( + while (env->CallBooleanMethod( iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { jbyteArray jkey = (jbyteArray) env->CallObjectMethod( iteratorObj, rocksdb::ListJni::getNextMethod(env)); @@ -272,23 +272,22 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count); // insert in java list - for(std::vector::size_type i = 0; i != s.size(); i++) { - if(s[i].ok()) { + for (std::vector::size_type i = 0; i != s.size(); i++) { + if (s[i].ok()) { jbyteArray jvalue = env->NewByteArray(values[i].size()); env->SetByteArrayRegion( jvalue, 0, values[i].size(), reinterpret_cast(values[i].c_str())); env->CallBooleanMethod( jvalue_list, rocksdb::ListJni::getListAddMethodId(env), 
jvalue); - } - else { + } else { env->CallBooleanMethod( jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr); } } // free up allocated byte arrays - for(std::vector::size_type i = 0; i != keys_to_free.size(); i++) { + for (std::vector::size_type i = 0; i != keys_to_free.size(); i++) { delete[] keys_to_free[i]; } keys_to_free.clear(); @@ -435,17 +434,17 @@ jstring Java_org_rocksdb_RocksDB_getProperty0( JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, jint jproperty_len) { auto db = reinterpret_cast(db_handle); - + const char* property = env->GetStringUTFChars(jproperty, 0); rocksdb::Slice property_slice(property, jproperty_len); - + std::string property_value; bool retCode = db->GetProperty(property_slice, &property_value); env->ReleaseStringUTFChars(jproperty, property); - + if (!retCode) { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); } - + return env->NewStringUTF(property_value.data()); } diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 46e7a6fa0..a51cfce12 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -30,7 +30,7 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( JNIEnv* env, jobject jobj, jint jreserved_bytes) { rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( - rocksdb::jlong_to_size_t(jreserved_bytes)); + static_cast(jreserved_bytes)); rocksdb::WriteBatchJni::setHandle(env, jobj, wb); } From 18004d2f202712546d8769c0b666aae63c60d7d7 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 13 Oct 2014 10:34:52 +0200 Subject: [PATCH 249/829] [RocksJava] Column family support This commit includes the support for the following functionalities: - Single Get/Put operations - WriteBatch operations - Single iterator functionality - Open database with column families - Open database with column families Read/Only - Create column family - Drop column family - Properties of column families - Listing of column families - Fully backwards comptabile implementation - Multi Iterator support - MultiGet - KeyMayExist - Option to create missing column families on open In addition there is are two new Tests: - Test of ColumnFamily functionality - Test of Read only feature to open subsets of column families - Basic test to test the KeyMayExist feature What is not supported currently using RocksJava: - Custom ColumnFamilyOptions The following targets work as expected: - make rocksdbjava - make jtest Test environment: Ubuntu 14.04(LTS, x64), Java 1.7.0_65(OpenJDK IcedTea 2.5.2), g++ 4.8.2, kernel 3.13.0-35-generix --- java/Makefile | 9 +- java/RocksDBSample.java | 2 + java/org/rocksdb/ColumnFamilyHandle.java | 32 + java/org/rocksdb/Options.java | 35 +- java/org/rocksdb/RocksDB.java | 717 +++++++++++++++++- java/org/rocksdb/WriteBatch.java | 35 + java/org/rocksdb/test/BackupableDBTest.java | 4 +- java/org/rocksdb/test/ColumnFamilyTest.java | 282 +++++++ java/org/rocksdb/test/KeyMayExistTest.java | 52 ++ java/org/rocksdb/test/OptionsTest.java | 6 + java/org/rocksdb/test/ReadOnlyTest.java | 126 +++ .../rocksdb/test/StatisticsCollectorTest.java | 4 +- java/rocksjni/columnfamilyhandle.cc | 25 + java/rocksjni/options.cc | 24 +- java/rocksjni/portal.h | 33 + java/rocksjni/rocksjni.cc | 693 ++++++++++++++++- java/rocksjni/write_batch.cc | 116 ++- 17 files changed, 2110 insertions(+), 85 deletions(-) create mode 100644 java/org/rocksdb/ColumnFamilyHandle.java create mode 100644 java/org/rocksdb/test/ColumnFamilyTest.java create mode 100644 java/org/rocksdb/test/KeyMayExistTest.java create mode 100644 
java/org/rocksdb/test/ReadOnlyTest.java create mode 100644 java/rocksjni/columnfamilyhandle.cc diff --git a/java/Makefile b/java/Makefile index 7da76f3f7..9c75c54ea 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -35,13 +35,18 @@ sample: java @rm -rf /tmp/rocksdbjni_not_found test: java + @rm -rf /tmp/rocksdbjni_* javac org/rocksdb/test/*.java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest + #java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest + @rm -rf /tmp/rocksdbjni_* db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index 9eff06037..302d4e04d 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -55,6 +55,7 @@ public class RocksDBSample { assert(options.maxBackgroundCompactions() == 10); assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); + try { assert(options.memTableFactoryName().equals("SkipListFactory")); options.setMemTableConfig( @@ -87,6 +88,7 @@ public class RocksDBSample { } catch (RocksDBException e) { assert(false); } + 
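For orientation, the following is a minimal usage sketch of the column family API introduced by this patch; it is not part of the change itself, and the database path and the extra column family name are arbitrary examples:

import java.util.ArrayList;
import java.util.List;
import org.rocksdb.*;

public class ColumnFamilySample {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    Options options = new Options()
        .setCreateIfMissing(true)
        .setCreateMissingColumnFamilies(true);
    // Open the database together with the "default" and one additional column family.
    List<String> cfNames = new ArrayList<String>();
    cfNames.add("default");
    cfNames.add("new_cf");
    List<ColumnFamilyHandle> cfHandles = new ArrayList<ColumnFamilyHandle>();
    RocksDB db = RocksDB.open(options, "/tmp/rocksdbjni_cf_sample",
        cfNames, cfHandles);
    try {
      // Reads and writes can be scoped to one of the handles filled in on open.
      db.put(cfHandles.get(1), "key".getBytes(), "value".getBytes());
      byte[] value = db.get(cfHandles.get(1), "key".getBytes());
      assert(new String(value).equals("value"));
    } finally {
      // Handles wrap native pointers and are disposed before the database is closed.
      for (ColumnFamilyHandle handle : cfHandles) {
        handle.dispose();
      }
      db.close();
      options.dispose();
    }
  }
}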
Filter bloomFilter = new BloomFilter(10); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) diff --git a/java/org/rocksdb/ColumnFamilyHandle.java b/java/org/rocksdb/ColumnFamilyHandle.java new file mode 100644 index 000000000..334abd96d --- /dev/null +++ b/java/org/rocksdb/ColumnFamilyHandle.java @@ -0,0 +1,32 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * ColumnFamilyHandle class to hold handles to underlying rocksdb + * ColumnFamily Pointers. + */ +public class ColumnFamilyHandle extends RocksObject { + ColumnFamilyHandle(long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + } + + /** + * Deletes underlying C++ filter pointer. + * + * Note that this function should be called only after all + * RocksDB instances referencing the filter are closed. + * Otherwise an undefined behavior will occur. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(long handle); + +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 1ad8f9489..bb6f74e08 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -57,6 +57,21 @@ public class Options extends RocksObject { return this; } + /** + *

If true, missing column families will be automatically created
+ *
+ * Default: false
      + * + * @param flag + * @return true if missing column families shall be created automatically + * on open. + */ + public Options setCreateMissingColumnFamilies(boolean flag) { + assert(isInitialized()); + setCreateMissingColumnFamilies(nativeHandle_, flag); + return this; + } + /** * Use the specified object to interact with the environment, * e.g. to read/write files, schedule background work, etc. @@ -87,6 +102,19 @@ public class Options extends RocksObject { return createIfMissing(nativeHandle_); } + /** + * Return true if the create_missing_column_families flag is set + * to true. If true column families be created if missing. + * + * @return true if the createMissingColumnFamilies is set to + * true. + * @see #setCreateMissingColumnFamilies(boolean) + */ + public boolean createMissingColumnFamilies() { + assert(isInitialized()); + return createIfMissing(nativeHandle_); + } + /** * Set {@link org.rocksdb.Options.BuiltinComparator} to be used with RocksDB. * @@ -781,7 +809,7 @@ public class Options extends RocksObject { private native void setWalTtlSeconds(long handle, long walTtlSeconds); /** - * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs + * {@link #walTtlSeconds()} and {@code #walSizeLimitMB()} affect how archived logs * will be deleted. *
        *
      1. If both set to 0, logs will be deleted asap and will not get into @@ -1515,8 +1543,6 @@ public class Options extends RocksObject { * and total file size for level-3 will be 2GB. * by default 'maxBytesForLevelBase' is 10MB. * - * @return maxBytesForLevelBase the upper-bound of the total size of - * leve-1 files in bytes. * @return the reference to the current option. * @see #setMaxBytesForLevelMultiplier(int) */ @@ -2227,6 +2253,9 @@ public class Options extends RocksObject { private native boolean createIfMissing(long handle); private native void setWriteBufferSize(long handle, long writeBufferSize) throws RocksDBException; + private native void setCreateMissingColumnFamilies( + long handle, boolean flag); + private native boolean createMissingColumnFamilies(long handle); private native long writeBufferSize(long handle); private native void setMaxWriteBufferNumber( long handle, int maxWriteBufferNumber); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index afb858050..d10c235dc 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -5,13 +5,9 @@ package org.rocksdb; -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.io.Closeable; +import java.util.*; import java.io.IOException; import org.rocksdb.util.Environment; -import org.rocksdb.NativeLibraryLoader; /** * A RocksDB is a persistent ordered map from keys to values. It is safe for @@ -84,7 +80,7 @@ public class RocksDB extends RocksObject { err = e; } } - if (success == false) { + if (!success) { throw err; } } @@ -95,11 +91,11 @@ public class RocksDB extends RocksObject { * set to true. * * @param path the path to the rocksdb. - * @return a rocksdb instance on success, null if the specified rocksdb can - * not be opened. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. * + * @throws org.rocksdb.RocksDBException * @see Options#setCreateIfMissing(boolean) - * @see org.rocksdb.Options#createIfMissing() */ public static RocksDB open(String path) throws RocksDBException { // This allows to use the rocksjni default Options instead of @@ -108,18 +104,63 @@ public class RocksDB extends RocksObject { return open(options, path); } + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path and a list + * of column family names. + *

+ * If opened in read write mode every existing column family name must be passed
+ * within the list to this method.
+ *
+ * If opened in read-only mode only a subset of existing column families must
+ * be passed to this method.
+ *
+ * Options instance *should* not be disposed before all DBs using this options
+ * instance have been closed. If user doesn't call options dispose explicitly,
+ * then this options instance will be GC'd automatically
+ *
+ * ColumnFamily handles are disposed when the RocksDB instance is disposed.
+ *
        + * + * @param path the path to the rocksdb. + * @param columnFamilyNames list of column family names + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws org.rocksdb.RocksDBException + * @see Options#setCreateIfMissing(boolean) + */ + public static RocksDB open(String path, List columnFamilyNames, + List columnFamilyHandles) throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + Options options = new Options(); + return open(options, path, columnFamilyNames, columnFamilyHandles); + } + /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path. * + *

* Options instance *should* not be disposed before all DBs using this options
* instance have been closed. If user doesn't call options dispose explicitly,
- * then this options instance will be GC'd automatically.
- *
+ * then this options instance will be GC'd automatically.
+ *
* Options instance can be re-used to open multiple DBs if DB statistics is
* not used. If DB statistics are required, then its recommended to open DB
* with new Options instance as underlying native statistics instance does not
- * use any locks to prevent concurrent updates.
+ * use any locks to prevent concurrent updates.
        + * + * @param options {@link org.rocksdb.Options} instance. + * @param path the path to the rocksdb. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws org.rocksdb.RocksDBException + * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(Options options, String path) throws RocksDBException { @@ -133,6 +174,169 @@ public class RocksDB extends RocksObject { return db; } + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path and a list + * of column family names. + *

+ * If opened in read write mode every existing column family name must be passed
+ * within the list to this method.
+ *
+ * If opened in read-only mode only a subset of existing column families must
+ * be passed to this method.
+ *
+ * Options instance *should* not be disposed before all DBs using this options
+ * instance have been closed. If user doesn't call options dispose explicitly,
+ * then this options instance will be GC'd automatically.
+ *
+ * Options instance can be re-used to open multiple DBs if DB statistics is
+ * not used. If DB statistics are required, then its recommended to open DB
+ * with new Options instance as underlying native statistics instance does not
+ * use any locks to prevent concurrent updates.
+ *
+ * ColumnFamily handles are disposed when the RocksDB instance is disposed.
        + * + * @param options {@link org.rocksdb.Options} instance. + * @param path the path to the rocksdb. + * @param columnFamilyNames list of column family names + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws org.rocksdb.RocksDBException + * @see Options#setCreateIfMissing(boolean) + */ + public static RocksDB open(Options options, String path, List columnFamilyNames, + List columnFamilyHandles) + throws RocksDBException { + RocksDB db = new RocksDB(); + List cfReferences = db.open(options.nativeHandle_, path, + columnFamilyNames, columnFamilyNames.size()); + for (int i=0; i columnFamilyNames, + List columnFamilyHandles) throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + Options options = new Options(); + return openReadOnly(options, path, columnFamilyNames, columnFamilyHandles); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * @throws RocksDBException + */ + public static RocksDB openReadOnly(Options options, String path) + throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + RocksDB db = new RocksDB(); + db.openROnly(options.nativeHandle_, path); + + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + *

This open method allows to open RocksDB using a subset of available
+ * column families
+ *
+ * Options instance *should* not be disposed before all DBs using this
+ * options instance have been closed. If user doesn't call options dispose
+ * explicitly, then this options instance will be GC'd automatically.
        + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @param columnFamilyNames list of column family names + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * @throws RocksDBException + */ + public static RocksDB openReadOnly(Options options, String path, + List columnFamilyNames, List columnFamilyHandles) + throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + RocksDB db = new RocksDB(); + List cfReferences = db.openROnly(options.nativeHandle_, path, + columnFamilyNames, columnFamilyNames.size()); + for (int i=0; i List containing the column family names + * + * @throws RocksDBException + */ + public static List listColumnFamilies(Options options, String path) + throws RocksDBException { + return RocksDB.listColumnFamilies(options.nativeHandle_, path); + } + private void storeOptionsInstance(Options options) { options_ = options; } @@ -155,16 +359,39 @@ public class RocksDB extends RocksObject { * * @param key the specified key to be inserted. * @param value the value associated with the specified key. + * + * @see RocksDBException */ public void put(byte[] key, byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); } + /** + * Set the database entry for "key" to "value" in the specified + * column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * throws IllegalArgumentException if column family is not present + * + * @see RocksDBException + */ + public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, + byte[] value) throws RocksDBException { + put(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + /** * Set the database entry for "key" to "value". * * @param key the specified key to be inserted. * @param value the value associated with the specified key. + * + * @see RocksDBException */ public void put(WriteOptions writeOpts, byte[] key, byte[] value) throws RocksDBException { @@ -172,8 +399,73 @@ public class RocksDB extends RocksObject { key, key.length, value, value.length); } + /** + * Set the database entry for "key" to "value" for the specified + * column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * throws IllegalArgumentException if column family is not present + * + * @see RocksDBException + * @see IllegalArgumentException + */ + public void put(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpts, + byte[] key, byte[] value) throws RocksDBException { + put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. 
+ * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instnace + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ColumnFamilyHandle columnFamilyHandle, + byte[] key, StringBuffer value){ + return keyMayExist(key, key.length, columnFamilyHandle.nativeHandle_, + value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instnace + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ReadOptions readOptions, + ColumnFamilyHandle columnFamilyHandle, byte[] key, StringBuffer value){ + return keyMayExist(readOptions.nativeHandle_, + key, key.length, columnFamilyHandle.nativeHandle_, + value); + } + /** * Apply the specified updates to the database. + * + * @param writeOpts WriteOptions instance + * @param updates WriteBatch instance + * + * @see RocksDBException */ public void write(WriteOptions writeOpts, WriteBatch updates) throws RocksDBException { @@ -181,7 +473,7 @@ public class RocksDB extends RocksObject { } /** - * Get the value associated with the specified key. + * Get the value associated with the specified key within column family * * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. @@ -191,11 +483,35 @@ public class RocksDB extends RocksObject { * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. + * + * @see RocksDBException */ public int get(byte[] key, byte[] value) throws RocksDBException { return get(nativeHandle_, key, key.length, value, value.length); } + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException + */ + public int get(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException, IllegalArgumentException { + return get(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + /** * Get the value associated with the specified key. * @@ -207,12 +523,35 @@ public class RocksDB extends RocksObject { * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. 
+ * + * @throws RocksDBException */ public int get(ReadOptions opt, byte[] key, byte[] value) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, key.length, value, value.length); } + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException + */ + public int get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, byte[] key, + byte[] value) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, key.length, value, + value.length, columnFamilyHandle.nativeHandle_); + } /** * The simplified version of get which returns a new byte array storing @@ -223,12 +562,30 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * - * @see RocksDBException + * @throws RocksDBException */ public byte[] get(byte[] key) throws RocksDBException { return get(nativeHandle_, key, key.length); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException + */ + public byte[] get(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + return get(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be @@ -239,12 +596,32 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * - * @see RocksDBException + * @throws RocksDBException */ public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, key.length); } + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. 
+ * + * @throws RocksDBException + */ + public byte[] get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, + byte[] key) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + /** * Returns a map of keys for which values were found in DB. * @@ -252,7 +629,7 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @see RocksDBException + * @throws RocksDBException */ public Map multiGet(List keys) throws RocksDBException { @@ -273,6 +650,43 @@ public class RocksDB extends RocksObject { return keyValueMap; } + /** + * Returns a map of keys for which values were found in DB. + *

+ * Note: Every key needs to have a related column family name in
+ * {@code columnFamilyHandleList}.
        + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys List of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException + * @throws IllegalArgumentException + */ + public Map multiGet(List columnFamilyHandleList, + List keys) throws RocksDBException, IllegalArgumentException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()) { + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + List values = multiGet(nativeHandle_, keys, keys.size(), + columnFamilyHandleList); + + Map keyValueMap = new HashMap(); + for(int i = 0; i < values.size(); i++) { + if (values.get(i) == null) { + continue; + } + keyValueMap.put(keys.get(i), values.get(i)); + } + return keyValueMap; + } /** * Returns a map of keys for which values were found in DB. @@ -282,7 +696,7 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @see RocksDBException + * @throws RocksDBException */ public Map multiGet(ReadOptions opt, List keys) throws RocksDBException { @@ -303,10 +717,56 @@ public class RocksDB extends RocksObject { return keyValueMap; } + /** + * Returns a map of keys for which values were found in DB. + *

+ * Note: Every key needs to have a related column family name in
+ * {@code columnFamilyHandleList}.
        + * + * @param opt Read options. + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException + * @throws java.lang.IllegalArgumentException + */ + public Map multiGet(ReadOptions opt, + List columnFamilyHandleList, List keys) + throws RocksDBException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()){ + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + + List values = multiGet(nativeHandle_, opt.nativeHandle_, + keys, keys.size(), columnFamilyHandleList); + + Map keyValueMap = new HashMap(); + for(int i = 0; i < values.size(); i++) { + if(values.get(i) == null) { + continue; + } + keyValueMap.put(keys.get(i), values.get(i)); + } + + return keyValueMap; + } + /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. + * + * @param key Key to delete within database + * + * @throws RocksDBException */ public void remove(byte[] key) throws RocksDBException { remove(nativeHandle_, key, key.length); @@ -316,6 +776,27 @@ public class RocksDB extends RocksObject { * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database + * + * @throws RocksDBException + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * + * @throws RocksDBException */ public void remove(WriteOptions writeOpt, byte[] key) throws RocksDBException { @@ -323,20 +804,74 @@ public class RocksDB extends RocksObject { } /** - * DB implementations can export properties about their state - via this method. If "property" is a valid property understood by this - DB implementation, fills "*value" with its current value and returns - true. Otherwise returns false. - + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. 
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * + * @throws RocksDBException + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpt, + byte[] key) throws RocksDBException { + remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } - Valid property names include: + /** + * DB implements can export properties about their state + * via this method on a per column family level. + * + *

If {@code property} is a valid property understood by this DB
+ * implementation, fills {@code value} with its current value and
+ * returns true. Otherwise returns false.
+ *
+ * Valid property names include:
+ *
+ *   "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+ *       where <N> is an ASCII representation of a level number (e.g. "0").
+ *   "rocksdb.stats" - returns a multi-line string that describes statistics
+ *       about the internal operation of the DB.
+ *   "rocksdb.sstables" - returns a multi-line string that describes all
+ *       of the sstables that make up the db contents.
        + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param property to be fetched. See above for examples + * @return property value + * + * @throws RocksDBException + */ + public String getProperty(ColumnFamilyHandle columnFamilyHandle, String property) + throws RocksDBException { + return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, property, + property.length()); + } - "rocksdb.num-files-at-level" - return the number of files at level , - where is an ASCII representation of a level number (e.g. "0"). - "rocksdb.stats" - returns a multi-line string that describes statistics - about the internal operation of the DB. - "rocksdb.sstables" - returns a multi-line string that describes all - of the sstables that make up the db contents. + /** + * DB implementations can export properties about their state + * via this method. If "property" is a valid property understood by this + * DB implementation, fills "*value" with its current value and returns + * true. Otherwise returns false. + * + *

Valid property names include:
+ *
+ *   "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+ *       where <N> is an ASCII representation of a level number (e.g. "0").
+ *   "rocksdb.stats" - returns a multi-line string that describes statistics
+ *       about the internal operation of the DB.
+ *   "rocksdb.sstables" - returns a multi-line string that describes all
+ *       of the sstables that make up the db contents.
        + * + * @param property to be fetched. See above for examples + * @return property value + * + * @throws RocksDBException */ public String getProperty(String property) throws RocksDBException { return getProperty0(nativeHandle_, property, property.length()); @@ -356,6 +891,77 @@ public class RocksDB extends RocksObject { return new RocksIterator(iterator0(nativeHandle_)); } + /** + * Return a heap-allocated iterator over the contents of the database. + * The result of newIterator() is initially invalid (caller must + * call one of the Seek methods on the iterator before using it). + * + * Caller should close the iterator when it is no longer needed. + * The returned iterator should be closed before this db is closed. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @return instance of iterator object. + */ + public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) { + return new RocksIterator(iterator0(nativeHandle_, columnFamilyHandle.nativeHandle_)); + } + + /** + * Returns iterators from a consistent database state across multiple + * column families. Iterators are heap allocated and need to be deleted + * before the db is deleted + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} + * instances + * + * @throws RocksDBException + */ + public List newIterators( + List columnFamilyHandleList) throws RocksDBException { + List iterators = + new ArrayList(columnFamilyHandleList.size()); + + long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList); + for (int i=0; i open(long optionsHandle, String path, + List columnFamilyNames, int columnFamilyNamesLength) + throws RocksDBException; + protected native static List listColumnFamilies( + long optionsHandle, String path) throws RocksDBException; + protected native void openROnly( + long optionsHandle, String path) throws RocksDBException; + protected native List openROnly( + long optionsHandle, String path, List columnFamilyNames, + int columnFamilyNamesLength) throws RocksDBException; protected native void put( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; + protected native void put( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; protected native void put( long handle, long writeOptHandle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; + protected native void put( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; protected native void write( long writeOptHandle, long batchHandle) throws RocksDBException; + protected native boolean keyMayExist(byte[] key, int keyLen, + long cfHandle, StringBuffer stringBuffer); + protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, + long cfHandle, StringBuffer stringBuffer); protected native int get( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; + protected native int get( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; protected native int get( long handle, long readOptHandle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; + protected native int get( + long handle, long readOptHandle, byte[] key, int keyLen, 
+ byte[] value, int valueLen, long cfHandle) throws RocksDBException; protected native List multiGet( long dbHandle, List keys, int keysCount); + protected native List multiGet( + long dbHandle, List keys, int keysCount, List + cfHandles); protected native List multiGet( long dbHandle, long rOptHandle, List keys, int keysCount); + protected native List multiGet( + long dbHandle, long rOptHandle, List keys, int keysCount, + List cfHandles); protected native byte[] get( long handle, byte[] key, int keyLen) throws RocksDBException; + protected native byte[] get( + long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native byte[] get( long handle, long readOptHandle, byte[] key, int keyLen) throws RocksDBException; + protected native byte[] get( + long handle, long readOptHandle, + byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native void remove( long handle, byte[] key, int keyLen) throws RocksDBException; + protected native void remove( + long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native void remove( long handle, long writeOptHandle, byte[] key, int keyLen) throws RocksDBException; + protected native void remove( + long handle, long writeOptHandle, + byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native String getProperty0(long nativeHandle, String property, int propertyLength) throws RocksDBException; - protected native long iterator0(long optHandle); + protected native String getProperty0(long nativeHandle, long cfHandle, + String property, int propertyLength) throws RocksDBException; + protected native long iterator0(long handle); + protected native long iterator0(long handle, long cfHandle); + protected native long[] iterators(long handle, + List columnFamilyNames) throws RocksDBException; private native void disposeInternal(long handle); + private native long createColumnFamily(long handle, String name) throws RocksDBException; + private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; + protected Options options_; } diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index f538dc1a0..0a16d5104 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -47,6 +47,16 @@ public class WriteBatch extends RocksObject { put(key, key.length, value, value.length); } + /** + * Store the mapping "key->value" within given column + * family. + */ + public void put(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value) { + put(key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + /** * Merge "value" with the existing value of "key" in the database. * "key->merge(existing, value)" @@ -55,6 +65,16 @@ public class WriteBatch extends RocksObject { merge(key, key.length, value, value.length); } + /** + * Merge "value" with the existing value of "key" in given column family. + * "key->merge(existing, value)" + */ + public void merge(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value) { + merge(key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + /** * If the database contains a mapping for "key", erase it. Else do nothing. */ @@ -62,6 +82,13 @@ public class WriteBatch extends RocksObject { remove(key, key.length); } + /** + * If column family contains a mapping for "key", erase it. Else do nothing. 
+ */ + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { + remove(key, key.length, columnFamilyHandle.nativeHandle_); + } + /** * Append a blob of arbitrary size to the records in this batch. The blob will * be stored in the transaction log but not in any other file. In particular, @@ -94,9 +121,17 @@ public class WriteBatch extends RocksObject { private native void newWriteBatch(int reserved_bytes); private native void put(byte[] key, int keyLen, byte[] value, int valueLen); + private native void put(byte[] key, int keyLen, + byte[] value, int valueLen, + long cfHandle); private native void merge(byte[] key, int keyLen, byte[] value, int valueLen); + private native void merge(byte[] key, int keyLen, + byte[] value, int valueLen, + long cfHandle); private native void remove(byte[] key, int keyLen); + private native void remove(byte[] key, int keyLen, + long cfHandle); private native void putLogData(byte[] blob, int blobLen); private native void disposeInternal(long handle); } diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 9d3a64c06..ee4509697 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -8,8 +8,8 @@ package org.rocksdb.test; import org.rocksdb.*; public class BackupableDBTest { - static final String db_path = "/tmp/backupablejni_db"; - static final String backup_path = "/tmp/backupablejni_db_backup"; + static final String db_path = "/tmp/rocksdbjni_backupable_db_test"; + static final String backup_path = "/tmp/rocksdbjni_backupable_db_backup_test"; static { RocksDB.loadLibrary(); } diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java new file mode 100644 index 000000000..57fd2e347 --- /dev/null +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -0,0 +1,282 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
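As a quick illustration of the WriteBatch column family overloads added above, a sketch only (not part of this patch); "db" and "cfHandle" are assumed to come from a column-family-aware open such as the one exercised in ColumnFamilyTest below:

WriteBatch batch = new WriteBatch();
WriteOptions writeOpts = new WriteOptions();
batch.put("plain-key".getBytes(), "plain-value".getBytes());      // default column family
batch.put(cfHandle, "cf-key".getBytes(), "cf-value".getBytes());  // explicit column family
batch.remove(cfHandle, "stale-key".getBytes());
db.write(writeOpts, batch);                                       // updates are applied atomically
batch.dispose();
writeOpts.dispose();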
+ +package org.rocksdb.test; + +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; +import java.util.Map; +import org.rocksdb.*; + +public class ColumnFamilyTest { + static final String db_path = "/tmp/rocksdbjni_columnfamily_test"; + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) { + + RocksDB db = null; + Options options = new Options(); + options.setCreateIfMissing(true); + try { + db = RocksDB.open(options, db_path); + } catch (RocksDBException e) { + assert(false); + } + // Test listColumnFamilies + List columnFamilyNames; + try { + columnFamilyNames = RocksDB.listColumnFamilies(options, db_path); + if (columnFamilyNames != null && columnFamilyNames.size() > 0) { + assert(columnFamilyNames.size() == 1); + assert(new String(columnFamilyNames.get(0)).equals("default")); + } else { + assert(false); + } + } catch (RocksDBException e) { + assert(false); + } + + // Test createColumnFamily + try { + db.createColumnFamily("new_cf"); + } catch (RocksDBException e) { + assert(false); + } + + if (db != null) { + db.close(); + } + + // Test listColumnFamilies after create "new_cf" + try { + columnFamilyNames = RocksDB.listColumnFamilies(options, db_path); + if (columnFamilyNames != null && columnFamilyNames.size() > 0) { + assert(columnFamilyNames.size() == 2); + assert(new String(columnFamilyNames.get(0)).equals("default")); + assert(new String(columnFamilyNames.get(1)).equals("new_cf")); + } else { + assert(false); + } + } catch (RocksDBException e) { + assert(false); + } + + // Test open database with column family names + List cfNames = new ArrayList(); + List columnFamilyHandleList = + new ArrayList(); + cfNames.add("default"); + cfNames.add("new_cf"); + + try { + db = RocksDB.open(options, db_path, cfNames, columnFamilyHandleList); + assert(columnFamilyHandleList.size() == 2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), + "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), + "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes())); + assert(retVal.equals("newcfvalue")); + assert( (db.get(columnFamilyHandleList.get(1), + "dfkey1".getBytes())) == null); + db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assert( (db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes())) == null); + db.remove("dfkey2".getBytes()); + assert( (db.get(columnFamilyHandleList.get(0), + "dfkey2".getBytes())) == null); + } catch (RocksDBException e) { + assert(false); + } + + // Test create write to and drop ColumnFamily + ColumnFamilyHandle tmpColumnFamilyHandle = null; + try { + tmpColumnFamilyHandle = db.createColumnFamily("tmpCF"); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + tmpColumnFamilyHandle.dispose(); + } catch (Exception e) { + assert(false); + } + + // Put to disposed column family tmpColumnFamilyHandle must fail + try { + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + + // Remove to disposed column family tmpColumnFamilyHandle must fail + try { + db.remove(tmpColumnFamilyHandle, "key".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + + // Get on a disposed column family tmpColumnFamilyHandle must fail + try { + db.get(tmpColumnFamilyHandle, 
"key".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + + // Test WriteBatch + try { + WriteBatch writeBatch = new WriteBatch(); + WriteOptions writeOpt = new WriteOptions(); + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + writeBatch.remove("xyz".getBytes()); + writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + writeBatch.dispose(); + assert(db.get(columnFamilyHandleList.get(1), + "xyz".getBytes()) == null); + assert(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey".getBytes())).equals("value")); + assert(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey2".getBytes())).equals("value2")); + assert(new String(db.get("key".getBytes())).equals("value")); + } catch (Exception e) { + e.printStackTrace(); + assert(false); + } + + // Test iterator on column family + try { + RocksIterator rocksIterator = db.newIterator( + columnFamilyHandleList.get(1)); + rocksIterator.seekToFirst(); + Map refMap = new HashMap(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while(rocksIterator.isValid()) { + i++; + refMap.get(new String(rocksIterator.key())).equals( + new String(rocksIterator.value())); + rocksIterator.next(); + } + assert(i == 2); + rocksIterator.dispose(); + } catch(Exception e) { + assert(false); + } + + // Test property handling on column families + try { + assert(db.getProperty("rocksdb.estimate-num-keys") != null); + assert(db.getProperty("rocksdb.stats") != null); + assert(db.getProperty(columnFamilyHandleList.get(0), + "rocksdb.sstables") != null); + assert(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.estimate-num-keys") != null); + assert(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.stats") != null); + assert(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.sstables") != null); + } catch(Exception e) { + assert(false); + } + + // MultiGet test + List cfCustomList = new ArrayList(); + try { + List keys = new ArrayList(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + Map retValues = db.multiGet(columnFamilyHandleList,keys); + assert(retValues.size() == 2); + assert(new String(retValues.get(keys.get(0))) + .equals("value")); + assert(new String(retValues.get(keys.get(1))) + .equals("value")); + + cfCustomList.add(columnFamilyHandleList.get(0)); + cfCustomList.add(columnFamilyHandleList.get(0)); + retValues = db.multiGet(cfCustomList, keys); + assert(retValues.size() == 1); + assert(new String(retValues.get(keys.get(0))) + .equals("value")); + } catch (RocksDBException e) { + assert(false); + } catch (IllegalArgumentException e) { + assert(false); + } + + // Test multiget without correct number of column + // families + try { + List keys = new ArrayList(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + cfCustomList.remove(1); + db.multiGet(cfCustomList, keys); + assert(false); + } catch (RocksDBException e) { + assert(false); + } catch (IllegalArgumentException e) { + assert(true); + } + + try { + // iterate over default key/value pairs + List iterators = + db.newIterators(columnFamilyHandleList); + assert(iterators.size() == 2); + RocksIterator iter = iterators.get(0); + iter.seekToFirst(); + Map defRefMap = new HashMap(); + defRefMap.put("dfkey1", "dfvalue"); + 
defRefMap.put("key", "value"); + while (iter.isValid()) { + defRefMap.get(new String(iter.key())).equals( + new String(iter.value())); + iter.next(); + } + // iterate over new_cf key/value pairs + Map cfRefMap = new HashMap(); + cfRefMap.put("newcfkey", "value"); + cfRefMap.put("newcfkey2", "value2"); + iter = iterators.get(1); + iter.seekToFirst(); + while (iter.isValid()) { + cfRefMap.get(new String(iter.key())).equals( + new String(iter.value())); + iter.next(); + } + // free iterators + for (RocksIterator iterator : iterators) { + iterator.dispose(); + } + assert(true); + } catch (RocksDBException e) { + assert(false); + } + + System.out.println("Passed ColumnFamilyTest"); + // free cf handles before database close + for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + // close database + db.close(); + // be sure to dispose c++ pointers + options.dispose(); + } +} diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java new file mode 100644 index 000000000..a4ecb53da --- /dev/null +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb.test; + +import org.rocksdb.ColumnFamilyHandle; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import java.util.ArrayList; +import java.util.List; + +public class KeyMayExistTest { + static final String DB_PATH = "/tmp/rocksdbjni_keymayexit_test"; + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args){ + RocksDB db; + Options options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + try { + // open database using cf names + List cfNames = new ArrayList(); + List columnFamilyHandleList = + new ArrayList(); + cfNames.add("default"); + cfNames.add("new_cf"); + db = RocksDB.open(options, DB_PATH, cfNames, columnFamilyHandleList); + assert(columnFamilyHandleList.size()==2); + + db.put("key".getBytes(), "value".getBytes()); + StringBuffer retValue = new StringBuffer(); + if (db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), + retValue)) { + assert(retValue.toString().equals("value")); + } else { + assert(false); + } + assert(db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(), + retValue) == false); + System.out.println("Passed KeyMayExistTest"); + }catch (RocksDBException e){ + e.printStackTrace(); + assert(false); + } + } +} diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index 9f14b40d9..222d87b8d 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -26,6 +26,12 @@ public class OptionsTest { assert(opt.createIfMissing() == boolValue); } + { // CreateMissingColumnFamilies test + boolean boolValue = rand.nextBoolean(); + opt.setCreateMissingColumnFamilies(boolValue); + assert(opt.createMissingColumnFamilies() == boolValue); + } + { // ErrorIfExists test boolean boolValue = rand.nextBoolean(); opt.setErrorIfExists(boolValue); diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java new file mode 100644 index 000000000..87e8f1e9e --- /dev/null +++ 
b/java/org/rocksdb/test/ReadOnlyTest.java @@ -0,0 +1,126 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.util.ArrayList; +import java.util.List; + +public class ReadOnlyTest { + static final String DB_PATH = "/tmp/rocksdbjni_readonly_test"; + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args){ + RocksDB db = null, db2 = null, db3 = null; + List columnFamilyHandleList = + new ArrayList(); + List db2ColumnFamilyHandleList = + new ArrayList(); + List db3ColumnFamilyHandleList = + new ArrayList(); + Options options = new Options(); + options.setCreateIfMissing(true); + try { + db = RocksDB.open(options, DB_PATH); + db.put("key".getBytes(), "value".getBytes()); + db2 = RocksDB.openReadOnly(DB_PATH); + assert("value".equals(new String(db2.get("key".getBytes())))); + db.close(); + db2.close(); + + + List cfNames = new ArrayList(); + cfNames.add("default"); + + db = RocksDB.open(DB_PATH, cfNames, columnFamilyHandleList); + columnFamilyHandleList.add(db.createColumnFamily("new_cf")); + columnFamilyHandleList.add(db.createColumnFamily("new_cf2")); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), + "value2".getBytes()); + + db2 = RocksDB.openReadOnly(DB_PATH, cfNames, db2ColumnFamilyHandleList); + assert(db2.get("key2".getBytes())==null); + assert(db2.get(columnFamilyHandleList.get(0), "key2".getBytes())==null); + + List cfNewName = new ArrayList(); + cfNewName.add("default"); + cfNewName.add("new_cf2"); + db3 = RocksDB.openReadOnly(DB_PATH, cfNewName, db3ColumnFamilyHandleList); + assert(new String(db3.get(db3ColumnFamilyHandleList.get(1), + "key2".getBytes())).equals("value2")); + }catch (RocksDBException e){ + e.printStackTrace(); + assert(false); + } + // test that put fails in readonly mode + try { + db2.put("key".getBytes(), "value".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + try { + db3.put(db3ColumnFamilyHandleList.get(1), + "key".getBytes(), "value".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + // test that remove fails in readonly mode + try { + db2.remove("key".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + try { + db3.remove(db3ColumnFamilyHandleList.get(1), + "key".getBytes()); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + // test that write fails in readonly mode + WriteBatch wb = new WriteBatch(); + wb.put("key".getBytes(), "value".getBytes()); + try { + db2.write(new WriteOptions(), wb); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + wb.dispose(); + wb = new WriteBatch(); + wb.put(db3ColumnFamilyHandleList.get(1), + "key".getBytes(), "value".getBytes()); + try { + db3.write(new WriteOptions(), wb); + assert(false); + } catch (RocksDBException e) { + assert(true); + } + wb.dispose(); + // cleanup c++ pointers + for (ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + db.close(); + for (ColumnFamilyHandle columnFamilyHandle : + db2ColumnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + db2.close(); + for (ColumnFamilyHandle columnFamilyHandle : + db3ColumnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + 
db3.close(); + System.out.println("Passed ReadOnlyTest"); + } +} diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java index edbf693e4..5298aa46a 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/org/rocksdb/test/StatisticsCollectorTest.java @@ -9,7 +9,7 @@ import java.util.Collections; import org.rocksdb.*; public class StatisticsCollectorTest { - static final String db_path = "/tmp/backupablejni_db"; + static final String db_path = "/tmp/rocksdbjni_statistics_collector_test"; static { RocksDB.loadLibrary(); } @@ -19,7 +19,7 @@ public class StatisticsCollectorTest { Options opt = new Options().createStatistics().setCreateIfMissing(true); Statistics stats = opt.statisticsPtr(); - RocksDB db = RocksDB.open(db_path); + RocksDB db = RocksDB.open(opt, db_path); StatsCallbackMock callback = new StatsCallbackMock(); StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); diff --git a/java/rocksjni/columnfamilyhandle.cc b/java/rocksjni/columnfamilyhandle.cc new file mode 100644 index 000000000..be3b4c82f --- /dev/null +++ b/java/rocksjni/columnfamilyhandle.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. + +#include +#include +#include + +#include "include/org_rocksdb_ColumnFamilyHandle.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + delete it; +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 8d3cb37e0..ef104d92b 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include "include/org_rocksdb_Options.h" @@ -64,6 +64,28 @@ jboolean Java_org_rocksdb_Options_createIfMissing( return reinterpret_cast(jhandle)->create_if_missing; } +/* + * Class: org_rocksdb_Options + * Method: setCreateMissingColumnFamilies + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setCreateMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast + (jhandle)->create_missing_column_families = flag; +} + +/* + * Class: org_rocksdb_Options + * Method: createMissingColumnFamilies + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_createMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->create_missing_column_families; +} + /* * Class: org_rocksdb_Options * Method: useReverseBytewiseComparator diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 374d20b0b..14b2cb98a 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -327,6 +327,39 @@ class FilterJni { } }; +class ColumnFamilyHandleJni { + public: + // Get the java class id of org.rocksdb.ColumnFamilyHandle. 
+ static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyHandle"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.ColumnFamilyHandle. + // that stores the pointer to rocksdb::ColumnFamilyHandle. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::ColumnFamilyHandle. + static rocksdb::ColumnFamilyHandle* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::ColumnFamilyHandle pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::ColumnFamilyHandle* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + class ListJni { public: // Get the java class id of java.util.List. diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index bb3f1a845..fa9a66a7d 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -25,7 +25,7 @@ * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_open( +void Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { auto opt = reinterpret_cast(jopt_handle); rocksdb::DB* db = nullptr; @@ -40,12 +40,215 @@ void Java_org_rocksdb_RocksDB_open( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_RocksDB + * Method: openROnly + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, + db_path, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: openROnly + * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Ljava_util_List_2I( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfname_list, jint jcfname_count) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector handles; + // get iterator for cfnames + jobject iteratorObj = env->CallObjectMethod( + jcfname_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over cfnames and convert cfnames to + // ColumnFamilyDescriptor instances + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jstring jstr = (jstring) env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + const char* cfname = env->GetStringUTFChars(jstr, 0); + + // free allocated cfnames after call to open + cfnames_to_free.push_back(cfname); + jcfnames_for_free.push_back(jstr); + column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, + rocksdb::ColumnFamilyOptions())); + } + + rocksdb::Status s = 
rocksdb::DB::OpenForReadOnly(*opt, + db_path, column_families, &handles, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); + } + + jobject jcfhandle_list = nullptr; + // check if open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jcfhandle_list = env->NewObject(jclazz, mid, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jclazz = env->FindClass("java/lang/Long"); + jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); + jobject obj = env->NewObject(jclazz, mid, + reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return jcfhandle_list; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: open + * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + */ +jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfname_list, jint jcfname_count) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector handles; + // get iterator for cfnames + jobject iteratorObj = env->CallObjectMethod( + jcfname_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over cfnames and convert cfnames to + // ColumnFamilyDescriptor instances + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jstring jstr = (jstring) env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + const char* cfname = env->GetStringUTFChars(jstr, 0); + + // free allocated cfnames after call to open + cfnames_to_free.push_back(cfname); + jcfnames_for_free.push_back(jstr); + column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, + rocksdb::ColumnFamilyOptions())); + } + + rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families, + &handles, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); + } + + jobject jcfhandle_list = nullptr; + // check if open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jcfhandle_list = env->NewObject(jclazz, mid, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jclazz = env->FindClass("java/lang/Long"); + jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); + jobject obj = env->NewObject(jclazz, mid, + 
reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return jcfhandle_list; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::ListColumnFamilies + +/* + * Class: org_rocksdb_RocksDB + * Method: listColumnFamilies + * Signature: (JLjava/lang/String;)Ljava/util/List; + */ +jobject Java_org_rocksdb_RocksDB_listColumnFamilies( + JNIEnv* env, jclass jclazz, jlong jopt_handle, jstring jdb_path) { + std::vector column_family_names; + auto opt = reinterpret_cast(jopt_handle); + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + jobject jvalue_list = nullptr; + + rocksdb::Status s = rocksdb::DB::ListColumnFamilies(*opt, db_path, + &column_family_names); + env->ReleaseStringUTFChars(jdb_path, db_path); + if (s.ok()) { + // Don't reuse class pointer + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(env, + jclazz); + jvalue_list = env->NewObject(jclazz, mid, column_family_names.size()); + + for (std::vector::size_type i = 0; + i < column_family_names.size(); i++) { + jbyteArray jvalue = env->NewByteArray(column_family_names[i].size()); + env->SetByteArrayRegion(jvalue, 0, column_family_names[i].size(), + reinterpret_cast(column_family_names[i].c_str())); + env->CallBooleanMethod(jvalue_list, + rocksdb::ListJni::getListAddMethodId(env), jvalue); + } + } + return jvalue_list; +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Put void rocksdb_put_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - jbyteArray jkey, jint jkey_len, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, jbyteArray jvalue, jint jvalue_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); @@ -53,7 +256,13 @@ void rocksdb_put_helper( rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - rocksdb::Status s = db->Put(write_options, key_slice, value_slice); + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Put(write_options, cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + s = db->Put(write_options, key_slice, value_slice); + } // trigger java unref on key and value. 
// by passing JNI_ABORT, it will simply release the reference without @@ -80,10 +289,31 @@ void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); - rocksdb_put_helper(env, db, default_write_options, + rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_len, jvalue, jvalue_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_put__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_put_helper(env, db, default_write_options, cf_handle, + jkey, jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} /* * Class: org_rocksdb_RocksDB @@ -99,11 +329,34 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI( auto write_options = reinterpret_cast( jwrite_options_handle); - rocksdb_put_helper(env, db, *write_options, + rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_len, jvalue, jvalue_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (JJ[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_put_helper(env, db, *write_options, cf_handle, + jkey, jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Write /* @@ -126,20 +379,94 @@ void Java_org_rocksdb_RocksDB_write( } } +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::KeyMayExist +jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, + const rocksdb::ReadOptions& read_opt, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, + jobject jvalue) { + std::string value; + bool value_found = false; + jboolean isCopy; + jbyte* key = env->GetByteArrayElements(jkey, &isCopy); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + bool keyMaxExist = db->KeyMayExist(read_opt, cf_handle, key_slice, + &value, &value_found); + if (value_found && !value.empty()) { + jclass clazz = env->GetObjectClass(jvalue); + jmethodID mid = env->GetMethodID(clazz, "append", + "(Ljava/lang/String;)Ljava/lang/StringBuffer;"); + jstring new_value_str = env->NewStringUTF(value.c_str()); + env->CallObjectMethod(jvalue, mid, new_value_str); + } + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + return static_cast(keyMaxExist); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: ([BIJLjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, + jlong jcf_handle, jobject jvalue) { + 
rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto cf_handle = reinterpret_cast( + jcf_handle); + if (cf_handle != nullptr) { + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), + cf_handle, jkey, jkey_len, jvalue); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } + return true; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: (J[BIJLjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jread_options_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jvalue) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto& read_options = *reinterpret_cast( + jread_options_handle); + auto cf_handle = reinterpret_cast( + jcf_handle); + if (cf_handle != nullptr) { + return key_may_exist_helper(env, db, read_options, cf_handle, + jkey, jkey_len, jvalue); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } + return true; +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Get jbyteArray rocksdb_get_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt, - jbyteArray jkey, jint jkey_len) { + rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, + jint jkey_len) { jboolean isCopy; jbyte* key = env->GetByteArrayElements(jkey, &isCopy); rocksdb::Slice key_slice( reinterpret_cast(key), jkey_len); std::string value; - rocksdb::Status s = db->Get( - read_opt, key_slice, &value); + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_opt, column_family_handle, key_slice, &value); + } else { + // backwards compatibility + s = db->Get(read_opt, key_slice, &value); + } // trigger java unref on key. 
// by passing JNI_ABORT, it will simply release the reference without @@ -172,10 +499,31 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI( jbyteArray jkey, jint jkey_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), + rocksdb::ReadOptions(), nullptr, jkey, jkey_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), + cf_handle, jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return env->NewByteArray(0); + } +} + /* * Class: org_rocksdb_RocksDB * Method: get @@ -186,14 +534,36 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI( jbyteArray jkey, jint jkey_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), + *reinterpret_cast(jropt_handle), nullptr, jkey, jkey_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, + jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return env->NewByteArray(0); + } +} + jint rocksdb_get_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options, - jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jvalue, jint jvalue_len) { static const int kNotFound = -1; static const int kStatusError = -2; @@ -204,8 +574,13 @@ jint rocksdb_get_helper( // TODO(yhchiang): we might save one memory allocation here by adding // a DB::Get() function which takes preallocated jbyte* as input. std::string cvalue; - rocksdb::Status s = db->Get( - read_options, key_slice, &cvalue); + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_options, column_family_handle, key_slice, &cvalue); + } else { + // backwards compatibility + s = db->Get(read_options, key_slice, &cvalue); + } // trigger java unref on key. 
// by passing JNI_ABORT, it will simply release the reference without @@ -236,11 +611,31 @@ jint rocksdb_get_helper( return cvalue_len; } +// cf multi get jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, - const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count) { + const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count, + jobject jcfhandle_list) { std::vector keys; std::vector keys_to_free; + std::vector cf_handles; + + if (jcfhandle_list != nullptr) { + // get cf iterator + jobject cfIteratorObj = env->CallObjectMethod( + jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over keys and convert java byte array to slice + while (env->CallBooleanMethod( + cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jobject jobj = (jbyteArray) env->CallObjectMethod( + cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); + rocksdb::ColumnFamilyHandle* cfHandle = + rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); + cf_handles.push_back(cfHandle); + } + } + // Process key list // get iterator jobject iteratorObj = env->CallObjectMethod( jkey_list, rocksdb::ListJni::getIteratorMethod(env)); @@ -263,7 +658,12 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, } std::vector values; - std::vector s = db->MultiGet(rOpt, keys, &values); + std::vector s; + if (cf_handles.size() == 0) { + s = db->MultiGet(rOpt, keys, &values); + } else { + s = db->MultiGet(rOpt, cf_handles, keys, &values); + } // Don't reuse class pointer jclass jclazz = env->FindClass("java/util/ArrayList"); @@ -285,13 +685,11 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr); } } - // free up allocated byte arrays for (std::vector::size_type i = 0; i != keys_to_free.size(); i++) { delete[] keys_to_free[i]; } keys_to_free.clear(); - return jvalue_list; } @@ -304,7 +702,20 @@ jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I( JNIEnv* env, jobject jdb, jlong jdb_handle, jobject jkey_list, jint jkeys_count) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), jkey_list, jkeys_count); + rocksdb::ReadOptions(), jkey_list, jkeys_count, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: (JLjava/util/List;ILjava/util/List;)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2ILjava_util_List_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobject jkey_list, jint jkeys_count, jobject jcfhandle_list) { + return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + rocksdb::ReadOptions(), jkey_list, jkeys_count, jcfhandle_list); } /* @@ -317,7 +728,22 @@ jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I( jlong jropt_handle, jobject jkey_list, jint jkeys_count) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), jkey_list, - jkeys_count); + jkeys_count, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: (JJLjava/util/List;ILjava/util/List;)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2ILjava_util_List_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jropt_handle, jobject jkey_list, jint jkeys_count, + jobject jcfhandle_list) { + return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), jkey_list, + jkeys_count, jcfhandle_list); } /* @@ -331,10 
+757,32 @@ jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( jbyteArray jvalue, jint jvalue_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), + rocksdb::ReadOptions(), nullptr, jkey, jkey_len, jvalue, jvalue_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BI[BIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, + jkey, jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } +} + /* * Class: org_rocksdb_RocksDB * Method: get @@ -347,19 +795,46 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI( return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), - jkey, jkey_len, jvalue, jvalue_len); + nullptr, jkey, jkey_len, jvalue, jvalue_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BI[BIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, + jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } +} ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Delete() void rocksdb_remove_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - jbyteArray jkey, jint jkey_len) { + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Status s = db->Delete(write_options, key_slice); - + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Delete(write_options, cf_handle, key_slice); + } else { + // backwards compatibility + s = db->Delete(write_options, key_slice); + } // trigger java unref on key and value. // by passing JNI_ABORT, it will simply release the reference without // copying the result back to the java byte array. 
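(Aside, not part of the patch hunks: the column-family-aware JNI overloads above back the Java API exercised by ColumnFamilyTest earlier in this series. A minimal Java-side usage sketch follows; the class name and database path are hypothetical, error handling is elided, and only calls that appear in ColumnFamilyTest are used.)

    import java.util.ArrayList;
    import java.util.List;
    import org.rocksdb.*;

    public class ColumnFamilyUsageSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(String[] args) throws RocksDBException {
        Options options = new Options().setCreateIfMissing(true);
        // The tests in this patch always list "default" as the first column family.
        List<String> cfNames = new ArrayList<String>();
        cfNames.add("default");
        cfNames.add("new_cf");
        List<ColumnFamilyHandle> cfHandles = new ArrayList<ColumnFamilyHandle>();

        RocksDB db = RocksDB.open(options, "/tmp/cf_usage_sketch", cfNames, cfHandles);
        // put/get/remove scoped to "new_cf" through its handle.
        db.put(cfHandles.get(1), "k".getBytes(), "v".getBytes());
        byte[] value = db.get(cfHandles.get(1), "k".getBytes());
        assert(new String(value).equals("v"));
        db.remove(cfHandles.get(1), "k".getBytes());

        // Dispose column family handles before closing, then free the options.
        for (ColumnFamilyHandle handle : cfHandles) {
          handle.dispose();
        }
        db.close();
        options.dispose();
      }
    }
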
@@ -382,24 +857,63 @@ void Java_org_rocksdb_RocksDB_remove__J_3BI( auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); + rocksdb_remove_helper(env, db, default_write_options, nullptr, + jkey, jkey_len); +} - rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len); +/* + * Class: org_rocksdb_RocksDB + * Method: remove + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_RocksDB_remove__J_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_remove_helper(env, db, default_write_options, cf_handle, + jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } /* * Class: org_rocksdb_RocksDB * Method: remove - * Signature: (JJ[BI)V + * Signature: (JJ[BIJ)V */ void Java_org_rocksdb_RocksDB_remove__JJ_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, jint jkey_len) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast(jwrite_options); - - rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len); + rocksdb_remove_helper(env, db, *write_options, nullptr, jkey, jkey_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: remove + * Signature: (JJ[BIJ)V + */ +void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jwrite_options, jbyteArray jkey, jint jkey_len, + jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast(jwrite_options); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_remove_helper(env, db, *write_options, cf_handle, jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::~DB() @@ -418,19 +932,111 @@ void Java_org_rocksdb_RocksDB_disposeInternal( * Method: iterator0 * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_iterator0( +jlong Java_org_rocksdb_RocksDB_iterator0__J( JNIEnv* env, jobject jdb, jlong db_handle) { auto db = reinterpret_cast(db_handle); rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions()); return reinterpret_cast(iterator); } +/* + * Class: org_rocksdb_RocksDB + * Method: iterator0 + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_RocksDB_iterator0__JJ( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions(), + cf_handle); + return reinterpret_cast(iterator); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iterators + * Signature: (JLjava/util/List;)[J + */ +jlongArray Java_org_rocksdb_RocksDB_iterators( + JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list) { + auto db = reinterpret_cast(db_handle); + std::vector cf_handles; + std::vector iterators; + + if (jcfhandle_list != nullptr) { + // get cf iterator + jobject cfIteratorObj = env->CallObjectMethod( + jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over keys and 
convert java byte array to slice + while (env->CallBooleanMethod( + cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jobject jobj = (jbyteArray) env->CallObjectMethod( + cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); + rocksdb::ColumnFamilyHandle* cfHandle = + rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); + cf_handles.push_back(cfHandle); + } + } + + rocksdb::Status s = db->NewIterators(rocksdb::ReadOptions(), + cf_handles, &iterators); + if (s.ok()) { + jlongArray jLongArray = env->NewLongArray(iterators.size()); + for (std::vector::size_type i = 0; + i < iterators.size(); i++) { + env->SetLongArrayRegion(jLongArray, i, 1, + reinterpret_cast(&iterators[i])); + } + return jLongArray; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return env->NewLongArray(0); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: createColumnFamily + * Signature: (JLjava/lang/String;)J; + */ +jlong Java_org_rocksdb_RocksDB_createColumnFamily( + JNIEnv* env, jobject jdb, jlong jdb_handle, jstring jcfname) { + rocksdb::ColumnFamilyHandle* handle; + const char* cfname = env->GetStringUTFChars(jcfname, 0); + auto db_handle = reinterpret_cast(jdb_handle); + rocksdb::Status s = db_handle->CreateColumnFamily( + rocksdb::ColumnFamilyOptions(), cfname, &handle); + env->ReleaseStringUTFChars(jcfname, cfname); + + if (s.ok()) { + return reinterpret_cast(handle); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: dropColumnFamily + * Signature: (JJ)V; + */ +void Java_org_rocksdb_RocksDB_dropColumnFamily( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle) { + auto cf_handle = reinterpret_cast(jcf_handle); + auto db_handle = reinterpret_cast(jdb_handle); + rocksdb::Status s = db_handle->DropColumnFamily(cf_handle); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_RocksDB * Method: getProperty0 * Signature: (JLjava/lang/String;I)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getProperty0( +jstring Java_org_rocksdb_RocksDB_getProperty0__JLjava_lang_String_2I( JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, jint jproperty_len) { auto db = reinterpret_cast(db_handle); @@ -448,3 +1054,28 @@ jstring Java_org_rocksdb_RocksDB_getProperty0( return env->NewStringUTF(property_value.data()); } + +/* + * Class: org_rocksdb_RocksDB + * Method: getProperty0 + * Signature: (JJLjava/lang/String;I)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + std::string property_value; + bool retCode = db->GetProperty(cf_handle, property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + + return env->NewStringUTF(property_value.data()); +} diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index a51cfce12..10937db14 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -60,14 +60,13 @@ void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { } /* - * Class: org_rocksdb_WriteBatch - * Method: put - * Signature: 
([BI[BI)V + * Helper for WriteBatch put operations */ -void Java_org_rocksdb_WriteBatch_put( +void write_batch_put_helper( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jvalue, jint jvalue_len, + rocksdb::ColumnFamilyHandle* cf_handle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -75,20 +74,51 @@ void Java_org_rocksdb_WriteBatch_put( jbyte* value = env->GetByteArrayElements(jvalue, nullptr); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - wb->Put(key_slice, value_slice); + if (cf_handle != nullptr) { + wb->Put(cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + wb->Put(key_slice, value_slice); + } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); } /* * Class: org_rocksdb_WriteBatch - * Method: merge + * Method: put * Signature: ([BI[BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge( +void Java_org_rocksdb_WriteBatch_put___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jvalue, jint jvalue_len) { + write_batch_put_helper(env, jobj, jkey, jkey_len, jvalue, + jvalue_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: put + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto cf_handle = reinterpret_cast(jcf_handle); + write_batch_put_helper(env, jobj, jkey, jkey_len, jvalue, + jvalue_len, cf_handle); +} + +/* + * Helper for write batch merge operations + */ +void write_batch_merge_helper( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, + rocksdb::ColumnFamilyHandle* cf_handle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -96,28 +126,86 @@ JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge( jbyte* value = env->GetByteArrayElements(jvalue, nullptr); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - wb->Merge(key_slice, value_slice); + if (cf_handle != nullptr) { + wb->Merge(cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + wb->Merge(key_slice, value_slice); + } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); } /* * Class: org_rocksdb_WriteBatch - * Method: remove - * Signature: ([BI)V + * Method: merge + * Signature: ([BI[BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove( +void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len) { + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + write_batch_merge_helper(env, jobj, jkey, jkey_len, + jvalue, jvalue_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: merge + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto cf_handle = reinterpret_cast(jcf_handle); + write_batch_merge_helper(env, jobj, jkey, jkey_len, + jvalue, jvalue_len, cf_handle); +} + +/* + * Helper for write batch remove operations + */ +void 
write_batch_remove_helper( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + rocksdb::ColumnFamilyHandle* cf_handle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); jbyte* key = env->GetByteArrayElements(jkey, nullptr); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - wb->Delete(key_slice); + if (cf_handle != nullptr) { + wb->Delete(cf_handle, key_slice); + } else { + wb->Delete(key_slice); + } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); } +/* + * Class: org_rocksdb_WriteBatch + * Method: remove + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatch_remove___3BI( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len) { + write_batch_remove_helper(env, jobj, jkey, jkey_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: remove + * Signature: ([BIJ)V + */ +void Java_org_rocksdb_WriteBatch_remove___3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto cf_handle = reinterpret_cast(jcf_handle); + write_batch_remove_helper(env, jobj, jkey, jkey_len, cf_handle); +} + /* * Class: org_rocksdb_WriteBatch * Method: putLogData From ee28f431d09c9d144f5ff7225b7b5ece12f5311d Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 13 Oct 2014 21:12:42 +0200 Subject: [PATCH 250/829] With the last commit a Test was accidentally disabled. This commit solves this. --- java/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/Makefile b/java/Makefile index 9c75c54ea..ef8ccbae4 100644 --- a/java/Makefile +++ b/java/Makefile @@ -42,7 +42,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest - #java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest From cc6c883f59bbca6008362bc10b2f44c8a5c570d7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 13 Oct 2014 14:25:55 -0700 Subject: [PATCH 251/829] Stop stopping writes on bg_error_ Summary: This might have caused https://github.com/facebook/rocksdb/issues/345. If we're stopping writes and bg_error comes along, we will never unblock the write. 
Test Plan: compiles Reviewers: ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24807 --- db/db_impl.cc | 10 ++++++---- db/db_impl.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 5abfb4ac2..dc9399fd8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4036,7 +4036,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (UNLIKELY(status.ok()) && (write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) { - DelayWrite(expiration_time); + status = DelayWrite(expiration_time); } if (UNLIKELY(status.ok() && has_timeout && @@ -4151,7 +4151,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -void DBImpl::DelayWrite(uint64_t expiration_time) { +Status DBImpl::DelayWrite(uint64_t expiration_time) { StopWatch sw(env_, stats_, WRITE_STALL); bool has_timeout = (expiration_time > 0); auto delay = write_controller_.GetDelay(); @@ -4161,16 +4161,18 @@ void DBImpl::DelayWrite(uint64_t expiration_time) { mutex_.Lock(); } - while (write_controller_.IsStopped()) { + while (bg_error_.ok() && write_controller_.IsStopped()) { if (has_timeout) { bg_cv_.TimedWait(expiration_time); if (env_->NowMicros() > expiration_time) { - break; + return Status::TimedOut(); } } else { bg_cv_.Wait(); } } + + return bg_error_; } Status DBImpl::ScheduleFlushes(WriteContext* context) { diff --git a/db/db_impl.h b/db/db_impl.h index 622df4293..149958315 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -367,7 +367,7 @@ class DBImpl : public DB { const autovector& mems, VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); - void DelayWrite(uint64_t expiration_time); + Status DelayWrite(uint64_t expiration_time); Status ScheduleFlushes(WriteContext* context); From a40ce219b9d3009331f4984529d92b499b03d454 Mon Sep 17 00:00:00 2001 From: Vlad Balan Date: Tue, 16 Sep 2014 13:58:49 -0700 Subject: [PATCH 252/829] Adding merge functions to RocksDBJava Summary: Added support for the merge operation to RocksJava. You can specify a merge function to be used on the current database. The merge function can either be one of the functions defined in utilities/merge_operators.h, which can be specified through its corresponding name, or a user-created function that needs to be encapsulated in a JNI object in order to be used. Examples are provided for both use cases. 
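For illustration, a minimal sketch of the name-based configuration, closely following the MergeTest added in this patch (the class name and database path are illustrative and error handling is elided):

    import org.rocksdb.*;

    public class MergeByNameSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(String[] args) throws RocksDBException {
        Options opt = new Options().setCreateIfMissing(true);
        // Select the built-in string append operator by its registered name.
        opt.setMergeOperatorName("stringappend");

        RocksDB db = RocksDB.open(opt, "/tmp/merge_by_name_sketch");
        db.put("key".getBytes(), "aa".getBytes());
        db.merge("key".getBytes(), "bb".getBytes());
        // The string append operator joins operands with ',', as MergeTest asserts.
        assert(new String(db.get("key".getBytes())).equals("aa,bb"));

        db.close();
        opt.dispose();
      }
    }
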
Test Plan: There are unit test in MergeTest.java Reviewers: ankgup87 Subscribers: vladb38 Differential Revision: https://reviews.facebook.net/D24525 --- java/Makefile | 3 +- java/org/rocksdb/MergeOperator.java | 19 +++++ java/org/rocksdb/Options.java | 34 +++++++++ java/org/rocksdb/RocksDB.java | 33 +++++++- java/org/rocksdb/StringAppendOperator.java | 21 ++++++ java/org/rocksdb/test/MergeTest.java | 88 ++++++++++++++++++++++ java/rocksjni/merge_operator.cc | 35 +++++++++ java/rocksjni/options.cc | 26 +++++++ java/rocksjni/rocksjni.cc | 64 ++++++++++++++++ 9 files changed, 321 insertions(+), 2 deletions(-) create mode 100644 java/org/rocksdb/MergeOperator.java create mode 100644 java/org/rocksdb/StringAppendOperator.java create mode 100644 java/org/rocksdb/test/MergeTest.java create mode 100644 java/rocksjni/merge_operator.cc diff --git a/java/Makefile b/java/Makefile index 9c75c54ea..441238930 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -44,6 +44,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest #java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest @rm -rf /tmp/rocksdbjni_* diff --git a/java/org/rocksdb/MergeOperator.java b/java/org/rocksdb/MergeOperator.java new file mode 100644 index 000000000..310cf7a46 --- /dev/null +++ b/java/org/rocksdb/MergeOperator.java @@ -0,0 +1,19 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.*; + +/** + * MergeOperator holds an operator to be applied when compacting + * two values held under the same key in order to obtain a single + * value. + */ +public abstract class MergeOperator { + + abstract protected long newMergeOperatorHandle(); + +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index bb6f74e08..f34171ea9 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -2234,6 +2234,40 @@ public class Options extends RocksObject { private native void setMinPartialMergeOperands( long handle, int minPartialMergeOperands); + /** + * Set the merge operator to be used for merging two different key/value + * pairs that share the same key. The merge function is invoked during + * compaction and at lookup time, if multiple key/value pairs belonging + * to the same key are found in the database. + * + * @param name the name of the merge function, as defined by + * the MergeOperators factory (see utilities/MergeOperators.h) + * @return the reference to the current option. + */ + public Options setMergeOperatorName(String name) { + setMergeOperatorName(nativeHandle_, name); + return this; + } + private native void setMergeOperatorName( + long handle, String name); + + /** + * Set the merge operator to be used for merging two different key/value + * pairs that share the same key. The merge function is invoked during + * compaction and at lookup time, if multiple key/value pairs belonging + * to the same key are found in the database. + * + * @param name the name of the merge function, as defined by + * the MergeOperators factory (see utilities/MergeOperators.h) + * @return the reference to the current option. + */ + public Options setMergeOperator(MergeOperator mergeOperator) { + setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); + return this; + } + private native void setMergeOperator( + long handle, long mergeOperatorHandle); + /** * Release the memory allocated for the current instance * in the c++ side. diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index d10c235dc..f54088da4 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -473,8 +473,32 @@ public class RocksDB extends RocksObject { } /** - * Get the value associated with the specified key within column family + * Set the database entry for "key" to "value". * + * @param key the specified key to be merged. + * @param value the value to be nerged with the current value for + * the specified key. + */ + public void merge(byte[] key, byte[] value) throws RocksDBException { + merge(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Merge the database entry for "key" with "value". + * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + */ + public void merge(WriteOptions writeOpts, byte[] key, byte[] value) + throws RocksDBException { + merge(nativeHandle_, writeOpts.nativeHandle_, + key, key.length, value, value.length); + } + + + /** + * Get the value associated with the specified key within column family* * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. 
* @return The size of the actual value that matches the specified @@ -1002,6 +1026,13 @@ public class RocksDB extends RocksObject { long cfHandle, StringBuffer stringBuffer); protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, long cfHandle, StringBuffer stringBuffer); + protected native void merge( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native void merge( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; protected native int get( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; diff --git a/java/org/rocksdb/StringAppendOperator.java b/java/org/rocksdb/StringAppendOperator.java new file mode 100644 index 000000000..9b593204f --- /dev/null +++ b/java/org/rocksdb/StringAppendOperator.java @@ -0,0 +1,21 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * MergeOperator holds an operator to be applied when compacting + * two values held under the same key in order to obtain a single + * value. + */ +public class StringAppendOperator extends MergeOperator { + + @Override protected long newMergeOperatorHandle() { + return newMergeOperatorHandleImpl(); + } + + private native long newMergeOperatorHandleImpl(); + +} diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java new file mode 100644 index 000000000..0d3833715 --- /dev/null +++ b/java/org/rocksdb/test/MergeTest.java @@ -0,0 +1,88 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.test; + +import java.util.Collections; +import org.rocksdb.*; + +public class MergeTest { + static final String db_path_string = "/tmp/mergestringjni_db"; + static final String db_path_function = "/tmp/mergefunctionjni_db"; + static { + RocksDB.loadLibrary(); + } + + public static void testStringOption() + throws InterruptedException, RocksDBException { + + System.out.println("Testing merge function string option ==="); + + Options opt = new Options(); + opt.setCreateIfMissing(true); + opt.setMergeOperatorName("stringappend"); + + RocksDB db = RocksDB.open(opt, db_path_string); + + System.out.println("Writing aa under key..."); + db.put("key".getBytes(), "aa".getBytes()); + + System.out.println("Writing bb under key..."); + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + + System.out.println("Retrieved value: " + strValue); + + db.close(); + opt.dispose(); + + assert(strValue.equals("aa,bb")); + + System.out.println("Merge function string option passed!"); + + } + + public static void testOperatorOption() + throws InterruptedException, RocksDBException { + + System.out.println("Testing merge function operator option ==="); + + Options opt = new Options(); + opt.setCreateIfMissing(true); + + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + + RocksDB db = RocksDB.open(opt, db_path_string); + + System.out.println("Writing aa under key..."); + db.put("key".getBytes(), "aa".getBytes()); + + System.out.println("Writing bb under key..."); + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + + System.out.println("Retrieved value: " + strValue); + + db.close(); + opt.dispose(); + + assert(strValue.equals("aa,bb")); + + System.out.println("Merge function operator option passed!"); + + } + + public static void main(String[] args) + throws InterruptedException, RocksDBException { + testStringOption(); + testOperatorOption(); + + } +} diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc new file mode 100644 index 000000000..fc295e38c --- /dev/null +++ b/java/rocksjni/merge_operator.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for rocksdb::MergeOperator. 
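For orientation, the Java API added above and the JNI bridge below wrap native functionality that, used directly from C++, amounts to the following minimal sketch (illustrative only, not part of the patch; the database path is a placeholder and error handling is omitted):

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "utilities/merge_operators.h"

    int main() {
      // Select the built-in string-append operator by name; this is what
      // Options.setMergeOperatorName("stringappend") resolves to via
      // rocksdb::MergeOperators::CreateFromStringId.
      rocksdb::Options options;
      options.create_if_missing = true;
      options.merge_operator =
          rocksdb::MergeOperators::CreateFromStringId("stringappend");

      rocksdb::DB* db;
      rocksdb::DB::Open(options, "/tmp/merge_example_db", &db);  // placeholder path
      db->Put(rocksdb::WriteOptions(), "key", "aa");
      db->Merge(rocksdb::WriteOptions(), "key", "bb");   // add a merge operand
      std::string value;
      db->Get(rocksdb::ReadOptions(), "key", &value);    // "aa,bb" with string append
      delete db;
      return 0;
    }
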
+ +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_StringAppendOperator.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +/* + * Class: org_rocksdb_StringAppendOperator + * Method: newMergeOperatorHandle + * Signature: ()J + */ +jlong Java_org_rocksdb_StringAppendOperator_newMergeOperatorHandleImpl(JNIEnv* env, jobject jobj) { + std::shared_ptr *op = new std::shared_ptr(); + *op = rocksdb::MergeOperators::CreateFromStringId("stringappend"); + return reinterpret_cast(op); +} + diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index ef104d92b..a52e2da70 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -23,6 +23,8 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/comparator.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" /* * Class: org_rocksdb_Options @@ -1603,6 +1605,29 @@ void Java_org_rocksdb_Options_setMinPartialMergeOperands( static_cast(jmin_partial_merge_operands); } +/* + * Class: org_rocksdb_Options + * Method: setMergeOperatorName + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperatorName( + JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { + const char* op_name = env->GetStringUTFChars(name, 0); + reinterpret_cast(jhandle)->merge_operator = + rocksdb::MergeOperators::CreateFromStringId(op_name); +} + +/* + * Class: org_rocksdb_Options + * Method: setMergeOperator + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + reinterpret_cast(jhandle)->merge_operator = + *(reinterpret_cast*> (mergeOperatorHandle)); +} + ////////////////////////////////////////////////////////////////////////////// // WriteOptions @@ -1759,3 +1784,4 @@ void Java_org_rocksdb_ReadOptions_setTailing( reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } + diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index fa9a66a7d..f5e702520 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -914,6 +914,70 @@ void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ( rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Merge + +void rocksdb_merge_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + + jbyte* key = env->GetByteArrayElements(jkey, 0); + jbyte* value = env->GetByteArrayElements(jvalue, 0); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + + rocksdb::Status s = db->Merge(write_options, key_slice, value_slice); + + // trigger java unref on key and value. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + + rocksdb_merge_helper(env, db, default_write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BI( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + + rocksdb_merge_helper(env, db, *write_options, + jkey, jkey_len, + jvalue, jvalue_len); +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::~DB() From 2ef3ed86f3926f46c533572532b7369a283209e6 Mon Sep 17 00:00:00 2001 From: Vlad Balan Date: Mon, 6 Oct 2014 16:58:24 -0700 Subject: [PATCH 253/829] Integrated feedback from ankgup87 Test Plan: tested using unit tests Reviewers: ankgup87 Differential Revision: https://reviews.facebook.net/D24573 --- java/org/rocksdb/MergeOperator.java | 8 +- java/org/rocksdb/Options.java | 10 +- java/org/rocksdb/RocksDB.java | 5 +- java/org/rocksdb/StringAppendOperator.java | 16 ++-- java/org/rocksdb/test/MergeTest.java | 101 ++++++++++----------- java/rocksjni/merge_operator.cc | 10 +- java/rocksjni/options.cc | 6 +- 7 files changed, 76 insertions(+), 80 deletions(-) diff --git a/java/org/rocksdb/MergeOperator.java b/java/org/rocksdb/MergeOperator.java index 310cf7a46..aaf44d07c 100644 --- a/java/org/rocksdb/MergeOperator.java +++ b/java/org/rocksdb/MergeOperator.java @@ -9,11 +9,9 @@ import java.util.*; /** * MergeOperator holds an operator to be applied when compacting - * two values held under the same key in order to obtain a single + * two merge operands held under the same key in order to obtain a single * value. */ -public abstract class MergeOperator { - - abstract protected long newMergeOperatorHandle(); - +public interface MergeOperator { + public long newMergeOperatorHandle(); } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index f34171ea9..586585a35 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -2235,13 +2235,16 @@ public class Options extends RocksObject { long handle, int minPartialMergeOperands); /** - * Set the merge operator to be used for merging two different key/value - * pairs that share the same key. The merge function is invoked during + * Set the merge operator to be used for merging two merge operands + * of the same key. The merge function is invoked during * compaction and at lookup time, if multiple key/value pairs belonging * to the same key are found in the database. * * @param name the name of the merge function, as defined by * the MergeOperators factory (see utilities/MergeOperators.h) + * The merge function is specified by name and must be one of the + * standard merge operators provided by RocksDB. 
The available + * operators are "put", "uint64add", "stringappend" and "stringappendtest". * @return the reference to the current option. */ public Options setMergeOperatorName(String name) { @@ -2257,8 +2260,7 @@ public class Options extends RocksObject { * compaction and at lookup time, if multiple key/value pairs belonging * to the same key are found in the database. * - * @param name the name of the merge function, as defined by - * the MergeOperators factory (see utilities/MergeOperators.h) + * @param a {@link MergeOperator} object * @return the reference to the current option. */ public Options setMergeOperator(MergeOperator mergeOperator) { diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index f54088da4..f6f0d09aa 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -473,7 +473,7 @@ public class RocksDB extends RocksObject { } /** - * Set the database entry for "key" to "value". + * Add merge operand for key/value pair. * * @param key the specified key to be merged. * @param value the value to be nerged with the current value for @@ -484,8 +484,9 @@ public class RocksDB extends RocksObject { } /** - * Merge the database entry for "key" with "value". + * Add merge operand for key/value pair. * + * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. diff --git a/java/org/rocksdb/StringAppendOperator.java b/java/org/rocksdb/StringAppendOperator.java index 9b593204f..52cd43e79 100644 --- a/java/org/rocksdb/StringAppendOperator.java +++ b/java/org/rocksdb/StringAppendOperator.java @@ -6,16 +6,12 @@ package org.rocksdb; /** - * MergeOperator holds an operator to be applied when compacting - * two values held under the same key in order to obtain a single - * value. + * StringAppendOperator is a merge operator that concatenates + * two strings. 
*/ -public class StringAppendOperator extends MergeOperator { - - @Override protected long newMergeOperatorHandle() { - return newMergeOperatorHandleImpl(); - } - +public class StringAppendOperator implements MergeOperator { + @Override public long newMergeOperatorHandle() { + return newMergeOperatorHandleImpl(); + } private native long newMergeOperatorHandleImpl(); - } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 0d3833715..f6acff9b2 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -9,80 +9,77 @@ import java.util.Collections; import org.rocksdb.*; public class MergeTest { - static final String db_path_string = "/tmp/mergestringjni_db"; - static final String db_path_function = "/tmp/mergefunctionjni_db"; - static { - RocksDB.loadLibrary(); - } + static final String db_path_string = "/tmp/mergestringjni_db"; + static final String db_path_function = "/tmp/mergefunctionjni_db"; + static { + RocksDB.loadLibrary(); + } - public static void testStringOption() - throws InterruptedException, RocksDBException { + public static void testStringOption() + throws InterruptedException, RocksDBException { - System.out.println("Testing merge function string option ==="); + System.out.println("Testing merge function string option ==="); - Options opt = new Options(); - opt.setCreateIfMissing(true); - opt.setMergeOperatorName("stringappend"); + Options opt = new Options(); + opt.setCreateIfMissing(true); + opt.setMergeOperatorName("stringappend"); - RocksDB db = RocksDB.open(opt, db_path_string); + RocksDB db = RocksDB.open(opt, db_path_string); - System.out.println("Writing aa under key..."); - db.put("key".getBytes(), "aa".getBytes()); + System.out.println("Writing aa under key..."); + db.put("key".getBytes(), "aa".getBytes()); - System.out.println("Writing bb under key..."); - db.merge("key".getBytes(), "bb".getBytes()); + System.out.println("Writing bb under key..."); + db.merge("key".getBytes(), "bb".getBytes()); - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); - System.out.println("Retrieved value: " + strValue); + System.out.println("Retrieved value: " + strValue); - db.close(); - opt.dispose(); + db.close(); + opt.dispose(); - assert(strValue.equals("aa,bb")); + assert(strValue.equals("aa,bb")); - System.out.println("Merge function string option passed!"); + System.out.println("Merge function string option passed!"); + } - } + public static void testOperatorOption() + throws InterruptedException, RocksDBException { - public static void testOperatorOption() - throws InterruptedException, RocksDBException { + System.out.println("Testing merge function operator option ==="); - System.out.println("Testing merge function operator option ==="); + Options opt = new Options(); + opt.setCreateIfMissing(true); - Options opt = new Options(); - opt.setCreateIfMissing(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); + RocksDB db = RocksDB.open(opt, db_path_string); - RocksDB db = RocksDB.open(opt, db_path_string); + System.out.println("Writing aa under key..."); + db.put("key".getBytes(), "aa".getBytes()); - System.out.println("Writing aa under key..."); - db.put("key".getBytes(), "aa".getBytes()); + 
System.out.println("Writing bb under key..."); + db.merge("key".getBytes(), "bb".getBytes()); - System.out.println("Writing bb under key..."); - db.merge("key".getBytes(), "bb".getBytes()); + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); + System.out.println("Retrieved value: " + strValue); - System.out.println("Retrieved value: " + strValue); + db.close(); + opt.dispose(); - db.close(); - opt.dispose(); + assert(strValue.equals("aa,bb")); - assert(strValue.equals("aa,bb")); + System.out.println("Merge function operator option passed!"); + } - System.out.println("Merge function operator option passed!"); - - } - - public static void main(String[] args) - throws InterruptedException, RocksDBException { - testStringOption(); - testOperatorOption(); - - } + public static void main(String[] args) + throws InterruptedException, RocksDBException { + testStringOption(); + testOperatorOption(); + } } diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc index fc295e38c..68fe9b635 100644 --- a/java/rocksjni/merge_operator.cc +++ b/java/rocksjni/merge_operator.cc @@ -3,7 +3,8 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -// This file implements the "bridge" between Java and C++ for rocksdb::MergeOperator. +// This file implements the "bridge" between Java and C++ +// for rocksdb::MergeOperator. #include #include @@ -27,9 +28,10 @@ * Method: newMergeOperatorHandle * Signature: ()J */ -jlong Java_org_rocksdb_StringAppendOperator_newMergeOperatorHandleImpl(JNIEnv* env, jobject jobj) { - std::shared_ptr *op = new std::shared_ptr(); +jlong Java_org_rocksdb_StringAppendOperator_newMergeOperatorHandleImpl +(JNIEnv* env, jobject jobj) { + std::shared_ptr *op = + new std::shared_ptr(); *op = rocksdb::MergeOperators::CreateFromStringId("stringappend"); return reinterpret_cast(op); } - diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index a52e2da70..1a43c4966 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1623,9 +1623,10 @@ void Java_org_rocksdb_Options_setMergeOperatorName( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_Options_setMergeOperator( - JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { reinterpret_cast(jhandle)->merge_operator = - *(reinterpret_cast*> (mergeOperatorHandle)); + *(reinterpret_cast*> + (mergeOperatorHandle)); } ////////////////////////////////////////////////////////////////////////////// @@ -1784,4 +1785,3 @@ void Java_org_rocksdb_ReadOptions_setTailing( reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } - From 1b97934a2c0fa07c01a26ff78b191aeb84ee5c8e Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 14 Oct 2014 07:33:57 +0200 Subject: [PATCH 254/829] Options correction --- java/org/rocksdb/Options.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index bb6f74e08..e0261ff4f 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -112,7 +112,7 @@ public class Options extends RocksObject { */ public boolean createMissingColumnFamilies() { assert(isInitialized()); - return createIfMissing(nativeHandle_); + return createMissingColumnFamilies(nativeHandle_); } /** From 
5cc9adf5ba7553ca7a8353068bdd68767739197a Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 14 Oct 2014 19:46:19 -0700 Subject: [PATCH 255/829] WriteBatchWithIndex's Iterator bug of SeekToFirst() and SeekToLast() Summary: WriteBatchWithIndex's iterator's SeekToFirst() and SeekToLast() use offset=0 to indicate it is smaller than all the keys, which is wrong. offset=0 will decode a key "" (the header decodes like that). It could be larger than other keys in non-default comparators. Fix it by using a special flag of offset to indicate searching to the beginning of the CF. Test Plan: Add a unit test that used to fail. Also, add some more tests to related cases, though they don't fail for now. Reviewers: igor Reviewed By: igor Subscribers: rven, yhchiang, ljin, leveldb Differential Revision: https://reviews.facebook.net/D24873 --- .../write_batch_with_index.cc | 16 +++- .../write_batch_with_index_test.cc | 81 +++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index adfa5b324..4ba063a06 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -304,6 +304,10 @@ struct WriteBatchIndexEntry { WriteBatchIndexEntry(const Slice* sk, uint32_t c) : offset(0), column_family(c), search_key(sk) {} + // If this flag appears in the offset, it indicates a key that is smaller + // than any other entry for the same column family + static const size_t kFlagMin = std::numeric_limits::max(); + size_t offset; // offset of an entry in write batch's string buffer. uint32_t column_family; // column family of the entry const Slice* search_key; // if not null, instead of reading keys from @@ -354,14 +358,16 @@ class WBWIIteratorImpl : public WBWIIterator { virtual void SeekToFirst() { valid_ = true; - WriteBatchIndexEntry search_entry(nullptr, column_family_id_); + WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin, + column_family_id_); skip_list_iter_.Seek(&search_entry); ReadEntry(); } virtual void SeekToLast() { valid_ = true; - WriteBatchIndexEntry search_entry(nullptr, column_family_id_ + 1); + WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin, + column_family_id_ + 1); skip_list_iter_.Seek(&search_entry); if (!skip_list_iter_.Valid()) { skip_list_iter_.SeekToLast(); @@ -636,6 +642,12 @@ int WriteBatchEntryComparator::operator()( return -1; } + if (entry1->offset == WriteBatchIndexEntry::kFlagMin) { + return -1; + } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) { + return 1; + } + Status s; Slice key1, key2; if (entry1->search_key == nullptr) { diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 32b45e339..8667079d3 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -522,7 +522,18 @@ TEST(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { Random rnd(rand_seed); ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + if (rand_seed % 2 == 0) { + batch.Put(&cf2, "zoo", "bar"); + } + if (rand_seed % 4 == 1) { + batch.Put(&cf3, "zoo", "bar"); + } + KVMap map; KVMap 
merged_map; for (auto key : source_strings) { @@ -619,6 +630,7 @@ TEST(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); WriteBatchWithIndex batch(BytewiseComparator(), 20, true); { @@ -659,7 +671,21 @@ TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { AssertIter(iter.get(), "a", "aa"); } + // Test the case that there is one element in the write batch + batch.Put(&cf2, "zoo", "bar"); batch.Put(&cf1, "a", "aa"); + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + batch.Delete(&cf1, "b"); batch.Put(&cf1, "c", "cc"); batch.Put(&cf1, "d", "dd"); @@ -725,6 +751,7 @@ TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { iter->Next(); AssertIter(iter.get(), "f", "ff"); } + { KVMap empty_map; std::unique_ptr iter( @@ -763,6 +790,60 @@ TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { AssertIter(iter.get(), "c", "cc"); } } + +TEST(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) { + ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + // Test the case that there is one element in the write batch + batch.Put(&cf2, "zoo", "bar"); + batch.Put(&cf1, "a", "aa"); + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + batch.Put(&cf1, "c", "cc"); + { + KVMap map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } +} + } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From ca250d71a1bffd5efdd21601e20fceea2aaaccff Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 15 Oct 2014 10:56:50 -0700 Subject: [PATCH 256/829] Move logging out of mutex Summary: As title Test Plan: compiles Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24897 --- db/db_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index dc9399fd8..259247785 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4234,6 +4234,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, new_superversion = new SuperVersion(); } } + Log(db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 "\n", + cfd->GetName().c_str(), new_log_number); mutex_.Lock(); if (!s.ok()) { // how do we fail if we're not creating new log? 
@@ -4266,9 +4269,6 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(db_options_.info_log, - "[%s] New memtable created with log file: #%" PRIu64 "\n", - cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options)); return s; From bafbc23baa6a1bd2aac6575d66a89379278b815a Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 14 Oct 2014 18:38:50 +0200 Subject: [PATCH 257/829] Filters getting disposed by System.gc before EOL Previous to this commit Filters passed as parameters to the BlockTableConfig are disposed before they should be disposed. Further Smart pointer usage was corrected. Java holds now the smart pointer to the FilterPolicy correctly and cares about freeing underlying c++ structures. --- java/org/rocksdb/Options.java | 6 ++++++ java/org/rocksdb/test/FilterTest.java | 17 ++++++++++++++--- java/rocksjni/filter.cc | 16 +++++++++++----- java/rocksjni/portal.h | 8 +++++--- java/rocksjni/table.cc | 6 ++++-- 5 files changed, 40 insertions(+), 13 deletions(-) diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index e0261ff4f..741404e40 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -1154,6 +1154,7 @@ public class Options extends RocksObject { */ public Options setMemTableConfig(MemTableConfig config) throws RocksDBException { + memTableConfig_ = config; setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; } @@ -1168,6 +1169,7 @@ public class Options extends RocksObject { * @throws RocksDBException */ public Options setRateLimiterConfig(RateLimiterConfig config) { + rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; } @@ -1191,6 +1193,7 @@ public class Options extends RocksObject { * @return the reference of the current Options. */ public Options setTableFormatConfig(TableFormatConfig config) { + tableFormatConfig_ = config; setTableFactory(nativeHandle_, config.newTableFactoryHandle()); return this; } @@ -2280,4 +2283,7 @@ public class Options extends RocksObject { long cacheSize_; int numShardBits_; RocksEnv env_; + MemTableConfig memTableConfig_; + TableFormatConfig tableFormatConfig_; + RateLimiterConfig rateLimiterConfig_; } diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java index 00214d033..fc4fabf56 100644 --- a/java/org/rocksdb/test/FilterTest.java +++ b/java/org/rocksdb/test/FilterTest.java @@ -13,19 +13,30 @@ public class FilterTest { } public static void main(String[] args) { Options options = new Options(); - // test table config without filter + // test table config BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); - options.setTableFormatConfig(blockConfig); + options.setTableFormatConfig(new BlockBasedTableConfig(). 
+ setFilter(new BloomFilter())); options.dispose(); + System.gc(); + System.runFinalization(); // new Bloom filter options = new Options(); blockConfig = new BlockBasedTableConfig(); blockConfig.setFilter(new BloomFilter()); options.setTableFormatConfig(blockConfig); - blockConfig.setFilter(new BloomFilter(10)); + BloomFilter bloomFilter = new BloomFilter(10); + blockConfig.setFilter(bloomFilter); options.setTableFormatConfig(blockConfig); + System.gc(); + System.runFinalization(); blockConfig.setFilter(new BloomFilter(10, false)); options.setTableFormatConfig(blockConfig); + options.dispose(); + options = null; + blockConfig = null; + System.gc(); + System.runFinalization(); System.out.println("Filter test passed"); } } diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index 1b5d368b6..2ce17d499 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -24,9 +24,12 @@ void Java_org_rocksdb_BloomFilter_createNewBloomFilter( JNIEnv* env, jobject jobj, jint bits_per_key, jboolean use_block_base_builder) { - const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key, - use_block_base_builder); - rocksdb::FilterJni::setHandle(env, jobj, fp); + rocksdb::FilterPolicy* fp = const_cast( + rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder)); + std::shared_ptr *pFilterPolicy = + new std::shared_ptr; + *pFilterPolicy = std::shared_ptr(fp); + rocksdb::FilterJni::setHandle(env, jobj, pFilterPolicy); } /* @@ -35,6 +38,9 @@ void Java_org_rocksdb_BloomFilter_createNewBloomFilter( * Signature: (J)V */ void Java_org_rocksdb_Filter_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - delete reinterpret_cast(handle); + JNIEnv* env, jobject jobj, jlong jhandle) { + + std::shared_ptr *handle = + reinterpret_cast *>(jhandle); + handle->reset(); } diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 14b2cb98a..8300a6e66 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -313,14 +313,16 @@ class FilterJni { } // Get the pointer to rocksdb::FilterPolicy. - static rocksdb::FilterPolicy* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( + static std::shared_ptr* getHandle( + JNIEnv* env, jobject jobj) { + return reinterpret_cast + *>( env->GetLongField(jobj, getHandleFieldID(env))); } // Pass the rocksdb::FilterPolicy pointer to the java side. static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::FilterPolicy* op) { + JNIEnv* env, jobject jobj, std::shared_ptr* op) { env->SetLongField( jobj, getHandleFieldID(env), reinterpret_cast(op)); diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 846526292..1582900f3 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -56,8 +56,10 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_restart_interval = block_restart_interval; options.whole_key_filtering = whole_key_filtering; if (jfilterPolicy > 0) { - options.filter_policy.reset( - reinterpret_cast(jfilterPolicy)); + std::shared_ptr *pFilterPolicy = + reinterpret_cast *>( + jfilterPolicy); + options.filter_policy = *pFilterPolicy; } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; options.hash_index_allow_collision = hash_index_allow_collision; From 6a150c0118c434bc43a2b719e2aa5f0652d1cb0a Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 13 Oct 2014 16:08:02 -0700 Subject: [PATCH 258/829] ldb: support --fix_prefix_len Summary: ldb to support --fix_prefix_len to allow us to verify more cases. 
Also fix a small issue that --bloom_bits might not be applied if --block_size is not given. Test Plan: run ldb tool against an example DB. Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24819 --- util/ldb_cmd.cc | 21 ++++++++++++++++++++- util/ldb_cmd.h | 8 +++++--- util/ldb_tool.cc | 1 + 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 8eda39bf9..70f0c6a94 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -40,6 +40,7 @@ const string LDBCommand::ARG_FROM = "from"; const string LDBCommand::ARG_TO = "to"; const string LDBCommand::ARG_MAX_KEYS = "max_keys"; const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; @@ -221,9 +222,11 @@ Options LDBCommand::PrepareOptionsForOpenDB() { map::const_iterator itr; BlockBasedTableOptions table_options; + bool use_table_options = false; int bits; if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { if (bits > 0) { + use_table_options = true; table_options.filter_policy.reset(NewBloomFilterPolicy(bits)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + @@ -234,14 +237,18 @@ Options LDBCommand::PrepareOptionsForOpenDB() { int block_size; if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { if (block_size > 0) { + use_table_options = true; table_options.block_size = block_size; - opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + " must be > 0."); } } + if (use_table_options) { + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + itr = option_map_.find(ARG_AUTO_COMPACTION); if (itr != option_map_.end()) { opt.disable_auto_compactions = ! StringToBool(itr->second); @@ -294,6 +301,18 @@ Options LDBCommand::PrepareOptionsForOpenDB() { opt.db_paths.emplace_back(db_path_, std::numeric_limits::max()); } + int fix_prefix_len; + if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len, + exec_state_)) { + if (fix_prefix_len > 0) { + opt.prefix_extractor.reset( + NewFixedPrefixTransform(static_cast(fix_prefix_len))); + } else { + exec_state_ = + LDBCommandExecuteResult::FAILED(ARG_FIX_PREFIX_LEN + " must be > 0."); + } + } + return opt; } diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 0553fe64a..9ffe0eabc 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -46,6 +46,7 @@ public: static const string ARG_TO; static const string ARG_MAX_KEYS; static const string ARG_BLOOM_BITS; + static const string ARG_FIX_PREFIX_LEN; static const string ARG_COMPRESSION_TYPE; static const string ARG_BLOCK_SIZE; static const string ARG_AUTO_COMPACTION; @@ -284,9 +285,10 @@ protected: * passed in. 
*/ vector BuildCmdLineOptions(vector options) { - vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, - ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, - ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE}; + vector ret = {ARG_DB, ARG_BLOOM_BITS, + ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, + ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, + ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; ret.insert(ret.end(), options.begin(), options.end()); return ret; } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index 271dba350..bb6c8ffca 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -47,6 +47,7 @@ public: " with 'put','get','scan','dump','query','batchput'" " : DB supports ttl and value is internally timestamp-suffixed\n"); ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); + ret.append(" --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=\n"); ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + "=\n"); ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + From dc50a1a59321ee69c5091467e4d0e0e489fcd2a4 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 16 Oct 2014 16:57:59 -0700 Subject: [PATCH 259/829] make max_write_buffer_number dynamic Summary: as title Test Plan: unit test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D24729 --- db/column_family.cc | 7 +- db/db_impl.cc | 142 ++++++++++++++++++++++---------------- db/db_impl.h | 4 ++ db/db_test.cc | 81 +++++++++++++++++++--- util/mutable_cf_options.h | 3 + util/options_helper.cc | 4 +- 6 files changed, 168 insertions(+), 73 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 0beb23c91..37699af21 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -326,13 +326,14 @@ void ColumnFamilyData::RecalculateWriteStallConditions( auto write_controller = column_family_set_->write_controller_; - if (imm()->size() == options_.max_write_buffer_number) { + if (imm()->size() >= mutable_cf_options.max_write_buffer_number) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); Log(ioptions_.info_log, "[%s] Stopping writes because we have %d immutable memtables " - "(waiting for flush)", - name_.c_str(), imm()->size()); + "(waiting for flush), max_write_buffer_number is set to %d", + name_.c_str(), imm()->size(), + mutable_cf_options.max_write_buffer_number); } else if (current_->NumLevelFiles(0) >= mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 259247785..592634600 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1236,9 +1236,12 @@ Status DBImpl::Recover( SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); } + // Initial value + max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { - max_total_in_memory_state_ += cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; } return s; @@ -1803,8 +1806,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, db_directory_.get()); - superversion_to_free = cfd->InstallSuperVersion( - new_superversion, &mutex_, mutable_cf_options); + superversion_to_free = InstallSuperVersion( + cfd, new_superversion, 
mutable_cf_options); new_superversion = nullptr; Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), @@ -1840,10 +1843,10 @@ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { return cfh->cfd()->options()->level0_stop_writes_trigger; } -Status DBImpl::Flush(const FlushOptions& options, +Status DBImpl::Flush(const FlushOptions& flush_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return FlushMemTable(cfh->cfd(), options); + return FlushMemTable(cfh->cfd(), flush_options); } SequenceNumber DBImpl::GetLatestSequenceNumber() const { @@ -1933,7 +1936,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, } Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, - const FlushOptions& options) { + const FlushOptions& flush_options) { Status s; { WriteContext context; @@ -1957,7 +1960,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, write_thread_.ExitWriteThread(&w, &w, s); } - if (s.ok() && options.wait) { + if (s.ok() && flush_options.wait) { // Wait until the compaction completes s = WaitForFlushMemTable(cfd); } @@ -3441,7 +3444,7 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } } // namespace -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, +Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, ColumnFamilyData* cfd, SuperVersion* super_version, Arena* arena) { @@ -3451,11 +3454,11 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); // Collect iterator for mutable mem merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, arena)); + super_version->mem->NewIterator(read_options, arena)); // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); + super_version->imm->AddIterators(read_options, &merge_iter_builder); // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, env_options_, + super_version->current->AddIterators(read_options, env_options_, &merge_iter_builder); internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version); @@ -3468,10 +3471,10 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } -Status DBImpl::Get(const ReadOptions& options, +Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return GetImpl(options, column_family, key, value); + return GetImpl(read_options, column_family, key, value); } // DeletionState gets created and destructed outside of the lock -- we @@ -3488,17 +3491,39 @@ void DBImpl::InstallSuperVersion( ColumnFamilyData* cfd, DeletionState& deletion_state, const MutableCFOptions& mutable_cf_options) { mutex_.AssertHeld(); - // if new_superversion == nullptr, it means somebody already used it - SuperVersion* new_superversion = - (deletion_state.new_superversion != nullptr) ? 
- deletion_state.new_superversion : new SuperVersion(); SuperVersion* old_superversion = - cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options); + InstallSuperVersion(cfd, deletion_state.new_superversion, + mutable_cf_options); deletion_state.new_superversion = nullptr; deletion_state.superversions_to_free.push_back(old_superversion); } -Status DBImpl::GetImpl(const ReadOptions& options, +SuperVersion* DBImpl::InstallSuperVersion( + ColumnFamilyData* cfd, SuperVersion* new_sv, + const MutableCFOptions& mutable_cf_options) { + mutex_.AssertHeld(); + auto* old = cfd->InstallSuperVersion( + new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options); + + // We want to schedule potential flush or compactions since new options may + // have been picked up in this new version. New options may cause flush + // compaction trigger condition to change. + MaybeScheduleFlushOrCompaction(); + + // Update max_total_in_memory_state_ + auto old_memtable_size = 0; + if (old) { + old_memtable_size = old->mutable_cf_options.write_buffer_size * + old->mutable_cf_options.max_write_buffer_number; + } + max_total_in_memory_state_ = + max_total_in_memory_state_ - old_memtable_size + + mutable_cf_options.write_buffer_size * + mutable_cf_options.max_write_buffer_number; + return old; +} + +Status DBImpl::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, stats_, DB_GET); @@ -3508,8 +3533,9 @@ Status DBImpl::GetImpl(const ReadOptions& options, auto cfd = cfh->cfd(); SequenceNumber snapshot; - if (options.snapshot != nullptr) { - snapshot = reinterpret_cast(options.snapshot)->number_; + if (read_options.snapshot != nullptr) { + snapshot = reinterpret_cast( + read_options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } @@ -3535,7 +3561,8 @@ Status DBImpl::GetImpl(const ReadOptions& options, RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(options, lkey, value, &s, &merge_context, value_found); + sv->current->Get(read_options, lkey, value, &s, &merge_context, + value_found); RecordTick(stats_, MEMTABLE_MISS); } @@ -3551,7 +3578,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, } std::vector DBImpl::MultiGet( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { @@ -3577,8 +3604,9 @@ std::vector DBImpl::MultiGet( } mutex_.Lock(); - if (options.snapshot != nullptr) { - snapshot = reinterpret_cast(options.snapshot)->number_; + if (read_options.snapshot != nullptr) { + snapshot = reinterpret_cast( + read_options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } @@ -3621,7 +3649,8 @@ std::vector DBImpl::MultiGet( // Done } else { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->Get(options, lkey, value, &s, &merge_context); + super_version->current->Get(read_options, lkey, value, &s, + &merge_context); } if (s.ok()) { @@ -3659,7 +3688,7 @@ std::vector DBImpl::MultiGet( return stat_list; } -Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { *handle = nullptr; @@ -3674,26 +3703,23 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, uint32_t new_id = 
versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); edit.SetColumnFamily(new_id); edit.SetLogNumber(logfile_number_); - edit.SetComparatorName(options.comparator->Name()); + edit.SetComparatorName(cf_options.comparator->Name()); // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - Options opt(db_options_, options); + Options opt(db_options_, cf_options); Status s = versions_->LogAndApply(nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), - &edit, &mutex_, db_directory_.get(), false, &options); + &edit, &mutex_, db_directory_.get(), false, &cf_options); if (s.ok()) { single_column_family_mode_ = false; auto cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_, - *cfd->GetLatestMutableCFOptions()); + delete InstallSuperVersion(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); Log(db_options_.info_log, "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); - max_total_in_memory_state_ += cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; } else { Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); @@ -3712,7 +3738,6 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); - Status s; { MutexLock l(&mutex_); @@ -3732,8 +3757,9 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { if (s.ok()) { assert(cfd->IsDropped()); - max_total_in_memory_state_ -= cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; Log(db_options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); } else { @@ -3745,14 +3771,14 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { return s; } -bool DBImpl::KeyMayExist(const ReadOptions& options, +bool DBImpl::KeyMayExist(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } - ReadOptions roptions = options; + ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only auto s = GetImpl(roptions, column_family, key, value, value_found); @@ -3941,23 +3967,23 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, } } -Status DBImpl::Delete(const WriteOptions& options, +Status DBImpl::Delete(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& key) { - return DB::Delete(options, column_family, key); + return DB::Delete(write_options, column_family, key); } -Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(&mutex_); w.batch = my_batch; - w.sync = options.sync; - w.disableWAL = options.disableWAL; + w.sync = write_options.sync; + w.disableWAL = write_options.disableWAL; 
w.in_batch_group = false; w.done = false; - w.timeout_hint_us = options.timeout_hint_us; + w.timeout_hint_us = write_options.timeout_hint_us; uint64_t expiration_time = 0; bool has_timeout = false; @@ -3968,7 +3994,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { has_timeout = true; } - if (!options.disableWAL) { + if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); } @@ -4074,13 +4100,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // Record statistics RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count); RecordTick(stats_, BYTES_WRITTEN, WriteBatchInternal::ByteSize(updates)); - if (options.disableWAL) { + if (write_options.disableWAL) { flush_on_destroy_ = true; } PERF_TIMER_STOP(write_pre_and_post_process_time); uint64_t log_size = 0; - if (!options.disableWAL) { + if (!write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); status = log_->AddRecord(log_entry); @@ -4089,7 +4115,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { log_empty_ = false; log_size = log_entry.size(); RecordTick(stats_, WAL_FILE_BYTES, log_size); - if (status.ok() && options.sync) { + if (status.ok() && write_options.sync) { RecordTick(stats_, WAL_FILE_SYNCED); StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); if (db_options_.use_fsync) { @@ -4104,7 +4130,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { status = WriteBatchInternal::InsertInto( updates, column_family_memtables_.get(), - options.ignore_missing_column_families, 0, this, false); + write_options.ignore_missing_column_families, 0, this, false); // A non-OK status here indicates iteration failure (either in-memory // writebatch corruption (very bad), or the client specified invalid // column family). This will later on trigger bg_error_. @@ -4123,7 +4149,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // internal stats default_cf_internal_stats_->AddDBStats( InternalStats::BYTES_WRITTEN, batch_size); - if (!options.disableWAL) { + if (!write_options.disableWAL) { default_cf_internal_stats_->AddDBStats( InternalStats::WAL_FILE_SYNCED, 1); default_cf_internal_stats_->AddDBStats( @@ -4221,8 +4247,8 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
- lfile->SetPreallocationBlockSize(1.1 * - cfd->options()->write_buffer_size); + lfile->SetPreallocationBlockSize( + 1.1 * mutable_cf_options.write_buffer_size); new_log = new log::Writer(std::move(lfile)); } } @@ -4270,7 +4296,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, new_mem->Ref(); cfd->SetMemtable(new_mem); context->superversions_to_free_.push_back( - cfd->InstallSuperVersion(new_superversion, &mutex_, mutable_cf_options)); + InstallSuperVersion(cfd, new_superversion, mutable_cf_options)); return s; } @@ -4616,7 +4642,7 @@ Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, } // Default implementation -- returns not supported status -Status DB::CreateColumnFamily(const ColumnFamilyOptions& options, +Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { return Status::NotSupported(""); @@ -4739,8 +4765,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_, - *cfd->GetLatestMutableCFOptions()); + delete impl->InstallSuperVersion( + cfd, nullptr, *cfd->GetLatestMutableCFOptions()); } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); diff --git a/db/db_impl.h b/db/db_impl.h index 149958315..2d5cfe6c2 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -630,6 +630,10 @@ class DBImpl : public DB { DeletionState& deletion_state, const MutableCFOptions& mutable_cf_options); + SuperVersion* InstallSuperVersion( + ColumnFamilyData* cfd, SuperVersion* new_sv, + const MutableCFOptions& mutable_cf_options); + // Find Super version and reference it. Based on options, it might return // the thread local cached one. 
inline SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd); diff --git a/db/db_test.cc b/db/db_test.cc index f516a488f..862eecda6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8137,7 +8137,7 @@ TEST(DBTest, SimpleWriteTimeoutTest) { options.max_background_flushes = 0; options.max_write_buffer_number = 2; options.max_total_wal_size = std::numeric_limits::max(); - WriteOptions write_opt = WriteOptions(); + WriteOptions write_opt; write_opt.timeout_hint_us = 0; DestroyAndReopen(&options); // fill the two write buffers @@ -8173,7 +8173,7 @@ static void RandomTimeoutWriter(void* arg) { DB* db = state->db; Random rnd(1000 + thread_id); - WriteOptions write_opt = WriteOptions(); + WriteOptions write_opt; write_opt.timeout_hint_us = 500; int timeout_count = 0; int num_keys = kNumKeys * 5; @@ -8558,14 +8558,13 @@ TEST(DBTest, DynamicMemtableOptions) { auto gen_l0_kb = [this](int size) { Random rnd(301); - std::vector values; for (int i = 0; i < size; i++) { - values.push_back(RandomString(&rnd, 1024)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); } dbfull()->TEST_WaitForFlushMemTable(); }; + // Test write_buffer_size gen_l0_kb(64); ASSERT_EQ(NumTableFilesAtLevel(0), 1); ASSERT_TRUE(SizeAtLevel(0) < k64KB + k5KB); @@ -8587,6 +8586,68 @@ TEST(DBTest, DynamicMemtableOptions) { ASSERT_EQ(NumTableFilesAtLevel(0), 2); ASSERT_TRUE(SizeAtLevel(0) < k128KB + k64KB + 2 * k5KB); ASSERT_TRUE(SizeAtLevel(0) > k128KB + k64KB - 2 * k5KB); + + // Test max_write_buffer_number + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + // Start from scratch and disable compaction/flush. Flush can only happen + // during compaction but trigger is pretty high + options.max_background_flushes = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(&options); + + // Put until timeout, bounded by 256 puts. 
We should see timeout at ~128KB + int count = 0; + Random rnd(301); + WriteOptions wo; + wo.timeout_hint_us = 1000; + + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) { + count++; + } + ASSERT_TRUE(count > (128 * 0.9) && count < (128 * 1.1)); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Increase + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_write_buffer_number", "8"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(nullptr, nullptr); + + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { + count++; + } + ASSERT_TRUE(count > (512 * 0.9) && count < (512 * 1.1)); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Decrease + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_write_buffer_number", "4"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(nullptr, nullptr); + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { + count++; + } + ASSERT_TRUE(count > (256 * 0.9) && count < (256 * 1.1)); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } TEST(DBTest, DynamicCompactionOptions) { @@ -8617,10 +8678,8 @@ TEST(DBTest, DynamicCompactionOptions) { auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); - std::vector values; for (int i = 0; i < size; i++) { - values.push_back(RandomString(&rnd, 1024)); - ASSERT_OK(Put(Key(start + stride * i), values[i])); + ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); } dbfull()->TEST_WaitForFlushMemTable(); }; @@ -8666,8 +8725,10 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 128, 56); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) < 1048576 * 1.1); - ASSERT_TRUE(SizeAtLevel(2) < 4 * 1048576 * 1.1); + ASSERT_TRUE(SizeAtLevel(1) > 1048576 * 0.9 && + SizeAtLevel(1) < 1048576 * 1.1); + ASSERT_TRUE(SizeAtLevel(2) > 4 * 1048576 * 0.9 && + SizeAtLevel(2) < 4 * 1048576 * 1.1); // Change multiplier to 2 with smaller base ASSERT_TRUE(dbfull()->SetOptions({ diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 02f63fed4..4336baa12 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -14,6 +14,7 @@ namespace rocksdb { struct MutableCFOptions { MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions) : write_buffer_size(options.write_buffer_size), + max_write_buffer_number(options.max_write_buffer_number), arena_block_size(options.arena_block_size), memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), @@ -39,6 +40,7 @@ struct MutableCFOptions { } MutableCFOptions() : write_buffer_size(0), + max_write_buffer_number(0), arena_block_size(0), memtable_prefix_bloom_bits(0), memtable_prefix_bloom_probes(0), @@ -72,6 +74,7 @@ struct MutableCFOptions { // Memtable related options size_t write_buffer_size; + int max_write_buffer_number; size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; uint32_t memtable_prefix_bloom_probes; diff --git a/util/options_helper.cc b/util/options_helper.cc index 67726dc8f..a58c7c596 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -92,6 +92,8 @@ bool ParseMemtableOptions(const std::string& name, const std::string& value, 
new_options->max_successive_merges = ParseInt64(value); } else if (name == "filter_deletes") { new_options->filter_deletes = ParseBoolean(name, value); + } else if (name == "max_write_buffer_number") { + new_options->max_write_buffer_number = ParseInt(value); } else { return false; } @@ -220,8 +222,6 @@ bool GetColumnFamilyOptionsFromMap( try { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { - } else if (o.first == "max_write_buffer_number") { - new_options->max_write_buffer_number = ParseInt(o.second); } else if (o.first == "min_write_buffer_number_to_merge") { new_options->min_write_buffer_number_to_merge = ParseInt(o.second); } else if (o.first == "compression") { From 065a67c4f060c6ab22029076578338e22bc46cd3 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 16 Oct 2014 17:14:17 -0700 Subject: [PATCH 260/829] dynamic disable_auto_compactions Summary: Add more tests as well Test Plan: unit test Reviewers: igor, sdong, yhchiang Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24747 --- db/db_impl.cc | 8 ++-- db/db_test.cc | 81 +++++++++++++++++++++++++++++++++++++-- util/mutable_cf_options.h | 3 ++ util/options_helper.cc | 6 +-- 4 files changed, 89 insertions(+), 9 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 592634600..b61ea303d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2323,12 +2323,14 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else { // no need to refcount in iteration since it's always under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->options()->disable_auto_compactions) { + // Pick up latest mutable CF Options and use it throughout the + // compaction job + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions) { // NOTE: try to avoid unnecessary copy of MutableCFOptions if // compaction is not necessary. 
Need to make sure mutex is held // until we make a copy in the following code - c.reset(cfd->PickCompaction( - *cfd->GetLatestMutableCFOptions(), log_buffer)); + c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); if (c != nullptr) { // update statistics MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, diff --git a/db/db_test.cc b/db/db_test.cc index 862eecda6..437cebc33 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8659,14 +8659,13 @@ TEST(DBTest, DynamicCompactionOptions) { options.env = env_; options.create_if_missing = true; options.compression = kNoCompression; - options.max_background_compactions = 4; options.hard_rate_limit = 1.1; options.write_buffer_size = k128KB; options.max_write_buffer_number = 2; // Compaction related options options.level0_file_num_compaction_trigger = 3; - options.level0_slowdown_writes_trigger = 10; - options.level0_stop_writes_trigger = 20; + options.level0_slowdown_writes_trigger = 4; + options.level0_stop_writes_trigger = 8; options.max_grandparent_overlap_factor = 10; options.expanded_compaction_factor = 25; options.source_compaction_factor = 1; @@ -8674,6 +8673,10 @@ TEST(DBTest, DynamicCompactionOptions) { options.target_file_size_multiplier = 1; options.max_bytes_for_level_base = k256KB; options.max_bytes_for_level_multiplier = 4; + + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); DestroyAndReopen(&options); auto gen_l0_kb = [this](int start, int size, int stride) { @@ -8745,6 +8748,78 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(SizeAtLevel(1) < 262144 * 1.1); ASSERT_TRUE(SizeAtLevel(2) < 2 * 262144 * 1.1); ASSERT_TRUE(SizeAtLevel(3) < 4 * 262144 * 1.1); + + // Clean up memtable and L0 + dbfull()->CompactRange(nullptr, nullptr); + // Block compaction + SleepingBackgroundTask sleeping_task_low1; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1, + Env::Priority::LOW); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + int count = 0; + Random rnd(301); + WriteOptions wo; + wo.timeout_hint_us = 10000; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + count++; + } + ASSERT_EQ(count, 8); + // Unblock + sleeping_task_low1.WakeUp(); + sleeping_task_low1.WaitUntilDone(); + + // Reduce stop trigger + ASSERT_TRUE(dbfull()->SetOptions({ + {"level0_stop_writes_trigger", "6"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Block compaction + SleepingBackgroundTask sleeping_task_low2; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2, + Env::Priority::LOW); + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + count++; + } + ASSERT_EQ(count, 6); + // Unblock + sleeping_task_low2.WakeUp(); + sleeping_task_low2.WaitUntilDone(); + + // Test disable_auto_compactions + ASSERT_TRUE(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 4); + + 
ASSERT_TRUE(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_LT(NumTableFilesAtLevel(0), 4); } } // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 4336baa12..eeb56eb82 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -22,6 +22,7 @@ struct MutableCFOptions { options.memtable_prefix_bloom_huge_page_tlb_size), max_successive_merges(options.max_successive_merges), filter_deletes(options.filter_deletes), + disable_auto_compactions(options.disable_auto_compactions), level0_file_num_compaction_trigger( options.level0_file_num_compaction_trigger), level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), @@ -47,6 +48,7 @@ struct MutableCFOptions { memtable_prefix_bloom_huge_page_tlb_size(0), max_successive_merges(0), filter_deletes(false), + disable_auto_compactions(false), level0_file_num_compaction_trigger(0), level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), @@ -83,6 +85,7 @@ struct MutableCFOptions { bool filter_deletes; // Compaction related options + bool disable_auto_compactions; int level0_file_num_compaction_trigger; int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; diff --git a/util/options_helper.cc b/util/options_helper.cc index a58c7c596..0a723cff6 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -103,7 +103,9 @@ bool ParseMemtableOptions(const std::string& name, const std::string& value, template bool ParseCompactionOptions(const std::string& name, const std::string& value, OptionsType* new_options) { - if (name == "level0_file_num_compaction_trigger") { + if (name == "disable_auto_compactions") { + new_options->disable_auto_compactions = ParseBoolean(name, value); + } else if (name == "level0_file_num_compaction_trigger") { new_options->level0_file_num_compaction_trigger = ParseInt(value); } else if (name == "level0_slowdown_writes_trigger") { new_options->level0_slowdown_writes_trigger = ParseInt(value); @@ -270,8 +272,6 @@ bool GetColumnFamilyOptionsFromMap( new_options->soft_rate_limit = ParseDouble(o.second); } else if (o.first == "hard_rate_limit") { new_options->hard_rate_limit = ParseDouble(o.second); - } else if (o.first == "disable_auto_compactions") { - new_options->disable_auto_compactions = ParseBoolean(o.first, o.second); } else if (o.first == "purge_redundant_kvs_while_flush") { new_options->purge_redundant_kvs_while_flush = ParseBoolean(o.first, o.second); From 4d5708aa560d5fefaf14d08e795abc19621f9207 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 16 Oct 2014 17:21:31 -0700 Subject: [PATCH 261/829] dynamic soft_rate_limit and hard_rate_limit Summary: as title Test Plan: unit test I am only able to build the test case for hard_rate_limit. 
soft_rate_limit is essentially the same thing as hard_rate_limit Reviewers: igor, sdong, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24759 --- db/column_family.cc | 13 ++-- db/db_test.cc | 131 +++++++++++++++++++++++++++----------- util/mutable_cf_options.h | 6 ++ util/options_helper.cc | 8 +-- 4 files changed, 111 insertions(+), 47 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 37699af21..a728a3fd5 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -354,8 +354,8 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 "us)", name_.c_str(), current_->NumLevelFiles(0), slowdown); - } else if (options_.hard_rate_limit > 1.0 && - score > options_.hard_rate_limit) { + } else if (mutable_cf_options.hard_rate_limit > 1.0 && + score > mutable_cf_options.hard_rate_limit) { uint64_t kHardLimitSlowdown = 1000; write_controller_token_ = write_controller->GetDelayToken(kHardLimitSlowdown); @@ -365,10 +365,11 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "[%s] Stalling writes because we hit hard limit on level %d. " "(%" PRIu64 "us)", name_.c_str(), max_level, kHardLimitSlowdown); - } else if (options_.soft_rate_limit > 0.0 && - score > options_.soft_rate_limit) { - uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit, - options_.hard_rate_limit); + } else if (mutable_cf_options.soft_rate_limit > 0.0 && + score > mutable_cf_options.soft_rate_limit) { + uint64_t slowdown = SlowdownAmount(score, + mutable_cf_options.soft_rate_limit, + mutable_cf_options.hard_rate_limit); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); Log(ioptions_.info_log, diff --git a/db/db_test.cc b/db/db_test.cc index 437cebc33..a279ff5fb 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8651,16 +8651,18 @@ TEST(DBTest, DynamicMemtableOptions) { } TEST(DBTest, DynamicCompactionOptions) { + // minimum write buffer size is enforced at 64KB + const uint64_t k32KB = 1 << 15; const uint64_t k64KB = 1 << 16; const uint64_t k128KB = 1 << 17; const uint64_t k256KB = 1 << 18; - const uint64_t k5KB = 5 * 1024; + const uint64_t k4KB = 1 << 12; Options options; options.env = env_; options.create_if_missing = true; options.compression = kNoCompression; options.hard_rate_limit = 1.1; - options.write_buffer_size = k128KB; + options.write_buffer_size = k64KB; options.max_write_buffer_number = 2; // Compaction related options options.level0_file_num_compaction_trigger = 3; @@ -8669,9 +8671,9 @@ TEST(DBTest, DynamicCompactionOptions) { options.max_grandparent_overlap_factor = 10; options.expanded_compaction_factor = 25; options.source_compaction_factor = 1; - options.target_file_size_base = k128KB; + options.target_file_size_base = k64KB; options.target_file_size_multiplier = 1; - options.max_bytes_for_level_base = k256KB; + options.max_bytes_for_level_base = k128KB; options.max_bytes_for_level_multiplier = 4; // Block flush thread and disable compaction thread @@ -8689,65 +8691,71 @@ TEST(DBTest, DynamicCompactionOptions) { // Write 3 files that have the same key range, trigger compaction and // result in one L1 file - gen_l0_kb(0, 128, 1); + gen_l0_kb(0, 64); ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 128, 1); + gen_l0_kb(0, 64); ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 128, 1); + gen_l0_kb(0, 64); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,1", 
FilesPerLevel()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); ASSERT_EQ(1U, metadata.size()); - ASSERT_LE(metadata[0].size, k128KB + k5KB); // < 128KB + 5KB - ASSERT_GE(metadata[0].size, k128KB - k5KB); // > 128B - 5KB + ASSERT_LE(metadata[0].size, k64KB + k4KB); + ASSERT_GE(metadata[0].size, k64KB - k4KB); - // Make compaction trigger and file size smaller + // Test compaction trigger and target_file_size_base ASSERT_TRUE(dbfull()->SetOptions({ {"level0_file_num_compaction_trigger", "2"}, - {"target_file_size_base", "65536"} + {"target_file_size_base", std::to_string(k32KB) } })); - gen_l0_kb(0, 128, 1); + gen_l0_kb(0, 64); ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 128, 1); + gen_l0_kb(0, 64); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); db_->GetLiveFilesMetaData(&metadata); ASSERT_EQ(2U, metadata.size()); - ASSERT_LE(metadata[0].size, k64KB + k5KB); // < 64KB + 5KB - ASSERT_GE(metadata[0].size, k64KB - k5KB); // > 64KB - 5KB + ASSERT_LE(metadata[0].size, k32KB + k4KB); + ASSERT_GE(metadata[0].size, k32KB - k4KB); - // Change base level size to 1MB - ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", "1048576"} })); + // Test max_bytes_for_level_base + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_bytes_for_level_base", std::to_string(k256KB) } + })); - // writing 56 x 128KB => 7MB - // (L1 + L2) = (1 + 4) * 1MB = 5MB - for (int i = 0; i < 56; ++i) { - gen_l0_kb(i, 128, 56); + // writing 24 x 64KB => 6 * 256KB + // (L1 + L2) = (1 + 4) * 256KB + for (int i = 0; i < 24; ++i) { + gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > 1048576 * 0.9 && - SizeAtLevel(1) < 1048576 * 1.1); - ASSERT_TRUE(SizeAtLevel(2) > 4 * 1048576 * 0.9 && - SizeAtLevel(2) < 4 * 1048576 * 1.1); + ASSERT_TRUE(SizeAtLevel(1) > k256KB * 0.8 && + SizeAtLevel(1) < k256KB * 1.2); + ASSERT_TRUE(SizeAtLevel(2) > 4 * k256KB * 0.8 && + SizeAtLevel(2) < 4 * k256KB * 1.2); - // Change multiplier to 2 with smaller base + // Test max_bytes_for_level_multiplier and + // max_bytes_for_level_base (reduce) ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_multiplier", "2"}, - {"max_bytes_for_level_base", "262144"} + {"max_bytes_for_level_base", std::to_string(k128KB) } })); - // writing 16 x 128KB - // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB - for (int i = 0; i < 16; ++i) { - gen_l0_kb(i, 128, 50); + // writing 20 x 64KB = 10 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB + for (int i = 0; i < 20; ++i) { + gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) < 262144 * 1.1); - ASSERT_TRUE(SizeAtLevel(2) < 2 * 262144 * 1.1); - ASSERT_TRUE(SizeAtLevel(3) < 4 * 262144 * 1.1); + ASSERT_TRUE(SizeAtLevel(1) > k128KB * 0.8 && + SizeAtLevel(1) < k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(2) > 2 * k128KB * 0.8 && + SizeAtLevel(2) < 2 * k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(3) > 4 * k128KB * 0.8 && + SizeAtLevel(3) < 4 * k128KB * 1.2); // Clean up memtable and L0 dbfull()->CompactRange(nullptr, nullptr); @@ -8761,16 +8769,16 @@ TEST(DBTest, DynamicCompactionOptions) { WriteOptions wo; wo.timeout_hint_us = 10000; while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { - // Wait for compaction so that put won't timeout dbfull()->TEST_FlushMemTable(true); count++; } + // Stop trigger = 8 ASSERT_EQ(count, 8); // Unblock sleeping_task_low1.WakeUp(); sleeping_task_low1.WaitUntilDone(); - // Reduce stop trigger + // Test: stop trigger (reduce) ASSERT_TRUE(dbfull()->SetOptions({ 
{"level0_stop_writes_trigger", "6"} })); @@ -8783,7 +8791,6 @@ TEST(DBTest, DynamicCompactionOptions) { Env::Priority::LOW); count = 0; while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { - // Wait for compaction so that put won't timeout dbfull()->TEST_FlushMemTable(true); count++; } @@ -8820,6 +8827,56 @@ TEST(DBTest, DynamicCompactionOptions) { } dbfull()->TEST_WaitForCompact(); ASSERT_LT(NumTableFilesAtLevel(0), 4); + + // Test for hard_rate_limit, change max_bytes_for_level_base to make level + // size big + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_bytes_for_level_base", std::to_string(k256KB) } + })); + // writing 40 x 64KB = 10 x 256KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB + for (int i = 0; i < 40; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(SizeAtLevel(1) > k256KB * 0.8 && + SizeAtLevel(1) < k256KB * 1.2); + ASSERT_TRUE(SizeAtLevel(2) > 2 * k256KB * 0.8 && + SizeAtLevel(2) < 2 * k256KB * 1.2); + ASSERT_TRUE(SizeAtLevel(3) > 4 * k256KB * 0.8 && + SizeAtLevel(3) < 4 * k256KB * 1.2); + // Reduce max_bytes_for_level_base and disable compaction at the same time + // This should cause score to increase + ASSERT_TRUE(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + {"max_bytes_for_level_base", "65536"}, + })); + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024))); + dbfull()->TEST_FlushMemTable(true); + + // Check score is above 2 + ASSERT_TRUE(SizeAtLevel(1) / k64KB > 2 || + SizeAtLevel(2) / k64KB > 4 || + SizeAtLevel(3) / k64KB > 8); + + // Enfoce hard rate limit, L0 score is not regulated by this limit + ASSERT_TRUE(dbfull()->SetOptions({ + {"hard_rate_limit", "2"} + })); + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024))); + dbfull()->TEST_FlushMemTable(true); + + // Hard rate limit slow down for 1000 us, so default 10ms should be ok + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); + wo.timeout_hint_us = 500; + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).IsTimedOut()); + + // Bump up limit + ASSERT_TRUE(dbfull()->SetOptions({ + {"hard_rate_limit", "100"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); } } // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index eeb56eb82..bf340205d 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -23,6 +23,8 @@ struct MutableCFOptions { max_successive_merges(options.max_successive_merges), filter_deletes(options.filter_deletes), disable_auto_compactions(options.disable_auto_compactions), + soft_rate_limit(options.soft_rate_limit), + hard_rate_limit(options.hard_rate_limit), level0_file_num_compaction_trigger( options.level0_file_num_compaction_trigger), level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), @@ -49,6 +51,8 @@ struct MutableCFOptions { max_successive_merges(0), filter_deletes(false), disable_auto_compactions(false), + soft_rate_limit(0), + hard_rate_limit(0), level0_file_num_compaction_trigger(0), level0_slowdown_writes_trigger(0), level0_stop_writes_trigger(0), @@ -86,6 +90,8 @@ struct MutableCFOptions { // Compaction related options bool disable_auto_compactions; + double soft_rate_limit; + double hard_rate_limit; int level0_file_num_compaction_trigger; int level0_slowdown_writes_trigger; int level0_stop_writes_trigger; diff --git a/util/options_helper.cc b/util/options_helper.cc index 0a723cff6..2a56a1ccf 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ 
-105,6 +105,10 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, OptionsType* new_options) { if (name == "disable_auto_compactions") { new_options->disable_auto_compactions = ParseBoolean(name, value); + } else if (name == "soft_rate_limit") { + new_options->soft_rate_limit = ParseDouble(value); + } else if (name == "hard_rate_limit") { + new_options->hard_rate_limit = ParseDouble(value); } else if (name == "level0_file_num_compaction_trigger") { new_options->level0_file_num_compaction_trigger = ParseInt(value); } else if (name == "level0_slowdown_writes_trigger") { @@ -268,10 +272,6 @@ bool GetColumnFamilyOptionsFromMap( new_options->num_levels = ParseInt(o.second); } else if (o.first == "max_mem_compaction_level") { new_options->max_mem_compaction_level = ParseInt(o.second); - } else if (o.first == "soft_rate_limit") { - new_options->soft_rate_limit = ParseDouble(o.second); - } else if (o.first == "hard_rate_limit") { - new_options->hard_rate_limit = ParseDouble(o.second); } else if (o.first == "purge_redundant_kvs_while_flush") { new_options->purge_redundant_kvs_while_flush = ParseBoolean(o.first, o.second); From d6c8dba727430c1eb799c563dca308dbcfa27f4b Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 16 Oct 2014 17:22:28 -0700 Subject: [PATCH 262/829] Log MutableCFOptions in SetOptions Summary: as title Test Plan: make release Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24903 --- db/db_impl.cc | 34 +++++++++++++++++++-- util/mutable_cf_options.cc | 61 ++++++++++++++++++++++++++++++++++++++ util/mutable_cf_options.h | 2 ++ 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index b61ea303d..d8df10c5a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1727,9 +1727,37 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { - auto cfh = reinterpret_cast(column_family); - MutexLock l(&mutex_); - return cfh->cfd()->SetOptions(options_map); + auto* cfd = reinterpret_cast(column_family)->cfd(); + if (options_map.empty()) { + Log(db_options_.info_log, "SetOptions() on column family [%s], empty input", + cfd->GetName().c_str()); + return false; + } + + MutableCFOptions new_options; + bool succeed = false; + { + MutexLock l(&mutex_); + if (cfd->SetOptions(options_map)) { + new_options = *cfd->GetLatestMutableCFOptions(); + succeed = true; + } + } + + Log(db_options_.info_log, "SetOptions() on column family [%s], inputs:", + cfd->GetName().c_str()); + for (const auto& o : options_map) { + Log(db_options_.info_log, "%s: %s\n", o.first.c_str(), o.second.c_str()); + } + if (succeed) { + Log(db_options_.info_log, "[%s] SetOptions succeeded", + cfd->GetName().c_str()); + new_options.Dump(db_options_.info_log.get()); + } else { + Log(db_options_.info_log, "[%s] SetOptions failed", + cfd->GetName().c_str()); + } + return succeed; } // return the same level if it cannot be moved diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 1c710c656..f6a2933d3 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -3,8 +3,15 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include +#include +#include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/immutable_options.h" #include "util/mutable_cf_options.h" @@ -69,4 +76,58 @@ uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { return MaxFileSizeForLevel(level) * expanded_compaction_factor; } +void MutableCFOptions::Dump(Logger* log) const { + // Memtable related options + Log(log, " write_buffer_size: %" PRIu64, + write_buffer_size); + Log(log, " max_write_buffer_number: %d", + max_write_buffer_number); + Log(log, " arena_block_size: %" PRIu64, + arena_block_size); + Log(log, " memtable_prefix_bloom_bits: %" PRIu32, + memtable_prefix_bloom_bits); + Log(log, " memtable_prefix_bloom_probes: %" PRIu32, + memtable_prefix_bloom_probes); + Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %" PRIu64, + memtable_prefix_bloom_huge_page_tlb_size); + Log(log, " max_successive_merges: %" PRIu64, + max_successive_merges); + Log(log, " filter_deletes: %d", + filter_deletes); + Log(log, " disable_auto_compactions: %d", + disable_auto_compactions); + Log(log, " soft_rate_limit: %lf", + soft_rate_limit); + Log(log, " hard_rate_limit: %lf", + hard_rate_limit); + Log(log, " level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log, " level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log, " level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log, " max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); + Log(log, " expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log, " source_compaction_factor: %d", + source_compaction_factor); + Log(log, " target_file_size_base: %d", + target_file_size_base); + Log(log, " target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log, " max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); + Log(log, " max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + std::string result; + char buf[10]; + for (const auto m : max_bytes_for_level_multiplier_additional) { + snprintf(buf, sizeof(buf), "%d, ", m); + result += buf; + } + result.resize(result.size() - 2); + Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str()); +} + } // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index bf340205d..eaecaa487 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -78,6 +78,8 @@ struct MutableCFOptions { uint64_t MaxGrandParentOverlapBytes(int level) const; uint64_t ExpandedCompactionByteSizeLimit(int level) const; + void Dump(Logger* log) const; + // Memtable related options size_t write_buffer_size; int max_write_buffer_number; From 274dc81c928e151ae1fa4c5c80c9ebf4e7401197 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 16 Oct 2014 17:33:09 -0700 Subject: [PATCH 263/829] fix build failure Summary: missed default value during merge Test Plan: ./db_test Reviewers: igor, sdong, yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24975 --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index a279ff5fb..cacd830ee 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8681,7 +8681,7 @@ TEST(DBTest, DynamicCompactionOptions) { env_->SetBackgroundThreads(1, Env::HIGH); DestroyAndReopen(&options); - auto gen_l0_kb = [this](int start, int size, 
int stride) { + auto gen_l0_kb = [this](int start, int size, int stride = 1) { Random rnd(301); for (int i = 0; i < size; i++) { ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); From d2e60f5ceeb2ddb33d9c3e63bd5dd030d9c025e6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 17 Oct 2014 12:05:32 -0400 Subject: [PATCH 264/829] Fix mac compile --- util/mutable_cf_options.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index f6a2933d3..1e10890b7 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -78,19 +78,17 @@ uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { void MutableCFOptions::Dump(Logger* log) const { // Memtable related options - Log(log, " write_buffer_size: %" PRIu64, - write_buffer_size); + Log(log, " write_buffer_size: zu%", write_buffer_size); Log(log, " max_write_buffer_number: %d", max_write_buffer_number); - Log(log, " arena_block_size: %" PRIu64, - arena_block_size); + Log(log, " arena_block_size: %zu", arena_block_size); Log(log, " memtable_prefix_bloom_bits: %" PRIu32, memtable_prefix_bloom_bits); Log(log, " memtable_prefix_bloom_probes: %" PRIu32, memtable_prefix_bloom_probes); - Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %" PRIu64, + Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %zu", memtable_prefix_bloom_huge_page_tlb_size); - Log(log, " max_successive_merges: %" PRIu64, + Log(log, " max_successive_merges: %zu", max_successive_merges); Log(log, " filter_deletes: %d", filter_deletes); From c12f571d31e8f721983a8fd95978b385a6be5e51 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 17 Oct 2014 12:09:45 -0400 Subject: [PATCH 265/829] Fix mac compile, second try --- util/mutable_cf_options.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 1e10890b7..1b3197b18 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -78,7 +78,7 @@ uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { void MutableCFOptions::Dump(Logger* log) const { // Memtable related options - Log(log, " write_buffer_size: zu%", write_buffer_size); + Log(log, " write_buffer_size: %zu", write_buffer_size); Log(log, " max_write_buffer_number: %d", max_write_buffer_number); Log(log, " arena_block_size: %zu", arena_block_size); From ee80fb4b4ae8f26ee44f7064487d609b5d2dfb14 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 17 Oct 2014 09:26:27 -0700 Subject: [PATCH 266/829] Total memtables size counter Summary: Added one new counter for GetProperty Test Plan: Not sure if needs a test case. 
compiles Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25023 --- db/internal_stats.cc | 11 +++++++++-- db/internal_stats.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 3f60d72ce..aa3b3c850 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -133,6 +133,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, return kBackgroundErrors; } else if (in == "cur-size-active-mem-table") { return kCurSizeActiveMemTable; + } else if (in == "cur-size-all-mem-tables") { + return kCurSizeAllMemTables; } else if (in == "num-entries-active-mem-table") { return kNumEntriesInMutableMemtable; } else if (in == "num-entries-imm-mem-tables") { @@ -250,12 +252,17 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, // Current size of the active memtable *value = cfd_->mem()->ApproximateMemoryUsage(); return true; + case kCurSizeAllMemTables: + // Current size of the active memtable + immutable memtables + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateMemoryUsage(); + return true; case kNumEntriesInMutableMemtable: - // Current size of the active memtable + // Current number of entires in the active memtable *value = cfd_->mem()->GetNumEntries(); return true; case kNumEntriesInImmutableMemtable: - // Current size of the active memtable + // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumEntries(); return true; case kEstimatedNumKeys: diff --git a/db/internal_stats.h b/db/internal_stats.h index 18d67de5c..4d12a2512 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -36,6 +36,8 @@ enum DBPropertyType : uint32_t { kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. kBackgroundErrors, // Return accumulated background errors encountered. kCurSizeActiveMemTable, // Return current size of the active memtable + kCurSizeAllMemTables, // Return current size of all (active + immutable) + // memtables kNumEntriesInMutableMemtable, // Return number of entries in the mutable // memtable. kNumEntriesInImmutableMemtable, // Return sum of number of entries in all From f4363fb81cae9c5d2b694a0d095a0fbaf8f52511 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 17 Oct 2014 10:09:45 -0700 Subject: [PATCH 267/829] Fix DynamicMemtableOptions test Summary: as title Test Plan: make release Reviewers: igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25029 --- db/db_test.cc | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index cacd830ee..344b59273 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8592,8 +8592,8 @@ TEST(DBTest, DynamicMemtableOptions) { // max_background_flushes == 0, so flushes are getting executed by the // compaction thread env_->SetBackgroundThreads(1, Env::LOW); - SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + SleepingBackgroundTask sleeping_task_low1; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1, Env::Priority::LOW); // Start from scratch and disable compaction/flush. 
Flush can only happen // during compaction but trigger is pretty high @@ -8612,8 +8612,8 @@ TEST(DBTest, DynamicMemtableOptions) { } ASSERT_TRUE(count > (128 * 0.9) && count < (128 * 1.1)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + sleeping_task_low1.WakeUp(); + sleeping_task_low1.WaitUntilDone(); // Increase ASSERT_TRUE(dbfull()->SetOptions({ @@ -8622,15 +8622,16 @@ TEST(DBTest, DynamicMemtableOptions) { // Clean up memtable and L0 dbfull()->CompactRange(nullptr, nullptr); - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + SleepingBackgroundTask sleeping_task_low2; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2, Env::Priority::LOW); count = 0; while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { count++; } ASSERT_TRUE(count > (512 * 0.9) && count < (512 * 1.1)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + sleeping_task_low2.WakeUp(); + sleeping_task_low2.WaitUntilDone(); // Decrease ASSERT_TRUE(dbfull()->SetOptions({ @@ -8638,16 +8639,17 @@ TEST(DBTest, DynamicMemtableOptions) { })); // Clean up memtable and L0 dbfull()->CompactRange(nullptr, nullptr); - env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_low3; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low3, + Env::Priority::LOW); count = 0; while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { count++; } ASSERT_TRUE(count > (256 * 0.9) && count < (256 * 1.1)); - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); + sleeping_task_low3.WakeUp(); + sleeping_task_low3.WaitUntilDone(); } TEST(DBTest, DynamicCompactionOptions) { From 5db9e76644eed9b7c9b25be398cefb1248f7cc3a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 17 Oct 2014 14:46:40 -0700 Subject: [PATCH 268/829] Fix Mac compile error: C++11 forbids default arguments for lambda expressions Summary: Fix the following Mac compile error. 
db/db_test.cc:8686:52: error: C++11 forbids default arguments for lambda expressions [-Werror,-Wlambda-extensions] auto gen_l0_kb = [this](int start, int size, int stride = 1) { ^ ~ Test Plan: db_test --- db/db_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 344b59273..169718387 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8683,7 +8683,7 @@ TEST(DBTest, DynamicCompactionOptions) { env_->SetBackgroundThreads(1, Env::HIGH); DestroyAndReopen(&options); - auto gen_l0_kb = [this](int start, int size, int stride = 1) { + auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); for (int i = 0; i < size; i++) { ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); @@ -8693,11 +8693,11 @@ TEST(DBTest, DynamicCompactionOptions) { // Write 3 files that have the same key range, trigger compaction and // result in one L1 file - gen_l0_kb(0, 64); + gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 1); - gen_l0_kb(0, 64); + gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 2); - gen_l0_kb(0, 64); + gen_l0_kb(0, 64, 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,1", FilesPerLevel()); std::vector metadata; @@ -8712,9 +8712,9 @@ TEST(DBTest, DynamicCompactionOptions) { {"target_file_size_base", std::to_string(k32KB) } })); - gen_l0_kb(0, 64); + gen_l0_kb(0, 64, 1); ASSERT_EQ("1,1", FilesPerLevel()); - gen_l0_kb(0, 64); + gen_l0_kb(0, 64, 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("0,2", FilesPerLevel()); metadata.clear(); From 6c66918645d1a2fc693b79d7a580659407cedac9 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 17 Oct 2014 14:58:30 -0700 Subject: [PATCH 269/829] Speed up DB::Open() and Version creation by limiting the number of FileMetaData initialization. Summary: This diff speeds up DB::Open() and Version creation by limiting the number of FileMetaData initialization. The behavior of Version::UpdateAccumulatedStats() is changed as follows: * It only initializes the first 20 uninitialized FileMetaData from file. This guarantees the size of the latest 20 files will always be compensated when they have any deletion entries. Previously it may initialize all FileMetaData by loading all files at DB::Open(). * In case none the first 20 files has any data entry, UpdateAccumulatedStats() will initialize the FileMetaData of the oldest file. Test Plan: db_test Reviewers: igor, sdong, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24255 --- db/version_edit.h | 3 +- db/version_set.cc | 100 +++++++++++++++++++++++++++++++++------------- db/version_set.h | 42 +++++++++++-------- 3 files changed, 100 insertions(+), 45 deletions(-) diff --git a/db/version_edit.h b/db/version_edit.h index db133402c..ef883297a 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include #include #include #include @@ -74,7 +75,7 @@ struct FileMetaData { // Stats for compensating deletion entries during compaction // File size compensated by deletion entry. - // This is updated in Version::UpdateTemporaryStats() first time when the + // This is updated in Version::UpdateAccumulatedStats() first time when the // file is created or loaded. After it is updated, it is immutable. uint64_t compensated_file_size; uint64_t num_entries; // the number of entries. 
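The capped-sampling policy described in the summary of this patch can be sketched in isolation as follows. This is a minimal, self-contained illustration only: the struct, function, and constant names below are invented stand-ins (not the actual RocksDB types), and the real logic is Version::UpdateAccumulatedStats() in the version_set.cc diff that follows. The sketch only shows the cap on per-file property loads; the real code also accumulates the loaded sizes and falls back to sampling one file from the highest level when every sampled file held only deletions.

// Sketch: cap the number of per-file table-property loads per Version creation.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct FileStats {
  uint64_t file_size = 0;
  uint64_t num_entries = 0;
  uint64_t num_deletions = 0;
  bool initialized = false;  // true once table properties were read from disk
};

// Stand-in for the expensive step of reading table properties from an SST
// file; returns true only the first time a file is initialized.
bool MaybeInitialize(FileStats* f) {
  if (f->initialized) {
    return false;
  }
  f->initialized = true;
  return true;
}

int main() {
  const int kMaxInitCount = 20;  // cap on property loads per Version creation
  // levels[0] is L0, levels[1] is L1, ...; each entry holds one file's stats.
  std::vector<std::vector<FileStats>> levels(4, std::vector<FileStats>(15));

  int init_count = 0;
  // Sample lower levels first so compensated sizes propagate upward through
  // later compactions, as explained in the patch summary.
  for (std::size_t level = 0;
       level < levels.size() && init_count < kMaxInitCount; ++level) {
    for (auto& f : levels[level]) {
      if (MaybeInitialize(&f) && ++init_count >= kMaxInitCount) {
        break;
      }
    }
  }
  std::cout << "initialized " << init_count << " files this round\n";
  return 0;
}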
diff --git a/db/version_set.cc b/db/version_set.cc index 78241d1f0..0819196fb 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -597,7 +597,19 @@ uint64_t Version::GetEstimatedActiveKeys() { // (1) there is merge keys // (2) keys are directly overwritten // (3) deletion on non-existing keys - return num_non_deletions_ - num_deletions_; + // (4) low number of samples + if (num_samples_ == 0) { + return 0; + } + + if (num_samples_ < files_->size()) { + // casting to avoid overflowing + return static_cast(static_cast( + accumulated_num_non_deletions_ - accumulated_num_deletions_) * + files_->size() / num_samples_); + } else { + return accumulated_num_non_deletions_ - accumulated_num_deletions_; + } } void Version::AddIterators(const ReadOptions& read_options, @@ -658,17 +670,21 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, compaction_score_(num_levels_), compaction_level_(num_levels_), version_number_(version_number), - total_file_size_(0), - total_raw_key_size_(0), - total_raw_value_size_(0), - num_non_deletions_(0), - num_deletions_(0) { + accumulated_file_size_(0), + accumulated_raw_key_size_(0), + accumulated_raw_value_size_(0), + accumulated_num_non_deletions_(0), + accumulated_num_deletions_(0), + num_samples_(0) { if (cfd != nullptr && cfd->current() != nullptr) { - total_file_size_ = cfd->current()->total_file_size_; - total_raw_key_size_ = cfd->current()->total_raw_key_size_; - total_raw_value_size_ = cfd->current()->total_raw_value_size_; - num_non_deletions_ = cfd->current()->num_non_deletions_; - num_deletions_ = cfd->current()->num_deletions_; + accumulated_file_size_ = cfd->current()->accumulated_file_size_; + accumulated_raw_key_size_ = cfd->current()->accumulated_raw_key_size_; + accumulated_raw_value_size_ = + cfd->current()->accumulated_raw_value_size_; + accumulated_num_non_deletions_ = + cfd->current()->accumulated_num_non_deletions_; + accumulated_num_deletions_ = cfd->current()->accumulated_num_deletions_; + num_samples_ = cfd->current()->num_samples_; } } @@ -748,7 +764,7 @@ void Version::GenerateFileLevels() { void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted) { - UpdateTemporaryStats(); + UpdateAccumulatedStats(); ComputeCompactionScore(mutable_cf_options, size_being_compacted); UpdateFilesBySize(); UpdateNumNonEmptyLevels(); @@ -757,7 +773,8 @@ void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { - if (file_meta->init_stats_from_file) { + if (file_meta->init_stats_from_file || + file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; @@ -778,26 +795,55 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { return true; } -void Version::UpdateTemporaryStats() { +void Version::UpdateAccumulatedStats(FileMetaData* file_meta) { + assert(file_meta->init_stats_from_file); + accumulated_file_size_ += file_meta->fd.GetFileSize(); + accumulated_raw_key_size_ += file_meta->raw_key_size; + accumulated_raw_value_size_ += file_meta->raw_value_size; + accumulated_num_non_deletions_ += + file_meta->num_entries - file_meta->num_deletions; + accumulated_num_deletions_ += file_meta->num_deletions; + num_samples_++; +} + +void Version::UpdateAccumulatedStats() { static const int kDeletionWeightOnCompaction = 2; - // incrementally update the average value size by - // including newly added files into the global stats + // maximum number of table properties loaded from files. 
+ const int kMaxInitCount = 20; int init_count = 0; - int total_count = 0; - for (int level = 0; level < num_levels_; level++) { + // here only the first kMaxInitCount files which haven't been + // initialized from file will be updated with num_deletions. + // The motivation here is to cap the maximum I/O per Version creation. + // The reason for choosing files from lower-level instead of higher-level + // is that such design is able to propagate the initialization from + // lower-level to higher-level: When the num_deletions of lower-level + // files are updated, it will make the lower-level files have accurate + // compensated_file_size, making lower-level to higher-level compaction + // will be triggered, which creates higher-level files whose num_deletions + // will be updated here. + for (int level = 0; + level < num_levels_ && init_count < kMaxInitCount; ++level) { for (auto* file_meta : files_[level]) { if (MaybeInitializeFileMetaData(file_meta)) { // each FileMeta will be initialized only once. - total_file_size_ += file_meta->fd.GetFileSize(); - total_raw_key_size_ += file_meta->raw_key_size; - total_raw_value_size_ += file_meta->raw_value_size; - num_non_deletions_ += - file_meta->num_entries - file_meta->num_deletions; - num_deletions_ += file_meta->num_deletions; - init_count++; - } - total_count++; + UpdateAccumulatedStats(file_meta); + if (++init_count >= kMaxInitCount) { + break; + } + } + } + } + // In case all sampled-files contain only deletion entries, then we + // load the table-property of a file in higher-level to initialize + // that value. + for (int level = num_levels_ - 1; + accumulated_raw_value_size_ == 0 && level >= 0; --level) { + for (int i = static_cast(files_[level].size()) - 1; + accumulated_raw_value_size_ == 0 && i >= 0; --i) { + if (MaybeInitializeFileMetaData(files_[level][i])) { + UpdateAccumulatedStats(files_[level][i]); + } } } diff --git a/db/version_set.h b/db/version_set.h index 05e6e9a65..93e9e0c9d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -212,13 +212,15 @@ class Version { uint64_t GetVersionNumber() const { return version_number_; } uint64_t GetAverageValueSize() const { - if (num_non_deletions_ == 0) { + if (accumulated_num_non_deletions_ == 0) { return 0; } - assert(total_raw_key_size_ + total_raw_value_size_ > 0); - assert(total_file_size_ > 0); - return total_raw_value_size_ / num_non_deletions_ * total_file_size_ / - (total_raw_key_size_ + total_raw_value_size_); + assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); + assert(accumulated_file_size_ > 0); + return accumulated_raw_value_size_ / + accumulated_num_non_deletions_ * + accumulated_file_size_ / + (accumulated_raw_key_size_ + accumulated_raw_value_size_); } // REQUIRES: lock is held @@ -268,14 +270,17 @@ class Version { // Update num_non_empty_levels_. void UpdateNumNonEmptyLevels(); - // The helper function of UpdateTemporaryStats, which may fill the missing + // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_mata from its associated TableProperties. // Returns true if it does initialize FileMetaData. bool MaybeInitializeFileMetaData(FileMetaData* file_meta); - // Update the temporary stats associated with the current version. - // This temporary stats will be used in compaction. - void UpdateTemporaryStats(); + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); + + // Update the accumulated stats associated with the current version. 
+ // This accumulated stats will be used in compaction. + void UpdateAccumulatedStats(); // Sort all files for this version based on their file size and // record results in files_by_size_. The largest files are listed first. @@ -337,16 +342,19 @@ class Version { Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); - // total file size - uint64_t total_file_size_; - // the total size of all raw keys. - uint64_t total_raw_key_size_; - // the total size of all raw values. - uint64_t total_raw_value_size_; + // the following are the sampled temporary stats. + // the current accumulated size of sampled files. + uint64_t accumulated_file_size_; + // the current accumulated size of all raw keys based on the sampled files. + uint64_t accumulated_raw_key_size_; + // the current accumulated size of all raw keys based on the sampled files. + uint64_t accumulated_raw_value_size_; // total number of non-deletion entries - uint64_t num_non_deletions_; + uint64_t accumulated_num_non_deletions_; // total number of deletion entries - uint64_t num_deletions_; + uint64_t accumulated_num_deletions_; + // the number of samples + uint64_t num_samples_; ~Version(); From 2dd9bfe3a84f7cfe4b9e684f7dd8e8044d5f5de4 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 17 Oct 2014 21:18:36 -0700 Subject: [PATCH 270/829] Sanitize block-based table index type and check prefix_extractor Summary: Respond to issue reported https://www.facebook.com/groups/rocksdb.dev/permalink/651090261656158/ Change the Sanitize signature to take both DBOptions and CFOptions Test Plan: unit test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25041 --- db/db_impl.cc | 8 ++++---- db/db_test.cc | 11 +++++++++++ include/rocksdb/table.h | 6 ++++-- table/adaptive_table_factory.h | 5 +++-- table/block_based_table_factory.cc | 11 +++++++++++ table/block_based_table_factory.h | 5 ++--- table/cuckoo_table_factory.h | 3 ++- table/plain_table_factory.h | 5 +++-- 8 files changed, 40 insertions(+), 14 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index d8df10c5a..c5bc7680a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -282,12 +282,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { namespace { -Status SanitizeDBOptionsByCFOptions( - const DBOptions* db_opts, +Status SanitizeOptionsByTable( + const DBOptions& db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { - s = cf.options.table_factory->SanitizeDBOptions(db_opts); + s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); if (!s.ok()) { return s; } @@ -4703,7 +4703,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { - Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); + Status s = SanitizeOptionsByTable(db_options, column_families); if (!s.ok()) { return s; } diff --git a/db/db_test.cc b/db/db_test.cc index 169718387..ebe946cf4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8354,6 +8354,17 @@ TEST(DBTest, TableOptionsSanitizeTest) { options.prefix_extractor.reset(NewNoopTransform()); Destroy(&options); ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + + // Test for check of prefix_extractor when hash index is used for + // block-based table + BlockBasedTableOptions to; + to.index_type = 
BlockBasedTableOptions::kHashSearch; + options = Options(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(to)); + ASSERT_TRUE(TryReopen(&options).IsInvalidArgument()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + ASSERT_OK(TryReopen(&options)); } TEST(DBTest, DBIteratorBoundTest) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 4c06c23f7..4fddab4b3 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -355,11 +355,13 @@ class TableFactory { WritableFile* file, const CompressionType compression_type, const CompressionOptions& compression_opts) const = 0; - // Sanitizes the specified DB Options. + // Sanitizes the specified DB Options and ColumnFamilyOptions. // // If the function cannot find a way to sanitize the input DB Options, // a non-ok Status will be returned. - virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0; + virtual Status SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const = 0; // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index f0920db97..3c6455f90 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -47,8 +47,9 @@ class AdaptiveTableFactory : public TableFactory { const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(const DBOptions* db_opts) const override { - if (db_opts->allow_mmap_reads == false) { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (db_opts.allow_mmap_reads == false) { return Status::NotSupported( "AdaptiveTable with allow_mmap_reads == false is not supported."); } diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index b4e2e7d1f..3155f3394 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -63,6 +63,17 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( return table_builder; } +Status BlockBasedTableFactory::SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument("Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + return Status::OK(); +} + std::string BlockBasedTableFactory::GetPrintableTableOptions() const { std::string ret; ret.reserve(20000); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 2dcfda6d4..247fcd691 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -46,9 +46,8 @@ class BlockBasedTableFactory : public TableFactory { const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. 
- Status SanitizeDBOptions(const DBOptions* db_opts) const override { - return Status::OK(); - } + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; std::string GetPrintableTableOptions() const override; diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 599908678..714fdc2a0 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -64,7 +64,8 @@ class CuckooTableFactory : public TableFactory { const CompressionType, const CompressionOptions&) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(const DBOptions* db_opts) const override { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { return Status::OK(); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index e79475221..23b54f092 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -170,8 +170,9 @@ class PlainTableFactory : public TableFactory { static const char kValueTypeSeqId0 = 0xFF; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(const DBOptions* db_opts) const override { - if (db_opts->allow_mmap_reads == false) { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (db_opts.allow_mmap_reads == false) { return Status::NotSupported( "PlainTable with allow_mmap_reads == false is not supported."); } From ff8f74c204eba8cafc2779eac55475beff559941 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 17 Oct 2014 21:15:39 -0700 Subject: [PATCH 271/829] remove checking lower bound of level size Summary: as title Test Plan: make db_test --- db/db_test.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index ebe946cf4..d13929fc6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8763,12 +8763,9 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > k128KB * 0.8 && - SizeAtLevel(1) < k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(2) > 2 * k128KB * 0.8 && - SizeAtLevel(2) < 2 * k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(3) > 4 * k128KB * 0.8 && - SizeAtLevel(3) < 4 * k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(1) < k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(2) < 2 * k128KB * 1.2); + ASSERT_TRUE(SizeAtLevel(3) < 4 * k128KB * 1.2); // Clean up memtable and L0 dbfull()->CompactRange(nullptr, nullptr); From 5bfb7f5d0b2a09ccc3b6c8eb94fbf7cf1672a83a Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 17 Oct 2014 10:20:25 -0700 Subject: [PATCH 272/829] db_bench: seekrandom can specify --seek_nexts to read specific keys after seek. Summary: Add a function as tittle. Also use the same parameter to fillseekseq too. Test Plan: Run seekrandom using the new parameter Reviewers: ljin, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: rven, igor, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D25035 --- db/db_bench.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 6d611ae1c..d018ce70f 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -168,8 +168,9 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." 
DEFINE_int32(value_size, 100, "Size of each value"); -DEFINE_int32(seekseq_next, 0, "How many times to call Next() after Seek() in " - "fillseekseq"); +DEFINE_int32(seek_nexts, 0, + "How many times to call Next() after Seek() in " + "fillseekseq and seekrandom"); DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); @@ -2265,6 +2266,7 @@ class Benchmark { std::unique_ptr key_guard(key.data()); Duration duration(FLAGS_duration, reads_); + char value_buffer[256]; while (!duration.Done(1)) { if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { uint64_t now = FLAGS_env->NowMicros(); @@ -2296,6 +2298,16 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } + + for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) { + // Copy out iterator's value to make sure we read them. + Slice value = iter_to_use->value(); + memcpy(value_buffer, value.data(), + std::min(value.size(), sizeof(value_buffer))); + iter_to_use->Next(); + assert(iter_to_use->status().ok()); + } + thread->stats.FinishedOps(&db_, db_.db, 1); } delete single_iter; @@ -2820,7 +2832,7 @@ class Benchmark { assert(iter->Valid() && iter->key() == key); thread->stats.FinishedOps(nullptr, db, 1); - for (int j = 0; j < FLAGS_seekseq_next && i+1 < FLAGS_num; ++j) { + for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) { iter->Next(); GenerateKeyFromInt(++i, FLAGS_num, &key); assert(iter->Valid() && iter->key() == key); From 700f6ec3ffd4e9c877a848fa7b05268052f9e7b3 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sun, 3 Aug 2014 21:11:15 +0100 Subject: [PATCH 273/829] Ignore IntelliJ idea project files and ignore java/out folder --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 4ed05c5e3..ccbb46b03 100644 --- a/.gitignore +++ b/.gitignore @@ -31,8 +31,14 @@ coverage/COVERAGE_REPORT package/ .phutil_module_cache tags + +java/out java/*.log java/include/org_rocksdb_*.h + +.idea/ +*.iml + unity.cc java/crossbuild/.vagrant .vagrant/ From d6fe8dacc8c7121ab33e910c1589c1a1a449fc68 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sun, 3 Aug 2014 21:11:44 +0100 Subject: [PATCH 274/829] Feature - Implement Java API for Comparator and Slice. Allows use of either byte[] or DirectByteBuffer for accessing underlying data. 
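As a rough sketch of the intended usage (not part of the patch): a user-defined comparator built on the classes this change introduces. The class below and its name string are made up for illustration; it assumes the org.rocksdb.Comparator and org.rocksdb.Slice types added here and the no-argument Comparator constructor from this patch (a later patch in this series adds a ComparatorOptions parameter).

    import org.rocksdb.Comparator;
    import org.rocksdb.Slice;

    // Hypothetical user comparator: reverse byte-wise ordering of keys.
    public class ReverseBytewiseComparator extends Comparator {
      @Override
      public String name() {
        return "example.ReverseBytewiseComparator";
      }

      @Override
      public int compare(final Slice a, final Slice b) {
        final byte[] x = a.data();
        final byte[] y = b.data();
        final int len = Math.min(x.length, y.length);
        for (int i = 0; i < len; i++) {
          // Compare bytes as unsigned values with the operands swapped,
          // so that larger keys sort first.
          final int diff = (y[i] & 0xFF) - (x[i] & 0xFF);
          if (diff != 0) {
            return diff;
          }
        }
        return y.length - x.length;
      }
    }

Such a comparator would then be handed to an Options instance through the setComparator() method this patch adds, before opening the database.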
--- java/Makefile | 2 +- java/org/rocksdb/AbstractComparator.java | 81 ++++++++ java/org/rocksdb/AbstractSlice.java | 156 +++++++++++++++ java/org/rocksdb/Comparator.java | 25 +++ java/org/rocksdb/DirectComparator.java | 25 +++ java/org/rocksdb/DirectSlice.java | 99 ++++++++++ java/org/rocksdb/Options.java | 22 +++ java/org/rocksdb/Slice.java | 61 ++++++ java/rocksjni/comparator.cc | 64 +++++++ java/rocksjni/comparatorjnicallback.cc | 148 +++++++++++++++ java/rocksjni/comparatorjnicallback.h | 52 +++++ java/rocksjni/options.cc | 11 ++ java/rocksjni/portal.h | 147 +++++++++++++++ java/rocksjni/slice.cc | 231 +++++++++++++++++++++++ 14 files changed, 1123 insertions(+), 1 deletion(-) create mode 100644 java/org/rocksdb/AbstractComparator.java create mode 100644 java/org/rocksdb/AbstractSlice.java create mode 100644 java/org/rocksdb/Comparator.java create mode 100644 java/org/rocksdb/DirectComparator.java create mode 100644 java/org/rocksdb/DirectSlice.java create mode 100644 java/org/rocksdb/Slice.java create mode 100644 java/rocksjni/comparator.cc create mode 100644 java/rocksjni/comparatorjnicallback.cc create mode 100644 java/rocksjni/comparatorjnicallback.h create mode 100644 java/rocksjni/slice.cc diff --git a/java/Makefile b/java/Makefile index ef8ccbae4..5c20032d2 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java new file mode 100644 index 000000000..fa797b273 --- /dev/null +++ b/java/org/rocksdb/AbstractComparator.java @@ -0,0 +1,81 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Comparators are used by RocksDB to determine + * the ordering of keys. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Comparator + * @see org.rocksdb.DirectComparator + */ +abstract class AbstractComparator extends RocksObject { + + public abstract String name(); + + /** + * Three-way key comparison + * + * @param a Slice access to first key + * @param b Slice access to second key + * + * @return Should return either: + * 1) < 0 if "a" < "b" + * 2) == 0 if "a" == "b" + * 3) > 0 if "a" > "b" + */ + public abstract int compare(final T a, final T b); + + /** + * Used to reduce the space requirements + * for internal data structures like index blocks. + * + * If start < limit, you may return a new start which is a + * shorter string in [start, limit). + * + * Simple comparator implementations may return null if they + * wish to use start unchanged. i.e., an implementation of + * this method that does nothing is correct. + * + * @return a shorter start, or null + */ + public String findShortestSeparator(final String start, final T limit) { + return null; + } + + /** + * Used to reduce the space requirements + * for internal data structures like index blocks. + * + * You may return a new short key (key1) where + * key1 >= key. + * + * Simple comparator implementations may return null if they + * wish to leave the key unchanged. i.e., an implementation of + * this method that does nothing is correct. + * + * @return a shorter key, or null + */ + public String findShortSuccessor(final String key) { + return null; + } + + /** + * Deletes underlying C++ comparator pointer. + * + * Note that this function should be called only after all + * RocksDB instances referencing the comparator are closed. + * Otherwise an undefined behavior will occur. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/AbstractSlice.java b/java/org/rocksdb/AbstractSlice.java new file mode 100644 index 000000000..963c72a1b --- /dev/null +++ b/java/org/rocksdb/AbstractSlice.java @@ -0,0 +1,156 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Slices are used by RocksDB to provide + * efficient access to keys and values. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Slice + * @see org.rocksdb.DirectSlice + */ +abstract class AbstractSlice extends RocksObject { + + /** + * Returns the data. + * + * @return The data. Note, the type of access is + * determined by the subclass + * @see org.rocksdb.AbstractSlice#data0(long). + */ + public T data() { + assert (isInitialized()); + return data0(nativeHandle_); + } + + /** + * Access to the data is provided by the + * subtype as it needs to handle the + * generic typing. + * + * @param handle The address of the underlying + * native object. + * + * @return Java typed access to the data. 
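To make the two access styles concrete, a small sketch (not part of the patch; names and values are illustrative) of reading key data back through a byte[]-backed Slice and a ByteBuffer-backed DirectSlice:

    import java.nio.ByteBuffer;

    import org.rocksdb.DirectSlice;
    import org.rocksdb.RocksDB;
    import org.rocksdb.Slice;

    public class SliceAccessExample {
      public static void main(final String[] args) {
        // Make sure the native library is loaded before creating native handles.
        RocksDB.loadLibrary();

        final Slice slice = new Slice("abc".getBytes());
        final byte[] copy = slice.data();            // byte[]-backed access

        final ByteBuffer buf = ByteBuffer.allocateDirect(3);
        buf.put("abc".getBytes());
        final DirectSlice directSlice = new DirectSlice(buf, 3);
        final ByteBuffer view = directSlice.data();  // ByteBuffer-backed access

        System.out.println(new String(copy) + " / " + view.capacity() + " bytes");

        // Release the native handles when done.
        slice.dispose();
        directSlice.dispose();
      }
    }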
+ */ + protected abstract T data0(long handle); + + /** + * Return the length (in bytes) of the data. + * + * @return The length in bytes. + */ + public int size() { + assert (isInitialized()); + return size0(nativeHandle_); + } + + /** + * Return true if the length of the + * data is zero. + * + * @return true if there is no data, false otherwise. + */ + public boolean empty() { + assert (isInitialized()); + return empty0(nativeHandle_); + } + + /** + * Creates a string representation of the data + * + * @param hex When true, the representation + * will be encoded in hexidecimal. + * + * @return The string representation of the data. + */ + public String toString(final boolean hex) { + assert (isInitialized()); + return toString0(nativeHandle_, hex); + } + + @Override + public String toString() { + return toString(false); + } + + /** + * Three-way key comparison + * + * @param other A slice to compare against + * + * @return Should return either: + * 1) < 0 if this < other + * 2) == 0 if this == other + * 3) > 0 if this > other + */ + public int compare(final AbstractSlice other) { + assert (other != null); + assert (isInitialized()); + return compare0(nativeHandle_, other.nativeHandle_); + } + + /** + * If other is a slice, then + * we defer to compare to check equality, + * otherwise we return false. + * + * @param other Object to test for equality + * + * @return true when this.compare(other) == 0, + * false otherwise. + */ + @Override + public boolean equals(final Object other) { + if (other != null && other instanceof AbstractSlice) { + return compare((AbstractSlice)other) == 0; + } else { + return false; + } + } + + /** + * Determines whether this starts with prefix + * + * @param prefix Another slice which may of may not + * be the prefix of this slice. + * + * @return true when slice `prefix` is a prefix + * of this slice + */ + public boolean startsWith(final AbstractSlice prefix) { + if (prefix != null) { + assert (isInitialized()); + return startsWith0(nativeHandle_, prefix.nativeHandle_); + } else { + return false; + } + } + + /** + * Deletes underlying C++ slice pointer. + *


        + * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + protected native void createNewSliceFromString(String str); + private native int size0(long handle); + private native boolean empty0(long handle); + private native String toString0(long handle, boolean hex); + private native int compare0(long handle, long otherHandle); + private native boolean startsWith0(long handle, long otherHandle); + private native void disposeInternal(long handle); + +} diff --git a/java/org/rocksdb/Comparator.java b/java/org/rocksdb/Comparator.java new file mode 100644 index 000000000..8466cfd8e --- /dev/null +++ b/java/org/rocksdb/Comparator.java @@ -0,0 +1,25 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * byte[] based access via org.rocksdb.Slice in their + * compare method implementation. + * + * byte[] based slices perform better when small keys + * are involved. When using larger keys consider + * using @see org.rocksdb.DirectComparator + */ +public abstract class Comparator extends AbstractComparator { + + public Comparator() { + super(); + createNewComparator0(); + } + + private native void createNewComparator0(); +} diff --git a/java/org/rocksdb/DirectComparator.java b/java/org/rocksdb/DirectComparator.java new file mode 100644 index 000000000..25b4058ae --- /dev/null +++ b/java/org/rocksdb/DirectComparator.java @@ -0,0 +1,25 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * ByteBuffer based access via org.rocksdb.DirectSlice + * in their compare method implementation. + * + * ByteBuffer based slices perform better when large keys + * are involved. When using smaller keys consider + * using @see org.rocksdb.Comparator + */ +public abstract class DirectComparator extends AbstractComparator { + + public DirectComparator() { + super(); + createNewDirectComparator0(); + } + + private native void createNewDirectComparator0(); +} diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java new file mode 100644 index 000000000..8169e3529 --- /dev/null +++ b/java/org/rocksdb/DirectSlice.java @@ -0,0 +1,99 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.nio.ByteBuffer; + +/** + * Base class for slices which will receive direct + * ByteBuffer based access to the underlying data. + * + * ByteBuffer backed slices typically perform better with + * larger keys and values. 
When using smaller keys and + * values consider using @see org.rocksdb.Slice + */ +public class DirectSlice extends AbstractSlice { + + /** + * Called from JNI to construct a new Java DirectSlice + * without an underlying C++ object set + * at creation time. + */ + private DirectSlice() { + super(); + disOwnNativeHandle(); + } + + /** + * Constructs a slice + * where the data is taken from + * a String. + */ + public DirectSlice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer up to a certain length + */ + public DirectSlice(final ByteBuffer data, final int length) { + super(); + createNewDirectSlice0(data, length); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer + */ + public DirectSlice(final ByteBuffer data) { + super(); + createNewDirectSlice1(data); + } + + /** + * Retrieves the byte at a specific offset + * from the underlying data + * + * @param offset The (zero-based) offset of the byte to retrieve + * + * @return the requested byte + */ + public byte get(int offset) { + assert (isInitialized()); + return get0(nativeHandle_, offset); + } + + /** + * Clears the backing slice + */ + public void clear() { + assert (isInitialized()); + clear0(nativeHandle_); + } + + /** + * Drops the specified n + * number of bytes from the start + * of the backing slice + * + * @param n The number of bytes to drop + */ + public void removePrefix(final int n) { + assert (isInitialized()); + removePrefix0(nativeHandle_, n); + } + + private native void createNewDirectSlice0(ByteBuffer data, int length); + private native void createNewDirectSlice1(ByteBuffer data); + @Override protected final native ByteBuffer data0(long handle); + private native byte get0(long handle, int offset); + private native void clear0(long handle); + private native void removePrefix0(long handle, int length); +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 741404e40..b99d0c7ea 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -193,6 +193,27 @@ public class Options extends RocksObject { return maxWriteBufferNumber(nativeHandle_); } + /** + * Use the specified comparator for key ordering. + * + * Comparator should not be disposed before options instances using this comparator is + * disposed. If dispose() function is not called, then comparator object will be + * GC'd automatically. + * + * Comparator instance can be re-used in multiple options instances. + * + * @param comparator java instance. + * @return the instance of the current Options. + * @see RocksDB.open() + */ + public Options setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } + private native void setComparatorHandle(long optHandle, long comparatorHandle); + /** * If true, an error will be thrown during RocksDB.open() if the * database already exists. @@ -2282,6 +2303,7 @@ public class Options extends RocksObject { long cacheSize_; int numShardBits_; + AbstractComparator comparator_; RocksEnv env_; MemTableConfig memTableConfig_; TableFormatConfig tableFormatConfig_; diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java new file mode 100644 index 000000000..28c29c43d --- /dev/null +++ b/java/org/rocksdb/Slice.java @@ -0,0 +1,61 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for slices which will receive + * byte[] based access to the underlying data. + * + * byte[] backed slices typically perform better with + * small keys and values. When using larger keys and + * values consider using @see org.rocksdb.DirectSlice + */ +public class Slice extends AbstractSlice { + + /** + * Called from JNI to construct a new Java Slice + * without an underlying C++ object set + * at creation time. + */ + private Slice() { + super(); + disOwnNativeHandle(); + } + + /** + * Constructs a slice + * where the data is taken from + * a String. + */ + public Slice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + * Constructs a slice + * where the data is a copy of + * the byte array from a specific offset. + */ + public Slice(final byte[] data, final int offset) { + super(); + createNewSlice0(data, offset); + } + + /** + * Constructs a slice + * where the data is a copy of + * the byte array. + */ + public Slice(final byte[] data) { + super(); + createNewSlice1(data); + } + + @Override protected final native byte[] data0(long handle); + private native void createNewSlice0(byte[] data, int length); + private native void createNewSlice1(byte[] data); +} diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc new file mode 100644 index 000000000..54d6137cd --- /dev/null +++ b/java/rocksjni/comparator.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Comparator. 
+ +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractComparator.h" +#include "include/org_rocksdb_Comparator.h" +#include "include/org_rocksdb_DirectComparator.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Comparator + * Method: createNewComparator0 + * Signature: ()V + */ +void Java_org_rocksdb_Comparator_createNewComparator0( + JNIEnv* env, jobject jobj) { + const rocksdb::ComparatorJniCallback* c = new rocksdb::ComparatorJniCallback(env, jobj); + rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); +} + +// + +//GetJavaVM(&m_jvm); + assert(rs == JNI_OK); + + // Note: we want to access the Java Comparator instance + // across multiple method calls, so we create a global ref + m_jComparator = env->NewGlobalRef(jComparator); + + // Note: The name of a Comparator will not change during it's lifetime, + // so we cache it in a global var + jmethodID jNameMethodId = AbstractComparatorJni::getNameMethodId(env); + jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId); + m_name = JniUtil::copyString(env, jsName); //also releases jsName + + m_jCompareMethodId = AbstractComparatorJni::getCompareMethodId(env); + m_jFindShortestSeparatorMethodId = AbstractComparatorJni::getFindShortestSeparatorMethodId(env); + m_jFindShortSuccessorMethodId = AbstractComparatorJni::getFindShortSuccessorMethodId(env); +} + +/** + * Attach/Get a JNIEnv for the current native thread + */ +JNIEnv* BaseComparatorJniCallback::getJniEnv() const { + JNIEnv *env; + jint rs = m_jvm->AttachCurrentThread((void **)&env, NULL); + assert(rs == JNI_OK); + return env; +}; + +const char* BaseComparatorJniCallback::Name() const { + return m_name.c_str(); +} + +int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { + + JNIEnv* m_env = getJniEnv(); + + AbstractSliceJni::setHandle(m_env, m_jSliceA, &a); + AbstractSliceJni::setHandle(m_env, m_jSliceB, &b); + + jint result = m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, m_jSliceB); + + m_jvm->DetachCurrentThread(); + + return result; +} + +void BaseComparatorJniCallback::FindShortestSeparator(std::string* start, const Slice& limit) const { + + if (start == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* startUtf = start->c_str(); + jstring jsStart = m_env->NewStringUTF(startUtf); + + AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit); + + jstring jsResultStart = (jstring)m_env->CallObjectMethod(m_jComparator, m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit); + + m_env->DeleteLocalRef(jsStart); + + if(jsResultStart != nullptr) { + //update start with result + *start = JniUtil::copyString(m_env, jsResultStart); //also releases jsResultStart + } + + m_jvm->DetachCurrentThread(); +} + +void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { + + if (key == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* keyUtf = key->c_str(); + jstring jsKey = m_env->NewStringUTF(keyUtf); + + jstring jsResultKey = (jstring)m_env->CallObjectMethod(m_jComparator, m_jFindShortSuccessorMethodId, jsKey); + + m_env->DeleteLocalRef(jsKey); + + if(jsResultKey != nullptr) { + //update key with result + *key = JniUtil::copyString(m_env, jsResultKey); //also releases jsResultKey + } + + m_jvm->DetachCurrentThread(); +} + +BaseComparatorJniCallback::~BaseComparatorJniCallback() { + + // NOTE: we do not need to delete m_name here, + // I am not yet sure why, but 
doing so causes the error: + // java(13051,0x109f54000) malloc: *** error for object 0x109f52fa9: pointer being freed was not allocated + // *** set a breakpoint in malloc_error_break to debug + //delete[] m_name; + + JNIEnv* m_env = getJniEnv(); + + m_env->DeleteGlobalRef(m_jComparator); + m_env->DeleteGlobalRef(m_jSliceA); + m_env->DeleteGlobalRef(m_jSliceB); + m_env->DeleteGlobalRef(m_jSliceLimit); + + // Note: do not need to explicitly detach, as this function is effectively + // called from the Java class's disposeInternal method, and so already + // has an attached thread, getJniEnv above is just a no-op Attach to get the env + //jvm->DetachCurrentThread(); +} + +ComparatorJniCallback::ComparatorJniCallback( + JNIEnv* env, jobject jComparator) : BaseComparatorJniCallback(env, jComparator) { + + m_jSliceA = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(SliceJni::construct0(env)); +} + +DirectComparatorJniCallback::DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator) : BaseComparatorJniCallback(env, jComparator) { + + m_jSliceA = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(DirectSliceJni::construct0(env)); +} +} // namespace rocksdb diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h new file mode 100644 index 000000000..f188dce86 --- /dev/null +++ b/java/rocksjni/comparatorjnicallback.h @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::Comparator and rocksdb::DirectComparator. 
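On the Java side, the DirectComparator/DirectSlice pair driven by the callback classes declared below could look roughly like this. A sketch only: the class name is hypothetical, keys are compared byte-wise, and the no-argument constructor matches this patch rather than the ComparatorOptions variant introduced later in the series.

    import org.rocksdb.DirectComparator;
    import org.rocksdb.DirectSlice;

    // Hypothetical byte-wise DirectComparator reading keys through DirectSlice.
    public class BytewiseDirectComparator extends DirectComparator {
      @Override
      public String name() {
        return "example.BytewiseDirectComparator";
      }

      @Override
      public int compare(final DirectSlice a, final DirectSlice b) {
        final int len = Math.min(a.size(), b.size());
        for (int i = 0; i < len; i++) {
          // get(i) reads a single byte of the key from the backing buffer.
          final int diff = (a.get(i) & 0xFF) - (b.get(i) & 0xFF);
          if (diff != 0) {
            return diff;
          }
        }
        return a.size() - b.size();
      }
    }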
+ +#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ +#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ + +#include +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" + +namespace rocksdb { +class BaseComparatorJniCallback : public Comparator { + public: + BaseComparatorJniCallback(JNIEnv* env, jobject jComparator); + virtual ~BaseComparatorJniCallback(); + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator(std::string* start, const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + private: + JavaVM* m_jvm; + jobject m_jComparator; + std::string m_name; + jmethodID m_jCompareMethodId; + jmethodID m_jFindShortestSeparatorMethodId; + jmethodID m_jFindShortSuccessorMethodId; + JNIEnv* getJniEnv() const; + + protected: + jobject m_jSliceA; + jobject m_jSliceB; + jobject m_jSliceLimit; +}; + +class ComparatorJniCallback : public BaseComparatorJniCallback { + public: + ComparatorJniCallback(JNIEnv* env, jobject jComparator); +}; + +class DirectComparatorJniCallback : public BaseComparatorJniCallback { + public: + DirectComparatorJniCallback(JNIEnv* env, jobject jComparator); +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index ef104d92b..8e94f965b 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -164,6 +164,17 @@ jlong Java_org_rocksdb_Options_statisticsPtr( return reinterpret_cast(st); } +/* + * Class: org_rocksdb_Options + * Method: setComparatorHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setComparatorHandle( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); +} + /* * Class: org_rocksdb_Options * Method: maxWriteBufferNumber diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 8300a6e66..68403ebff 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -16,6 +16,7 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksjni/comparatorjnicallback.h" namespace rocksdb { @@ -362,6 +363,136 @@ class ColumnFamilyHandleJni { } }; +class AbstractComparatorJni { + public: + // Get the java class id of org.rocksdb.Comparator. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/AbstractComparator"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.Comparator + // that stores the pointer to rocksdb::Comparator. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the java method `name` of org.rocksdb.Comparator. + static jmethodID getNameMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `compare` of org.rocksdb.Comparator. + static jmethodID getCompareMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "compare", "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortestSeparator` of org.rocksdb.Comparator. 
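For reference, the optional shortening hooks whose method ids are looked up here can also be overridden from Java. A sketch of a byte-wise comparator implementing findShortestSeparator, assuming plain ASCII keys (the class name is hypothetical; returning null keeps the start key unchanged):

    import org.rocksdb.Comparator;
    import org.rocksdb.Slice;

    public class ShorteningComparator extends Comparator {
      @Override
      public String name() {
        return "example.ShorteningComparator";
      }

      @Override
      public int compare(final Slice a, final Slice b) {
        return a.compare(b);  // delegate to the native byte-wise comparison
      }

      @Override
      public String findShortestSeparator(final String start, final Slice limit) {
        final String lim = limit.toString();
        final int minLen = Math.min(start.length(), lim.length());
        int i = 0;
        while (i < minLen && start.charAt(i) == lim.charAt(i)) {
          i++;
        }
        if (i >= minLen) {
          return null;                      // one key is a prefix of the other
        }
        final char c = start.charAt(i);
        if (c + 1 < lim.charAt(i)) {
          // prefix + (c + 1) is greater than start and still less than limit.
          return start.substring(0, i) + (char) (c + 1);
        }
        return null;
      }
    }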
+ static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "findShortestSeparator", "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortSuccessor` of org.rocksdb.Comparator. + static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "findShortSuccessor", "(Ljava/lang/String;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the pointer to ComparatorJniCallback. + static rocksdb::BaseComparatorJniCallback* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the ComparatorJniCallback pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::BaseComparatorJniCallback* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class AbstractSliceJni { + public: + // Get the java class id of org.rocksdb.Slice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/AbstractSlice"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.Slice + // that stores the pointer to rocksdb::Slice. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to Slice. + static rocksdb::Slice* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the Slice pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::Slice* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class SliceJni { + public: + // Get the java class id of org.rocksdb.Slice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/Slice"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); + } +}; + +class DirectSliceJni { + public: + // Get the java class id of org.rocksdb.DirectSlice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/DirectSlice"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); + } +}; + class ListJni { public: // Get the java class id of java.util.List. @@ -425,5 +556,21 @@ class ListJni { return mid; } }; + +class JniUtil { + public: + + /** + * Copies a jstring to a std::string + * and releases the original jstring + */ + static std::string copyString(JNIEnv* env, jstring js) { + const char *utf = env->GetStringUTFChars(js, NULL); + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + return name; + } +}; + } // namespace rocksdb #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc new file mode 100644 index 000000000..a0a6f71e6 --- /dev/null +++ b/java/rocksjni/slice.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Slice. + +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractSlice.h" +#include "include/org_rocksdb_Slice.h" +#include "include/org_rocksdb_DirectSlice.h" +#include "rocksdb/slice.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice0 + * Signature: ([BI)V + */ +void Java_org_rocksdb_Slice_createNewSlice0( + JNIEnv * env, jobject jobj, jbyteArray data, jint offset) { + + const jsize dataSize = env->GetArrayLength(data); + const int len = dataSize - offset; + //jbyte ptrData[len]; + jbyte* ptrData = new jbyte[len]; + env->GetByteArrayRegion(data, offset, len, ptrData); + + const rocksdb::Slice* slice = new rocksdb::Slice((const char*)ptrData, len); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + +} + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice1 + * Signature: ([B)V + */ +void Java_org_rocksdb_Slice_createNewSlice1( + JNIEnv * env, jobject jobj, jbyteArray data) { + + jboolean isCopy; + jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); + + const rocksdb::Slice* slice = new rocksdb::Slice((const char*)ptrData, env->GetArrayLength(data)); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + + env->ReleaseByteArrayElements(data, ptrData, JNI_COMMIT); + + //TODO where do we free ptrData later? + //do we need to call env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT) in the org.rocksdb.Slice#dispose() method? +} + +/* + * Class: org_rocksdb_Slice + * Method: data0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_Slice_data0( + JNIEnv* env, jobject jobj, jlong handle) { + const rocksdb::Slice* slice = reinterpret_cast(handle); + const int len = slice->size(); + const jbyteArray data = env->NewByteArray(len); + env->SetByteArrayRegion(data, 0, len, (jbyte*)slice->data()); + return data; +} + +// + +// { - public Comparator() { + public Comparator(final ComparatorOptions copt) { super(); - createNewComparator0(); + createNewComparator0(copt.nativeHandle_); } - private native void createNewComparator0(); + private native void createNewComparator0(final long comparatorOptionsHandle); } diff --git a/java/org/rocksdb/ComparatorOptions.java b/java/org/rocksdb/ComparatorOptions.java new file mode 100644 index 000000000..a55091dfa --- /dev/null +++ b/java/org/rocksdb/ComparatorOptions.java @@ -0,0 +1,49 @@ +package org.rocksdb; + +public class ComparatorOptions extends RocksObject { + + public ComparatorOptions() { + super(); + newComparatorOptions(); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. 
+ * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the reference to the current comparator options. + */ + public ComparatorOptions setUseAdaptiveMutex(final boolean useAdaptiveMutex) { + assert (isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newComparatorOptions(); + private native boolean useAdaptiveMutex(final long handle); + private native void setUseAdaptiveMutex(final long handle, final boolean useAdaptiveMutex); + private native void disposeInternal(long handle); +} diff --git a/java/org/rocksdb/DirectComparator.java b/java/org/rocksdb/DirectComparator.java index 25b4058ae..86476c40e 100644 --- a/java/org/rocksdb/DirectComparator.java +++ b/java/org/rocksdb/DirectComparator.java @@ -16,10 +16,10 @@ package org.rocksdb; */ public abstract class DirectComparator extends AbstractComparator { - public DirectComparator() { + public DirectComparator(final ComparatorOptions copt) { super(); - createNewDirectComparator0(); + createNewDirectComparator0(copt.nativeHandle_); } - private native void createNewDirectComparator0(); + private native void createNewDirectComparator0(final long comparatorOptionsHandle); } diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc index 54d6137cd..8dcda6aa6 100644 --- a/java/rocksjni/comparator.cc +++ b/java/rocksjni/comparator.cc @@ -18,6 +18,28 @@ #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" +// + +void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( + JNIEnv* env, jobject jobj, jstring jpath, jboolean jshare_table_files, + jboolean jsync, jboolean jdestroy_old_data, jboolean jbackup_log_files, + jlong jbackup_rate_limit, jlong jrestore_rate_limit) { + jbackup_rate_limit = (jbackup_rate_limit <= 0) ? 0 : jbackup_rate_limit; + jrestore_rate_limit = (jrestore_rate_limit <= 0) ? 
0 : jrestore_rate_limit; + + const char* cpath = env->GetStringUTFChars(jpath, 0); + + auto bopt = new rocksdb::BackupableDBOptions(cpath, nullptr, + jshare_table_files, nullptr, jsync, jdestroy_old_data, jbackup_log_files, + jbackup_rate_limit, jrestore_rate_limit); + + env->ReleaseStringUTFChars(jpath, cpath); + + rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); +} + +// + //(jhandle)->tailing = static_cast(jtailing); } + +///////////////////////////////////////////////////////////////////// +// rocksdb::ComparatorOptions +/* + * Class: org_rocksdb_ComparatorOptions + * Method: newComparatorOptions + * Signature: ()V + */ +void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( + JNIEnv* env, jobject jobj) { + auto comparator_opt = new rocksdb::ComparatorJniCallbackOptions(); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, comparator_opt); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( + JNIEnv * env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( + JNIEnv * env, jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) { + reinterpret_cast(jhandle)->use_adaptive_mutex = + static_cast(juse_adaptive_mutex); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ComparatorOptions_disposeInternal( + JNIEnv * env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); +} diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 68403ebff..bd40d4290 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -363,6 +363,33 @@ class ColumnFamilyHandleJni { } }; +class ComparatorOptionsJni { + public: + // Get the java class id of org.rocksdb.ComparatorOptions. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ComparatorOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.ComparatorOptions + // that stores the pointer to rocksdb::ComparatorJniCallbackOptions. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Pass the ComparatorJniCallbackOptions pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, const rocksdb::ComparatorJniCallbackOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + class AbstractComparatorJni { public: // Get the java class id of org.rocksdb.Comparator. 
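User code constructs the options object handled by the new ComparatorOptionsJni class above and passes it to the comparator constructor. Roughly (names are illustrative; use_adaptive_mutex is the only knob this patch exposes):

    import org.rocksdb.Comparator;
    import org.rocksdb.ComparatorOptions;
    import org.rocksdb.RocksDB;
    import org.rocksdb.Slice;

    public class ComparatorOptionsExample {
      public static void main(final String[] args) {
        // Make sure the native library is loaded before creating native handles.
        RocksDB.loadLibrary();

        final ComparatorOptions copt =
            new ComparatorOptions().setUseAdaptiveMutex(true);

        final Comparator cmp = new Comparator(copt) {
          @Override
          public String name() {
            return "example.DelegatingComparator";
          }

          @Override
          public int compare(final Slice a, final Slice b) {
            return a.compare(b);  // native byte-wise comparison
          }
        };

        // ... hand cmp to Options#setComparator and open a database ...

        cmp.dispose();
        copt.dispose();
      }
    }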
From 25641bfc9c68671fa442b69506f7b638252ae341 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Fri, 15 Aug 2014 14:42:03 +0100 Subject: [PATCH 276/829] Fix to memory dealocation when creating a slice from a byte buffer --- java/org/rocksdb/Slice.java | 16 ++++++++++++++++ java/rocksjni/slice.cc | 22 ++++++++++++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java index 28c29c43d..e6932cc76 100644 --- a/java/org/rocksdb/Slice.java +++ b/java/org/rocksdb/Slice.java @@ -55,7 +55,23 @@ public class Slice extends AbstractSlice { createNewSlice1(data); } + /** + * Deletes underlying C++ slice pointer + * and any buffered data. + * + *


        + * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + super.disposeInternal(); + disposeInternalBuf(nativeHandle_); + } + @Override protected final native byte[] data0(long handle); private native void createNewSlice0(byte[] data, int length); private native void createNewSlice1(byte[] data); + private native void disposeInternalBuf(long handle); } diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index a0a6f71e6..e54b9a745 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -130,16 +130,19 @@ void Java_org_rocksdb_Slice_createNewSlice0( void Java_org_rocksdb_Slice_createNewSlice1( JNIEnv * env, jobject jobj, jbyteArray data) { + const int len = env->GetArrayLength(data); + jboolean isCopy; jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); + const char* buf = new char[len]; + memcpy((void*)buf, ptrData, len); - const rocksdb::Slice* slice = new rocksdb::Slice((const char*)ptrData, env->GetArrayLength(data)); + const rocksdb::Slice* slice = new rocksdb::Slice(buf, env->GetArrayLength(data)); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); - env->ReleaseByteArrayElements(data, ptrData, JNI_COMMIT); + env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); - //TODO where do we free ptrData later? - //do we need to call env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT) in the org.rocksdb.Slice#dispose() method? + //NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method } /* @@ -156,6 +159,17 @@ jbyteArray Java_org_rocksdb_Slice_data0( return data; } +/* + * Class: org_rocksdb_Slice + * Method: disposeInternalBuf + * Signature: (J)V + */ +void Java_org_rocksdb_Slice_disposeInternalBuf( + JNIEnv * env, jobject jobj, jlong handle) { + const rocksdb::Slice* slice = reinterpret_cast(handle); + delete [] slice->data_; +} + // // +// void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( JNIEnv* env, jobject jobj, jstring jpath, jboolean jshare_table_files, @@ -37,10 +37,9 @@ void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); } +// -// - -// /* * Class: org_rocksdb_AbstractComparator @@ -51,10 +50,9 @@ void Java_org_rocksdb_AbstractComparator_disposeInternal( JNIEnv* env, jobject jobj, jlong handle) { delete reinterpret_cast(handle); } +// -// - -// /* * Class: org_rocksdb_Comparator @@ -63,14 +61,15 @@ void Java_org_rocksdb_AbstractComparator_disposeInternal( */ void Java_org_rocksdb_Comparator_createNewComparator0( JNIEnv* env, jobject jobj, jlong copt_handle) { - const rocksdb::ComparatorJniCallbackOptions* copt = reinterpret_cast(copt_handle); - const rocksdb::ComparatorJniCallback* c = new rocksdb::ComparatorJniCallback(env, jobj, copt); + const rocksdb::ComparatorJniCallbackOptions* copt = + reinterpret_cast(copt_handle); + const rocksdb::ComparatorJniCallback* c = + new rocksdb::ComparatorJniCallback(env, jobj, copt); rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); } +// -// - -// /* * Class: org_rocksdb_DirectComparator @@ -79,10 +78,10 @@ void Java_org_rocksdb_Comparator_createNewComparator0( */ void Java_org_rocksdb_DirectComparator_createNewDirectComparator0( JNIEnv* env, jobject jobj, jlong copt_handle) { - const rocksdb::ComparatorJniCallbackOptions* copt = reinterpret_cast(copt_handle); - const rocksdb::DirectComparatorJniCallback* c = new 
rocksdb::DirectComparatorJniCallback(env, jobj, copt); + const rocksdb::ComparatorJniCallbackOptions* copt = + reinterpret_cast(copt_handle); + const rocksdb::DirectComparatorJniCallback* c = + new rocksdb::DirectComparatorJniCallback(env, jobj, copt); rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); } - -// - +// diff --git a/java/rocksjni/comparatorjnicallback.cc b/java/rocksjni/comparatorjnicallback.cc index 22bba334c..1f271fe42 100644 --- a/java/rocksjni/comparatorjnicallback.cc +++ b/java/rocksjni/comparatorjnicallback.cc @@ -7,14 +7,15 @@ // rocksdb::Comparator. #include "rocksjni/comparatorjnicallback.h" -#include "portal.h" +#include "rocksjni/portal.h" namespace rocksdb { BaseComparatorJniCallback::BaseComparatorJniCallback( - JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt) { + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) { - //mutex is used for synchronisation when we are re-using - //the global java slice objects + // mutex is used for synchronisation when we are re-using + // the global java slice objects mutex_ = new port::Mutex(copt->use_adaptive_mutex); // Note: Comparator methods may be accessed by multiple threads, @@ -30,11 +31,13 @@ BaseComparatorJniCallback::BaseComparatorJniCallback( // so we cache it in a global var jmethodID jNameMethodId = AbstractComparatorJni::getNameMethodId(env); jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId); - m_name = JniUtil::copyString(env, jsName); //also releases jsName + m_name = JniUtil::copyString(env, jsName); // also releases jsName m_jCompareMethodId = AbstractComparatorJni::getCompareMethodId(env); - m_jFindShortestSeparatorMethodId = AbstractComparatorJni::getFindShortestSeparatorMethodId(env); - m_jFindShortSuccessorMethodId = AbstractComparatorJni::getFindShortSuccessorMethodId(env); + m_jFindShortestSeparatorMethodId = + AbstractComparatorJni::getFindShortestSeparatorMethodId(env); + m_jFindShortSuccessorMethodId = + AbstractComparatorJni::getFindShortSuccessorMethodId(env); } /** @@ -42,24 +45,25 @@ BaseComparatorJniCallback::BaseComparatorJniCallback( */ JNIEnv* BaseComparatorJniCallback::getJniEnv() const { JNIEnv *env; - jint rs = m_jvm->AttachCurrentThread((void **)&env, NULL); + jint rs = m_jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); assert(rs == JNI_OK); return env; -}; +} const char* BaseComparatorJniCallback::Name() const { return m_name.c_str(); } int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { - JNIEnv* m_env = getJniEnv(); mutex_->Lock(); AbstractSliceJni::setHandle(m_env, m_jSliceA, &a); AbstractSliceJni::setHandle(m_env, m_jSliceB, &b); - jint result = m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, m_jSliceB); + jint result = + m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, + m_jSliceB); mutex_->Unlock(); @@ -68,8 +72,8 @@ int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { return result; } -void BaseComparatorJniCallback::FindShortestSeparator(std::string* start, const Slice& limit) const { - +void BaseComparatorJniCallback::FindShortestSeparator( + std::string* start, const Slice& limit) const { if (start == nullptr) { return; } @@ -82,22 +86,24 @@ void BaseComparatorJniCallback::FindShortestSeparator(std::string* start, const mutex_->Lock(); AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit); - jstring jsResultStart = (jstring)m_env->CallObjectMethod(m_jComparator, m_jFindShortestSeparatorMethodId, 
jsStart, m_jSliceLimit); + jstring jsResultStart = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit); mutex_->Unlock(); m_env->DeleteLocalRef(jsStart); - if(jsResultStart != nullptr) { - //update start with result - *start = JniUtil::copyString(m_env, jsResultStart); //also releases jsResultStart + if (jsResultStart != nullptr) { + // update start with result + *start = + JniUtil::copyString(m_env, jsResultStart); // also releases jsResultStart } m_jvm->DetachCurrentThread(); } void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { - if (key == nullptr) { return; } @@ -107,13 +113,16 @@ void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { const char* keyUtf = key->c_str(); jstring jsKey = m_env->NewStringUTF(keyUtf); - jstring jsResultKey = (jstring)m_env->CallObjectMethod(m_jComparator, m_jFindShortSuccessorMethodId, jsKey); + jstring jsResultKey = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortSuccessorMethodId, jsKey); m_env->DeleteLocalRef(jsKey); - if(jsResultKey != nullptr) { - //update key with result - *key = JniUtil::copyString(m_env, jsResultKey); //also releases jsResultKey + if (jsResultKey != nullptr) { + // update key with result + *key = + JniUtil::copyString(m_env, jsResultKey); // also releases jsResultKey } m_jvm->DetachCurrentThread(); @@ -121,7 +130,7 @@ void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { BaseComparatorJniCallback::~BaseComparatorJniCallback() { JNIEnv* m_env = getJniEnv(); - + m_env->DeleteGlobalRef(m_jComparator); m_env->DeleteGlobalRef(m_jSliceA); m_env->DeleteGlobalRef(m_jSliceB); @@ -129,21 +138,23 @@ BaseComparatorJniCallback::~BaseComparatorJniCallback() { // Note: do not need to explicitly detach, as this function is effectively // called from the Java class's disposeInternal method, and so already - // has an attached thread, getJniEnv above is just a no-op Attach to get the env - //jvm->DetachCurrentThread(); + // has an attached thread, getJniEnv above is just a no-op Attach to get + // the env jvm->DetachCurrentThread(); } ComparatorJniCallback::ComparatorJniCallback( - JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt) : BaseComparatorJniCallback(env, jComparator, copt) { - + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { m_jSliceA = env->NewGlobalRef(SliceJni::construct0(env)); m_jSliceB = env->NewGlobalRef(SliceJni::construct0(env)); m_jSliceLimit = env->NewGlobalRef(SliceJni::construct0(env)); } DirectComparatorJniCallback::DirectComparatorJniCallback( - JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt) : BaseComparatorJniCallback(env, jComparator, copt) { - + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { m_jSliceA = env->NewGlobalRef(DirectSliceJni::construct0(env)); m_jSliceB = env->NewGlobalRef(DirectSliceJni::construct0(env)); m_jSliceLimit = env->NewGlobalRef(DirectSliceJni::construct0(env)); diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h index 8ca0ac64f..cda32fce1 100644 --- a/java/rocksjni/comparatorjnicallback.h +++ b/java/rocksjni/comparatorjnicallback.h @@ -10,6 +10,7 @@ #define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ #include +#include #include "rocksdb/comparator.h" #include "rocksdb/slice.h" #include "port/port.h" 
@@ -43,15 +44,18 @@ struct ComparatorJniCallbackOptions { * introduce locking in regions of those methods via mutex_. */ class BaseComparatorJniCallback : public Comparator { - public: - BaseComparatorJniCallback(JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt); + public: + BaseComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); virtual ~BaseComparatorJniCallback(); virtual const char* Name() const; virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator(std::string* start, const Slice& limit) const; + virtual void FindShortestSeparator( + std::string* start, const Slice& limit) const; virtual void FindShortSuccessor(std::string* key) const; - private: + private: port::Mutex* mutex_; JavaVM* m_jvm; jobject m_jComparator; @@ -61,20 +65,24 @@ class BaseComparatorJniCallback : public Comparator { jmethodID m_jFindShortSuccessorMethodId; JNIEnv* getJniEnv() const; - protected: + protected: jobject m_jSliceA; jobject m_jSliceB; jobject m_jSliceLimit; }; class ComparatorJniCallback : public BaseComparatorJniCallback { - public: - ComparatorJniCallback(JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt); + public: + ComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); }; class DirectComparatorJniCallback : public BaseComparatorJniCallback { - public: - DirectComparatorJniCallback(JNIEnv* env, jobject jComparator, const ComparatorJniCallbackOptions* copt); + public: + DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); }; } // namespace rocksdb diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 4367fc708..ceb4ce031 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -16,6 +16,7 @@ #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" +#include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -25,7 +26,6 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/comparator.h" -#include "comparatorjnicallback.h" /* * Class: org_rocksdb_Options @@ -1794,7 +1794,8 @@ void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( */ jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( JNIEnv * env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->use_adaptive_mutex; + return reinterpret_cast(jhandle) + ->use_adaptive_mutex; } /* @@ -1804,8 +1805,8 @@ jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( */ void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( JNIEnv * env, jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) { - reinterpret_cast(jhandle)->use_adaptive_mutex = - static_cast(juse_adaptive_mutex); + reinterpret_cast(jhandle) + ->use_adaptive_mutex = static_cast(juse_adaptive_mutex); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index bd40d4290..32452ae0b 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -12,6 +12,8 @@ #include #include +#include + #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/status.h" @@ -364,7 +366,7 @@ class ColumnFamilyHandleJni { }; class ComparatorOptionsJni { - public: + public: // Get the java class id of org.rocksdb.ComparatorOptions. 
static jclass getJClass(JNIEnv* env) { jclass jclazz = env->FindClass("org/rocksdb/ComparatorOptions"); @@ -383,7 +385,8 @@ class ComparatorOptionsJni { // Pass the ComparatorJniCallbackOptions pointer to the java side. static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::ComparatorJniCallbackOptions* op) { + JNIEnv* env, jobject jobj, + const rocksdb::ComparatorJniCallbackOptions* op) { env->SetLongField( jobj, getHandleFieldID(env), reinterpret_cast(op)); @@ -418,37 +421,41 @@ class AbstractComparatorJni { // Get the java method `compare` of org.rocksdb.Comparator. static jmethodID getCompareMethodId(JNIEnv* env) { - static jmethodID mid = env->GetMethodID( - getJClass(env), "compare", "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); + static jmethodID mid = env->GetMethodID(getJClass(env), + "compare", + "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); assert(mid != nullptr); return mid; } // Get the java method `findShortestSeparator` of org.rocksdb.Comparator. static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) { - static jmethodID mid = env->GetMethodID( - getJClass(env), "findShortestSeparator", "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortestSeparator", + "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); assert(mid != nullptr); return mid; } // Get the java method `findShortSuccessor` of org.rocksdb.Comparator. static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) { - static jmethodID mid = env->GetMethodID( - getJClass(env), "findShortSuccessor", "(Ljava/lang/String;)Ljava/lang/String;"); + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortSuccessor", + "(Ljava/lang/String;)Ljava/lang/String;"); assert(mid != nullptr); return mid; } // Get the pointer to ComparatorJniCallback. - static rocksdb::BaseComparatorJniCallback* getHandle(JNIEnv* env, jobject jobj) { + static rocksdb::BaseComparatorJniCallback* getHandle( + JNIEnv* env, jobject jobj) { return reinterpret_cast( env->GetLongField(jobj, getHandleFieldID(env))); } // Pass the ComparatorJniCallback pointer to the java side. 
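Because these wrappers only hold raw native pointers, tear-down order matters: close the database first, then dispose the Options, and only then the comparator and its options, as the javadocs in this series ask. A sketch (the path and class names are placeholders):

    import org.rocksdb.Comparator;
    import org.rocksdb.ComparatorOptions;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.Slice;

    public class DisposeOrderExample {
      public static void main(final String[] args) {
        // Make sure the native library is loaded before creating native handles.
        RocksDB.loadLibrary();

        RocksDB db = null;
        Options options = null;
        ComparatorOptions copt = null;
        Comparator comparator = null;
        try {
          copt = new ComparatorOptions();
          comparator = new Comparator(copt) {
            @Override
            public String name() {
              return "example.Comparator";
            }

            @Override
            public int compare(final Slice a, final Slice b) {
              return a.compare(b);
            }
          };
          options = new Options()
              .setCreateIfMissing(true)
              .setComparator(comparator);
          db = RocksDB.open(options, "/tmp/comparator-example");
          // ... reads and writes ...
        } catch (final RocksDBException e) {
          e.printStackTrace();
        } finally {
          // Close the database before disposing anything it references.
          if (db != null) {
            db.close();
          }
          if (options != null) {
            options.dispose();
          }
          if (comparator != null) {
            comparator.dispose();
          }
          if (copt != null) {
            copt.dispose();
          }
        }
      }
    }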
static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::BaseComparatorJniCallback* op) { + JNIEnv* env, jobject jobj, const rocksdb::BaseComparatorJniCallback* op) { env->SetLongField( jobj, getHandleFieldID(env), reinterpret_cast(op)); @@ -585,8 +592,7 @@ class ListJni { }; class JniUtil { - public: - + public: /** * Copies a jstring to a std::string * and releases the original jstring diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index e54b9a745..0d8b92c9c 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -17,7 +17,7 @@ #include "rocksdb/slice.h" #include "rocksjni/portal.h" -// /* * Class: org_rocksdb_AbstractSlice @@ -73,7 +73,8 @@ jstring Java_org_rocksdb_AbstractSlice_toString0( jint Java_org_rocksdb_AbstractSlice_compare0( JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) { const rocksdb::Slice* slice = reinterpret_cast(handle); - const rocksdb::Slice* otherSlice = reinterpret_cast(otherHandle); + const rocksdb::Slice* otherSlice = + reinterpret_cast(otherHandle); return slice->compare(*otherSlice); } @@ -85,7 +86,8 @@ jint Java_org_rocksdb_AbstractSlice_compare0( jboolean Java_org_rocksdb_AbstractSlice_startsWith0( JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) { const rocksdb::Slice* slice = reinterpret_cast(handle); - const rocksdb::Slice* otherSlice = reinterpret_cast(otherHandle); + const rocksdb::Slice* otherSlice = + reinterpret_cast(otherHandle); return slice->starts_with(*otherSlice); } @@ -99,9 +101,9 @@ void Java_org_rocksdb_AbstractSlice_disposeInternal( delete reinterpret_cast(handle); } -// +// -// /* * Class: org_rocksdb_Slice @@ -113,13 +115,11 @@ void Java_org_rocksdb_Slice_createNewSlice0( const jsize dataSize = env->GetArrayLength(data); const int len = dataSize - offset; - //jbyte ptrData[len]; jbyte* ptrData = new jbyte[len]; env->GetByteArrayRegion(data, offset, len, ptrData); const rocksdb::Slice* slice = new rocksdb::Slice((const char*)ptrData, len); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); - } /* @@ -135,14 +135,15 @@ void Java_org_rocksdb_Slice_createNewSlice1( jboolean isCopy; jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); const char* buf = new char[len]; - memcpy((void*)buf, ptrData, len); + memcpy(const_cast(buf), ptrData, len); - const rocksdb::Slice* slice = new rocksdb::Slice(buf, env->GetArrayLength(data)); + const rocksdb::Slice* slice = + new rocksdb::Slice(buf, env->GetArrayLength(data)); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); - //NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method + // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method } /* @@ -155,7 +156,8 @@ jbyteArray Java_org_rocksdb_Slice_data0( const rocksdb::Slice* slice = reinterpret_cast(handle); const int len = slice->size(); const jbyteArray data = env->NewByteArray(len); - env->SetByteArrayRegion(data, 0, len, (jbyte*)slice->data()); + env->SetByteArrayRegion(data, 0, len, + reinterpret_cast(const_cast(slice->data()))); return data; } @@ -170,9 +172,9 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( delete [] slice->data_; } -// +// -// /* * Class: org_rocksdb_DirectSlice @@ -181,7 +183,8 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( JNIEnv* env, jobject jobj, jobject data, jint length) { - const char* ptrData = (char*)env->GetDirectBufferAddress(data); + const char* ptrData = + 
reinterpret_cast(env->GetDirectBufferAddress(data)); const rocksdb::Slice* slice = new rocksdb::Slice(ptrData, length); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -193,7 +196,8 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( JNIEnv* env, jobject jobj, jobject data) { - const char* ptrData = (char*)env->GetDirectBufferAddress(data); + const char* ptrData = + reinterpret_cast(env->GetDirectBufferAddress(data)); const rocksdb::Slice* slice = new rocksdb::Slice(ptrData); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -206,7 +210,8 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( jobject Java_org_rocksdb_DirectSlice_data0( JNIEnv* env, jobject jobj, jlong handle) { const rocksdb::Slice* slice = reinterpret_cast(handle); - return env->NewDirectByteBuffer((void*)slice->data(), slice->size()); + return env->NewDirectByteBuffer(const_cast(slice->data()), + slice->size()); } /* @@ -228,6 +233,7 @@ jbyte Java_org_rocksdb_DirectSlice_get0( void Java_org_rocksdb_DirectSlice_clear0( JNIEnv* env, jobject jobj, jlong handle) { rocksdb::Slice* slice = reinterpret_cast(handle); + delete [] slice->data_; slice->clear(); } @@ -242,4 +248,4 @@ void Java_org_rocksdb_DirectSlice_removePrefix0( slice->remove_prefix(length); } -// +// From c63494fb61488b2c9380f17424fe277a2c094e9d Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Thu, 25 Sep 2014 14:12:02 +0100 Subject: [PATCH 278/829] Tests for ComparatorOptions, Comparator and DirectComparator, and by proxy we also exercise Slice and DirectSlice --- java/Makefile | 4 +- java/org/rocksdb/AbstractComparator.java | 2 +- .../rocksdb/test/AbstractComparatorTest.java | 166 ++++++++++++++++++ .../rocksdb/test/ComparatorOptionsTest.java | 34 ++++ java/org/rocksdb/test/ComparatorTest.java | 45 +++++ .../rocksdb/test/DirectComparatorTest.java | 48 +++++ java/org/rocksdb/test/Types.java | 43 +++++ 7 files changed, 340 insertions(+), 2 deletions(-) create mode 100644 java/org/rocksdb/test/AbstractComparatorTest.java create mode 100644 java/org/rocksdb/test/ComparatorOptionsTest.java create mode 100644 java/org/rocksdb/test/ComparatorTest.java create mode 100644 java/org/rocksdb/test/DirectComparatorTest.java create mode 100644 java/org/rocksdb/test/Types.java diff --git a/java/Makefile b/java/Makefile index 19df0fa69..697df5175 100644 --- a/java/Makefile +++ b/java/Makefile @@ -46,7 +46,9 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest - @rm -rf /tmp/rocksdbjni_* + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DirectComparatorTest db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java index fa797b273..e5c503025 100644 --- a/java/org/rocksdb/AbstractComparator.java +++ b/java/org/rocksdb/AbstractComparator.java @@ -14,7 +14,7 @@ package org.rocksdb; * @see org.rocksdb.Comparator * @see org.rocksdb.DirectComparator */ -abstract class AbstractComparator extends RocksObject { +public abstract class 
AbstractComparator extends RocksObject { public abstract String name(); diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java new file mode 100644 index 000000000..e3cc6bb77 --- /dev/null +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -0,0 +1,166 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.*; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.Random; + +import static org.rocksdb.test.Types.byteToInt; +import static org.rocksdb.test.Types.intToByte; + +/** + * Abstract tests for both Comparator and DirectComparator + */ +public abstract class AbstractComparatorTest { + + /** + * Get a comparator which will expect Integer keys + * and determine an ascending order + * + * @return An integer ascending order key comparator + */ + public abstract AbstractComparator getAscendingIntKeyComparator(); + + /** + * Test which stores random keys into the database + * using an @see getAscendingIntKeyComparator + * it then checks that these keys are read back in + * ascending order + * + * @param db_path A path where we can store database + * files temporarily + */ + public void testRoundtrip(final Path db_path) throws IOException { + + Options opt = null; + RocksDB db = null; + + try { + opt = new Options(); + opt.setCreateIfMissing(true); + opt.setComparator(getAscendingIntKeyComparator()); + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + db = RocksDB.open(opt, db_path.toString()); + final Random random = new Random(); + for(int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if(i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + } + } + db.close(); + + + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + db = RocksDB.open(opt, db_path.toString()); + final RocksIterator it = db.newIterator(); + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for(it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assert(thisKey > lastKey); + lastKey = thisKey; + count++; + } + db.close(); + + assert(count == ITERATIONS); + + } catch (final RocksDBException e) { + System.err.format("[ERROR]: %s%n", e); + e.printStackTrace(); + } finally { + if(db != null) { + db.close(); + } + + if(opt != null) { + opt.dispose(); + } + + removeDb(db_path); // cleanup after ourselves! 
+ } + } + + /** + * Compares integer keys + * so that they are in ascending order + * + * @param a 4-bytes representing an integer key + * @param b 4-bytes representing an integer key + * + * @return negative if a < b, 0 if a == b, positive otherwise + */ + protected final int compareIntKeys(final byte[] a, final byte[] b) { + + final int iA = byteToInt(a); + final int iB = byteToInt(b); + + // protect against int key calculation overflow + final double diff = (double)iA - iB; + final int result; + if(diff < Integer.MIN_VALUE) { + result = Integer.MIN_VALUE; + } else if(diff > Integer.MAX_VALUE) { + result = Integer.MAX_VALUE; + } else { + result = (int)diff; + } + + return result; + } + + /** + * Utility method for deleting database files + * + * @param db_path The path to the database to remove + * from the filesystem + */ + private static void removeDb(final Path db_path) throws IOException { + Files.walkFileTree(db_path, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) + throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(final Path file, IOException exc) + throws IOException { + // try to delete the file anyway, even if its attributes + // could not be read, since delete-only access is + // theoretically possible + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(final Path dir, IOException exc) + throws IOException { + if (exc == null) { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } else { + // directory iteration failed; propagate exception + throw exc; + } + } + }); + } +} diff --git a/java/org/rocksdb/test/ComparatorOptionsTest.java b/java/org/rocksdb/test/ComparatorOptionsTest.java new file mode 100644 index 000000000..e25209392 --- /dev/null +++ b/java/org/rocksdb/test/ComparatorOptionsTest.java @@ -0,0 +1,34 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.ComparatorOptions; +import org.rocksdb.RocksDB; + +import java.util.Random; + +public class ComparatorOptionsTest { + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) { + final ComparatorOptions copt = new ComparatorOptions(); + Random rand = new Random(); + + { // UseAdaptiveMutex test + copt.setUseAdaptiveMutex(true); + assert(copt.useAdaptiveMutex() == true); + + copt.setUseAdaptiveMutex(false); + assert(copt.useAdaptiveMutex() == false); + } + + copt.dispose(); + System.out.println("Passed ComparatorOptionsTest"); + } +} diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java new file mode 100644 index 000000000..34d7c78df --- /dev/null +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -0,0 +1,45 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.FileSystems; + +public class ComparatorTest { + private static final String db_path = "/tmp/comparator_db"; + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws IOException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new Comparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compareIntKeys(a.data(), b.data()); + } + }; + } + }; + + // test the round-tripability of keys written and read with the Comparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + + System.out.println("Passed ComparatorTest"); + } +} diff --git a/java/org/rocksdb/test/DirectComparatorTest.java b/java/org/rocksdb/test/DirectComparatorTest.java new file mode 100644 index 000000000..9df06eb73 --- /dev/null +++ b/java/org/rocksdb/test/DirectComparatorTest.java @@ -0,0 +1,48 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.io.IOException; +import java.nio.file.FileSystems; + +public class DirectComparatorTest { + private static final String db_path = "/tmp/direct_comparator_db"; + + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws IOException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new DirectComparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyDirectComparator"; + } + + @Override + public int compare(final DirectSlice a, final DirectSlice b) { + final byte ax[] = new byte[4], bx[] = new byte[4]; + a.data().get(ax); + b.data().get(bx); + return compareIntKeys(ax, bx); + } + }; + } + }; + + // test the round-tripability of keys written and read with the DirectComparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + + System.out.println("Passed DirectComparatorTest"); + } +} diff --git a/java/org/rocksdb/test/Types.java b/java/org/rocksdb/test/Types.java new file mode 100644 index 000000000..22fcd3537 --- /dev/null +++ b/java/org/rocksdb/test/Types.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb.test; + +/** + * Simple type conversion methods + * for use in tests + */ +public class Types { + + /** + * Convert first 4 bytes of a byte array to an int + * + * @param data The byte array + * + * @return An integer + */ + public static int byteToInt(final byte data[]) { + return (data[0] & 0xff) | + ((data[1] & 0xff) << 8) | + ((data[2] & 0xff) << 16) | + ((data[3] & 0xff) << 24); + } + + /** + * Convert an int to 4 bytes + * + * @param v The int + * + * @return A byte array containing 4 bytes + */ + public static byte[] intToByte(final int v) { + return new byte[] { + (byte)((v >>> 0) & 0xff), + (byte)((v >>> 8) & 0xff), + (byte)((v >>> 16) & 0xff), + (byte)((v >>> 24) & 0xff) + }; + } +} From a6fb7f312dcbe4357c2b9d097f3bef033f2c659f Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 6 Oct 2014 18:35:53 +0100 Subject: [PATCH 279/829] Fix code review comments raised in https://reviews.facebook.net/D22779 --- java/org/rocksdb/AbstractComparator.java | 16 ++++++- java/org/rocksdb/AbstractSlice.java | 33 +++++++++----- java/org/rocksdb/Comparator.java | 1 - java/org/rocksdb/ComparatorOptions.java | 12 ++++- java/org/rocksdb/DirectComparator.java | 1 - java/org/rocksdb/DirectSlice.java | 18 +++++++- java/org/rocksdb/Slice.java | 9 +++- .../rocksdb/test/AbstractComparatorTest.java | 12 ++--- java/rocksjni/comparator.cc | 21 --------- java/rocksjni/comparatorjnicallback.cc | 44 ++++++++++++------- java/rocksjni/comparatorjnicallback.h | 12 +++-- 11 files changed, 115 insertions(+), 64 deletions(-) diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java index e5c503025..8de50e271 100644 --- a/java/org/rocksdb/AbstractComparator.java +++ b/java/org/rocksdb/AbstractComparator.java @@ -14,8 +14,22 @@ package org.rocksdb; * @see org.rocksdb.Comparator * @see org.rocksdb.DirectComparator */ -public abstract class AbstractComparator extends RocksObject { +public abstract class AbstractComparator + extends RocksObject { + /** + * The name of the comparator. Used to check for comparator + * mismatches (i.e., a DB created with one comparator is + * accessed using a different comparator). + * + * A new name should be used whenever + * the comparator implementation changes in a way that will cause + * the relative ordering of any two keys to change. + * + * Names starting with "rocksdb." are reserved and should not be used. + * + * @return The name of this comparator implementation + */ public abstract String name(); /** diff --git a/java/org/rocksdb/AbstractSlice.java b/java/org/rocksdb/AbstractSlice.java index 963c72a1b..971bd7c1a 100644 --- a/java/org/rocksdb/AbstractSlice.java +++ b/java/org/rocksdb/AbstractSlice.java @@ -13,13 +13,23 @@ package org.rocksdb; * should extend either of the public abstract classes: * @see org.rocksdb.Slice * @see org.rocksdb.DirectSlice + * + * Regards the lifecycle of Java Slices in RocksDB: + * At present when you configure a Comparator from Java, it creates an + * instance of a C++ BaseComparatorJniCallback subclass and + * passes that to RocksDB as the comparator. That subclass of + * BaseComparatorJniCallback creates the Java + * {@see org.rocksdb.AbstractSlice} subclass Objects. When you dispose + * the Java {@see org.rocksdb.AbstractComparator} subclass, it disposes the + * C++ BaseComparatorJniCallback subclass, which in turn destroys the + * Java {@see org.rocksdb.AbstractSlice} subclass Objects. */ abstract class AbstractSlice extends RocksObject { /** - * Returns the data. 
+ * Returns the data of the slice. * - * @return The data. Note, the type of access is + * @return The slice data. Note, the type of access is * determined by the subclass * @see org.rocksdb.AbstractSlice#data0(long). */ @@ -65,7 +75,7 @@ abstract class AbstractSlice extends RocksObject { * Creates a string representation of the data * * @param hex When true, the representation - * will be encoded in hexidecimal. + * will be encoded in hexadecimal. * * @return The string representation of the data. */ @@ -96,13 +106,13 @@ abstract class AbstractSlice extends RocksObject { } /** - * If other is a slice, then - * we defer to compare to check equality, - * otherwise we return false. + * If other is a slice object, then + * we defer to {@link #compare(AbstractSlice) compare} + * to check equality, otherwise we return false. * * @param other Object to test for equality * - * @return true when this.compare(other) == 0, + * @return true when {@code this.compare(other) == 0}, * false otherwise. */ @Override @@ -115,13 +125,14 @@ abstract class AbstractSlice extends RocksObject { } /** - * Determines whether this starts with prefix + * Determines whether this slice starts with + * another slice * * @param prefix Another slice which may of may not - * be the prefix of this slice. + * be a prefix of this slice. * - * @return true when slice `prefix` is a prefix - * of this slice + * @return true when this slice starts with the + * {@code prefix} slice */ public boolean startsWith(final AbstractSlice prefix) { if (prefix != null) { diff --git a/java/org/rocksdb/Comparator.java b/java/org/rocksdb/Comparator.java index 7272a555a..c8e050bca 100644 --- a/java/org/rocksdb/Comparator.java +++ b/java/org/rocksdb/Comparator.java @@ -15,7 +15,6 @@ package org.rocksdb; * using @see org.rocksdb.DirectComparator */ public abstract class Comparator extends AbstractComparator { - public Comparator(final ComparatorOptions copt) { super(); createNewComparator0(copt.nativeHandle_); diff --git a/java/org/rocksdb/ComparatorOptions.java b/java/org/rocksdb/ComparatorOptions.java index a55091dfa..f0ba520a3 100644 --- a/java/org/rocksdb/ComparatorOptions.java +++ b/java/org/rocksdb/ComparatorOptions.java @@ -1,7 +1,14 @@ package org.rocksdb; +/** + * This class controls the behaviour + * of Java implementations of + * AbstractComparator + * + * Note that dispose() must be called before a ComparatorOptions + * instance becomes out-of-scope to release the allocated memory in C++. 
+ */ public class ComparatorOptions extends RocksObject { - public ComparatorOptions() { super(); newComparatorOptions(); @@ -44,6 +51,7 @@ public class ComparatorOptions extends RocksObject { private native void newComparatorOptions(); private native boolean useAdaptiveMutex(final long handle); - private native void setUseAdaptiveMutex(final long handle, final boolean useAdaptiveMutex); + private native void setUseAdaptiveMutex(final long handle, + final boolean useAdaptiveMutex); private native void disposeInternal(long handle); } diff --git a/java/org/rocksdb/DirectComparator.java b/java/org/rocksdb/DirectComparator.java index 86476c40e..47f4d7256 100644 --- a/java/org/rocksdb/DirectComparator.java +++ b/java/org/rocksdb/DirectComparator.java @@ -15,7 +15,6 @@ package org.rocksdb; * using @see org.rocksdb.Comparator */ public abstract class DirectComparator extends AbstractComparator { - public DirectComparator(final ComparatorOptions copt) { super(); createNewDirectComparator0(copt.nativeHandle_); diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java index 8169e3529..847bbd9c1 100644 --- a/java/org/rocksdb/DirectSlice.java +++ b/java/org/rocksdb/DirectSlice.java @@ -16,11 +16,18 @@ import java.nio.ByteBuffer; * values consider using @see org.rocksdb.Slice */ public class DirectSlice extends AbstractSlice { - /** * Called from JNI to construct a new Java DirectSlice * without an underlying C++ object set * at creation time. + * + * Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default DirectSlice constructor, and that it is marked as + * private. This is so that developers cannot construct their own default + * DirectSlice objects (at present). As developers cannot construct their own + * DirectSlice objects through this, they are not creating underlying C++ + * DirectSlice objects, and so there is nothing to free (dispose) from Java. */ private DirectSlice() { super(); @@ -31,6 +38,8 @@ public class DirectSlice extends AbstractSlice { * Constructs a slice * where the data is taken from * a String. + * + * @param str The string */ public DirectSlice(final String str) { super(); @@ -41,6 +50,9 @@ public class DirectSlice extends AbstractSlice { * Constructs a slice where the data is * read from the provided * ByteBuffer up to a certain length + * + * @param data The buffer containing the data + * @param length The length of the data to use for the slice */ public DirectSlice(final ByteBuffer data, final int length) { super(); @@ -51,6 +63,8 @@ public class DirectSlice extends AbstractSlice { * Constructs a slice where the data is * read from the provided * ByteBuffer + * + * @param data The bugger containing the data */ public DirectSlice(final ByteBuffer data) { super(); @@ -79,7 +93,7 @@ public class DirectSlice extends AbstractSlice { } /** - * Drops the specified n + * Drops the specified {@code n} * number of bytes from the start * of the backing slice * diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java index e6932cc76..4449cb7b8 100644 --- a/java/org/rocksdb/Slice.java +++ b/java/org/rocksdb/Slice.java @@ -14,11 +14,18 @@ package org.rocksdb; * values consider using @see org.rocksdb.DirectSlice */ public class Slice extends AbstractSlice { - /** * Called from JNI to construct a new Java Slice * without an underlying C++ object set * at creation time. 
+ * + * Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default Slice constructor, and that it is marked as + * private. This is so that developers cannot construct their own default + * Slice objects (at present). As developers cannot construct their own + * Slice objects through this, they are not creating underlying C++ Slice + * objects, and so there is nothing to free (dispose) from Java. */ private Slice() { super(); diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index e3cc6bb77..dfdb3cad9 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -52,9 +52,9 @@ public abstract class AbstractComparatorTest { db = RocksDB.open(opt, db_path.toString()); final Random random = new Random(); - for(int i = 0; i < ITERATIONS; i++) { + for (int i = 0; i < ITERATIONS; i++) { final byte key[] = intToByte(random.nextInt()); - if(i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) + if (i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) i--; // generate a different key } else { db.put(key, "value".getBytes()); @@ -71,7 +71,7 @@ public abstract class AbstractComparatorTest { it.seekToFirst(); int lastKey = Integer.MIN_VALUE; int count = 0; - for(it.seekToFirst(); it.isValid(); it.next()) { + for (it.seekToFirst(); it.isValid(); it.next()) { final int thisKey = byteToInt(it.key()); assert(thisKey > lastKey); lastKey = thisKey; @@ -85,11 +85,11 @@ public abstract class AbstractComparatorTest { System.err.format("[ERROR]: %s%n", e); e.printStackTrace(); } finally { - if(db != null) { + if (db != null) { db.close(); } - if(opt != null) { + if (opt != null) { opt.dispose(); } @@ -114,7 +114,7 @@ public abstract class AbstractComparatorTest { // protect against int key calculation overflow final double diff = (double)iA - iB; final int result; - if(diff < Integer.MIN_VALUE) { + if (diff < Integer.MIN_VALUE) { result = Integer.MIN_VALUE; } else if(diff > Integer.MAX_VALUE) { result = Integer.MAX_VALUE; diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc index 420897939..196376235 100644 --- a/java/rocksjni/comparator.cc +++ b/java/rocksjni/comparator.cc @@ -18,27 +18,6 @@ #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" -// - -void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( - JNIEnv* env, jobject jobj, jstring jpath, jboolean jshare_table_files, - jboolean jsync, jboolean jdestroy_old_data, jboolean jbackup_log_files, - jlong jbackup_rate_limit, jlong jrestore_rate_limit) { - jbackup_rate_limit = (jbackup_rate_limit <= 0) ? 0 : jbackup_rate_limit; - jrestore_rate_limit = (jrestore_rate_limit <= 0) ? 0 : jrestore_rate_limit; - - const char* cpath = env->GetStringUTFChars(jpath, 0); - - auto bopt = new rocksdb::BackupableDBOptions(cpath, nullptr, - jshare_table_files, nullptr, jsync, jdestroy_old_data, jbackup_log_files, - jbackup_rate_limit, jrestore_rate_limit); - - env->ReleaseStringUTFChars(jpath, cpath); - - rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); -} -// - // 0) && (level < cfd->NumberLevels())); + assert(level < cfd->NumberLevels()); // If the file is being compacted no need to delete. 
if (metadata->being_compacted) { @@ -4561,6 +4561,12 @@ Status DBImpl::DeleteFile(std::string name) { return Status::InvalidArgument("File not in last level"); } } + // if level == 0, it has to be the oldest file + if (level == 0 && + cfd->current()->files_[0].back()->fd.GetNumber() != number) { + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, db_directory_.get()); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index f1cd4b040..6a6f8e953 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -287,6 +287,75 @@ TEST(DeleteFileTest, DeleteLogFiles) { CloseDB(); } +TEST(DeleteFileTest, DeleteNonDefaultColumnFamily) { + CloseDB(); + DBOptions db_options; + db_options.create_if_missing = true; + db_options.create_missing_column_families = true; + std::vector column_families; + column_families.emplace_back(); + column_families.emplace_back("new_cf", ColumnFamilyOptions()); + + std::vector handles; + rocksdb::DB* db; + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + + Random rnd(5); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_EQ("new_cf", metadata[0].column_family_name); + ASSERT_EQ("new_cf", metadata[1].column_family_name); + auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno + ? 
metadata[0].name + : metadata[1].name; + ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db->DeleteFile(old_file)); + + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; + + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; +} + } //namespace rocksdb int main(int argc, char** argv) { From e11a5e776fa44611f34ca8676480fd6a9c21f4be Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 21 Oct 2014 17:28:31 -0700 Subject: [PATCH 281/829] Improve the comment of util/thread_local.h Summary: Improve the comment of util/thread_local.h Test Plan: n/a Reviewers: ljin Reviewed By: ljin Differential Revision: https://reviews.facebook.net/D25449 --- util/thread_local.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/util/thread_local.h b/util/thread_local.h index a037a9ceb..6884ed138 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -26,13 +26,14 @@ namespace rocksdb { // (2) a ThreadLocalPtr is destroyed typedef void (*UnrefHandler)(void* ptr); -// Thread local storage that only stores value of pointer type. The storage -// distinguish data coming from different thread and different ThreadLocalPtr -// instances. For example, if a regular thread_local variable A is declared -// in DBImpl, two DBImpl objects would share the same A. ThreadLocalPtr avoids -// the confliction. The total storage size equals to # of threads * # of -// ThreadLocalPtr instances. It is not efficient in terms of space, but it -// should serve most of our use cases well and keep code simple. +// ThreadLocalPtr stores only values of pointer type. Different from +// the usual thread-local-storage, ThreadLocalPtr has the ability to +// distinguish data coming from different threads and different +// ThreadLocalPtr instances. For example, if a regular thread_local +// variable A is declared in DBImpl, two DBImpl objects would share +// the same A. However, a ThreadLocalPtr that is defined under the +// scope of DBImpl can avoid such confliction. As a result, its memory +// usage would be O(# of threads * # of ThreadLocalPtr instances). class ThreadLocalPtr { public: explicit ThreadLocalPtr(UnrefHandler handler = nullptr); From 0fd985f42794c32ae14e25fc2780b38fde489c2e Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 22 Oct 2014 11:52:35 -0700 Subject: [PATCH 282/829] Avoid reloading filter on Get() if cache_index_and_filter_blocks == false Summary: This fixes the case that filter policy is missing in SST file, but we open the table with filter policy on and cache_index_and_filter_blocks = false. The current behavior is that we will try to load it every time on Get() but fail. 
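(Illustrative aside, not part of this patch: a minimal sketch of the options combination this change targets, i.e. a table opened with a filter policy configured while cache_index_and_filter_blocks stays false. The type and factory names are assumed from the public RocksDB headers of this period, matching what the new table_test case below uses.)

    #include "rocksdb/cache.h"
    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      // Keep the filter inside the table reader instead of the block cache.
      table_options.cache_index_and_filter_blocks = false;
      table_options.block_cache = rocksdb::NewLRUCache(8 << 20);
      // A bloom filter is configured at open time; the SST file itself may
      // have been written without one, which is the case this fix handles.
      table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));

      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }

With a setup like this, the filter (or its absence) is resolved once when the table is opened, and Get() no longer re-attempts the filter-block read on every call.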
Test Plan: unit test Reviewers: yhchiang, igor, rven, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25455 --- table/block_based_table_factory.cc | 5 ++ table/block_based_table_reader.cc | 106 +++++++++++++---------------- table/block_based_table_reader.h | 8 +-- table/table_test.cc | 53 ++++++++------- 4 files changed, 87 insertions(+), 85 deletions(-) diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 3155f3394..3013ade2a 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -71,6 +71,11 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Hash index is specified for block-based " "table, but prefix_extractor is not given"); } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } return Status::OK(); } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 4b2050e03..c973b755e 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -483,8 +483,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } // Will use block cache for index/filter blocks access? - if (table_options.block_cache && - table_options.cache_index_and_filter_blocks) { + if (table_options.cache_index_and_filter_blocks) { + assert(table_options.block_cache != nullptr); // Hack: Call NewIndexIterator() to implicitly add index to the block_cache unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); s = iter->status(); @@ -506,19 +506,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : { kFullFilterBlockPrefix, - kFilterBlockPrefix }) { - std::string key = filter_block_prefix; - key.append(rep->filter_policy->Name()); - - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep, - filter_block_prefix, nullptr)); - break; - } - } + rep->filter.reset(ReadFilter(rep, meta_iter.get(), nullptr)); } } else { delete index_reader; @@ -726,33 +714,43 @@ Status BlockBasedTable::PutDataBlockToCache( } FilterBlockReader* BlockBasedTable::ReadFilter( - const BlockHandle& filter_handle, BlockBasedTable::Rep* rep, - const std::string& filter_block_prefix, size_t* filter_size) { + Rep* rep, Iterator* meta_index_iter, size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. 
- ReadOptions opt; - BlockContents block; - if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->ioptions.env, false).ok()) { - return nullptr; - } + for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) { + std::string filter_block_key = prefix; + filter_block_key.append(rep->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(meta_index_iter, filter_block_key, &handle).ok()) { + BlockContents block; + if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(), + handle, &block, rep->ioptions.env, false).ok()) { + // Error reading the block + return nullptr; + } - if (filter_size) { - *filter_size = block.data.size(); - } + if (filter_size) { + *filter_size = block.data.size(); + } - assert(rep->filter_policy); - if (kFilterBlockPrefix == filter_block_prefix) { - return new BlockBasedFilterBlockReader( - rep->ioptions.prefix_extractor, rep->table_options, std::move(block)); - } else if (kFullFilterBlockPrefix == filter_block_prefix) { - auto filter_bits_reader = rep->filter_policy-> - GetFilterBitsReader(block.data); - - if (filter_bits_reader != nullptr) { - return new FullFilterBlockReader(rep->ioptions.prefix_extractor, - rep->table_options, std::move(block), - filter_bits_reader); + assert(rep->filter_policy); + if (kFilterBlockPrefix == prefix) { + return new BlockBasedFilterBlockReader( + rep->ioptions.prefix_extractor, rep->table_options, + std::move(block)); + } else if (kFullFilterBlockPrefix == prefix) { + auto filter_bits_reader = rep->filter_policy-> + GetFilterBitsReader(block.data); + if (filter_bits_reader != nullptr) { + return new FullFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, + std::move(block), + filter_bits_reader); + } + } else { + assert(false); + return nullptr; + } } } return nullptr; @@ -760,8 +758,11 @@ FilterBlockReader* BlockBasedTable::ReadFilter( BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( bool no_io) const { - // filter pre-populated - if (rep_->filter != nullptr) { + // If cache_index_and_filter_blocks is false, filter should be pre-populated. + // We will return rep_->filter anyway. rep_->filter can be nullptr if filter + // read fails at Open() time. We don't want to reload again since it will + // most probably fail again. 
+ if (!rep_->table_options.cache_index_and_filter_blocks) { return {rep_->filter.get(), nullptr /* cache handle */}; } @@ -775,8 +776,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.metaindex_handle(), - cache_key - ); + cache_key); Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = @@ -797,22 +797,12 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - // First try reading full_filter, then reading block_based_filter - for (auto filter_block_prefix : {kFullFilterBlockPrefix, - kFilterBlockPrefix}) { - std::string filter_block_key = filter_block_prefix; - filter_block_key.append(rep_->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, filter_block_prefix, &filter_size); - - if (filter == nullptr) break; // err happen in ReadFilter - assert(filter_size > 0); - cache_handle = block_cache->Insert( - key, filter, filter_size, &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - break; - } + filter = ReadFilter(rep_, iter.get(), &filter_size); + if (filter != nullptr) { + assert(filter_size > 0); + cache_handle = block_cache->Insert( + key, filter, filter_size, &DeleteCachedEntry); + RecordTick(statistics, BLOCK_CACHE_ADD); } } } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index b272c4d13..a000c6a9a 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -183,10 +183,10 @@ class BlockBasedTable : public TableReader { std::unique_ptr* iter); // Create the filter from the filter block. - static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, - const std::string& filter_block_prefix, - size_t* filter_size = nullptr); + static FilterBlockReader* ReadFilter( + Rep* rep, + Iterator* meta_index_iter, + size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); diff --git a/table/table_test.cc b/table/table_test.cc index e4657e8cd..c54fb2ff7 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1461,8 +1461,6 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { options.create_if_missing = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; - // Intentionally commented out: table_options.cache_index_and_filter_blocks = - // true; table_options.block_cache = NewLRUCache(1024); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -1521,7 +1519,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. - auto reader = dynamic_cast(c.GetTableReader()); + auto* reader = dynamic_cast(c.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); @@ -1567,28 +1565,11 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // release the iterator so that the block cache can reset correctly. 
iter.reset(); - // -- PART 2: Open without block cache - table_options.no_block_cache = true; - table_options.block_cache.reset(); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - options.statistics = CreateDBStatistics(); // reset the stats - const ImmutableCFOptions ioptions1(options); - c.Reopen(ioptions1); - table_options.no_block_cache = false; - - { - iter.reset(c.NewIterator()); - iter->SeekToFirst(); - ASSERT_EQ("key", iter->key().ToString()); - BlockCachePropertiesSnapshot props(options.statistics.get()); - // Nothing is affected at all - props.AssertEqual(0, 0, 0, 0); - } - - // -- PART 3: Open with very small block cache + // -- PART 2: Open with very small block cache // In this test, no block will ever get hit since the block cache is // too small to fit even one entry. table_options.block_cache = NewLRUCache(1); + options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(options); c.Reopen(ioptions2); @@ -1598,7 +1579,6 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { 0, 0, 0); } - { // Both index and data block get accessed. // It first cache index block then data block. But since the cache size @@ -1618,6 +1598,33 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { props.AssertEqual(2, 0, 0 + 1, // data block miss 0); } + iter.reset(); + + // -- PART 3: Open table with bloom filter enabled but not in SST file + table_options.block_cache = NewLRUCache(4096); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c3(BytewiseComparator()); + c3.Add("k01", "hello"); + ImmutableCFOptions ioptions3(options); + // Generate table without filter policy + c3.Finish(options, ioptions3, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + // Open table with filter policy + table_options.filter_policy.reset(NewBloomFilterPolicy(1)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + ImmutableCFOptions ioptions4(options); + ASSERT_OK(c3.Reopen(ioptions4)); + reader = dynamic_cast(c3.GetTableReader()); + ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), nullptr, + nullptr, nullptr); + ASSERT_OK(reader->Get(ReadOptions(), "k01", &get_context)); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertFilterBlockStat(0, 0); } TEST(BlockBasedTableTest, BlockCacheLeak) { From 839c376bd1ab866957081d9a6aa9edf9fa1cdb78 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 22 Oct 2014 13:53:35 -0700 Subject: [PATCH 283/829] fix table_test Summary: SaveValue expects an internal key but I previously added to table a user key Test Plan: ran the test --- table/table_test.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/table/table_test.cc b/table/table_test.cc index c54fb2ff7..5e1bbe4cf 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1606,7 +1606,9 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); TableConstructor c3(BytewiseComparator()); - c3.Add("k01", "hello"); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + c3.Add(internal_key.Encode().ToString(), "hello"); ImmutableCFOptions ioptions3(options); 
// Generate table without filter policy c3.Finish(options, ioptions3, table_options, @@ -1619,10 +1621,12 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { ASSERT_OK(c3.Reopen(ioptions4)); reader = dynamic_cast(c3.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + std::string value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, Slice(), nullptr, + GetContext::kNotFound, user_key, &value, nullptr, nullptr); - ASSERT_OK(reader->Get(ReadOptions(), "k01", &get_context)); + ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context)); + ASSERT_EQ(value, "hello"); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertFilterBlockStat(0, 0); } From d755e53b87a436d761f2f6a4654b34a7df506054 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 22 Oct 2014 18:24:14 -0700 Subject: [PATCH 284/829] Printing number of keys in DB Stats Summary: It is useful to print out number of keys in DB Stats Test Plan: ./db_bench --benchmarks fillrandom --num 1000000 -threads 16 -batch_size=16 and watch the outputs in LOG files Reviewers: MarkCallaghan, ljin, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24513 --- db/db_impl.cc | 4 +++- db/internal_stats.cc | 24 ++++++++++++++++++------ db/internal_stats.h | 6 ++++++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 90f50174a..6a2daad7d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -4129,7 +4129,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { const uint64_t batch_size = WriteBatchInternal::ByteSize(updates); // Record statistics RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count); - RecordTick(stats_, BYTES_WRITTEN, WriteBatchInternal::ByteSize(updates)); + RecordTick(stats_, BYTES_WRITTEN, batch_size); if (write_options.disableWAL) { flush_on_destroy_ = true; } @@ -4179,6 +4179,8 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { // internal stats default_cf_internal_stats_->AddDBStats( InternalStats::BYTES_WRITTEN, batch_size); + default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, + my_batch_count); if (!write_options.disableWAL) { default_cf_internal_stats_->AddDBStats( InternalStats::WAL_FILE_SYNCED, 1); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index aa3b3c850..cfeb9c00d 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -293,16 +293,25 @@ void InternalStats::DumpDBStats(std::string* value) { value->append(buf); // Cumulative uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN]; + uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN]; uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER]; uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF]; uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; // Data + // writes: total number of write requests. + // keys: total number of key updates issued by all the write requests + // batches: number of group commits issued to the DB. Each group can contain + // one or more writes. + // so writes/keys is the average number of put in multi-put or put + // writes/batches is the average group commit size. + // + // The format is the same for interval stats. 
snprintf(buf, sizeof(buf), - "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.2f GB user ingest\n", - write_other + write_self, write_self, + "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.2f GB user ingest\n", + write_other + write_self, num_keys_written, write_self, (write_other + write_self) / static_cast(write_self + 1), user_bytes_written / kGB); value->append(buf); @@ -318,11 +327,13 @@ void InternalStats::DumpDBStats(std::string* value) { // Interval uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other; uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self; + uint64_t interval_num_keys_written = + num_keys_written - db_stats_snapshot_.num_keys_written; snprintf(buf, sizeof(buf), - "Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.1f MB user ingest\n", + "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.1f MB user ingest\n", interval_write_other + interval_write_self, - interval_write_self, + interval_num_keys_written, interval_write_self, static_cast(interval_write_other + interval_write_self) / (interval_write_self + 1), (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB); @@ -347,6 +358,7 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.ingest_bytes = user_bytes_written; db_stats_snapshot_.write_other = write_other; db_stats_snapshot_.write_self = write_self; + db_stats_snapshot_.num_keys_written = num_keys_written; db_stats_snapshot_.wal_bytes = wal_bytes; db_stats_snapshot_.wal_synced = wal_synced; db_stats_snapshot_.write_with_wal = write_with_wal; diff --git a/db/internal_stats.h b/db/internal_stats.h index 4d12a2512..84fd10289 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -67,6 +67,7 @@ class InternalStats { WAL_FILE_BYTES, WAL_FILE_SYNCED, BYTES_WRITTEN, + NUMBER_KEYS_WRITTEN, WRITE_DONE_BY_OTHER, WRITE_DONE_BY_SELF, WRITE_WITH_WAL, @@ -264,6 +265,11 @@ class InternalStats { // another thread. uint64_t write_other; uint64_t write_self; + // Total number of keys written. write_self and write_other measure number + // of write requests written, Each of the write request can contain updates + // to multiple keys. num_keys_written is total number of keys updated by all + // those writes. + uint64_t num_keys_written; double seconds_up; DBStatsSnapshot() From 2a8e5203d8f955e94b074f52a50410dbdbee0a0d Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 22 Oct 2014 18:43:27 -0700 Subject: [PATCH 285/829] db_bench: --batch_size used for write benchmarks too Summary: Now --bench_size is only used in multireadrandom tests, although the codes allow it to run in all write tests. I don't see a reason why we can't enable it. Test Plan: Run ./db_bench -benchmarks multirandomwrite --threads=5 -batch_size=16 and see the stats printed out in LOG to make sure batching really happened. Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25509 --- db/db_bench.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index d018ce70f..f0fe5e02e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1232,7 +1232,7 @@ class Benchmark { writes_ = (FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes); value_size_ = FLAGS_value_size; key_size_ = FLAGS_key_size; - entries_per_batch_ = 1; + entries_per_batch_ = FLAGS_batch_size; write_options_ = WriteOptions(); if (FLAGS_sync) { write_options_.sync = true; @@ -1287,7 +1287,6 @@ class Benchmark { } else if (name == Slice("readrandomfast")) { method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { - entries_per_batch_ = FLAGS_batch_size; fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch_); method = &Benchmark::MultiReadRandom; From c584d2b538c29242c14a56ebcf6f5e396f120fb1 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Thu, 23 Oct 2014 15:39:48 +0100 Subject: [PATCH 286/829] Fix for building RocksDB Java on Mac OS X Yosemite --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2e101bede..d9a8feffa 100644 --- a/Makefile +++ b/Makefile @@ -528,7 +528,11 @@ ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PAT ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin +else + JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +endif endif libz.a: From 3b5fe3a1f36cdd438a2e818e94a4174951c68ba1 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 23 Oct 2014 10:41:58 -0700 Subject: [PATCH 287/829] Correct the log message in VersionEdit Summary: When VersionEdit fails in kNewFile3, previously it logs "new-file2 entry". However, it should be "new-file3 entry." Test Plan: make --- db/version_edit.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index 4e2cf8f5b..271016aaf 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -293,7 +293,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { - msg = "new-file2 entry"; + msg = "new-file3 entry"; } } break; From 90f156402c4b74f05fb86834401ab96cc018cf03 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 23 Oct 2014 11:18:11 -0700 Subject: [PATCH 288/829] Fix CompactBetweenSnapshots --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index d13929fc6..968ea7521 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5272,7 +5272,7 @@ TEST(DBTest, CompactBetweenSnapshots) { do { Options options = CurrentOptions(); options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, &options); Random rnd(301); FillLevels("a", "z", 1); From 9383922cc3bdf8f85caf466cb61cf66090762e37 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Thu, 23 Oct 2014 19:35:58 +0100 Subject: [PATCH 289/829] Added java tests to travis build --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8f1bcb0ae..b52e5acf7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. 
-script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 +script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 && OPT=-DTRAVIS make rocksdbjava jtest notifications: email: false From 4b1786e9599c5770376e8809bfbce7d74426640e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 23 Oct 2014 12:03:19 -0700 Subject: [PATCH 290/829] Fix SIGSEGV when declaring Arena after ScopedArenaIterator --- db/db_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 968ea7521..06d0241ee 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -807,8 +807,8 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - ScopedArenaIterator iter; Arena arena; + ScopedArenaIterator iter; if (cf == 0) { iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { From 9aa9668a83cfebe865212a5468cc747fc800872c Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 16 Oct 2014 20:39:26 +0200 Subject: [PATCH 291/829] [RocksJava] Memtables update to 3.6 - Adjusted HashLinkedList to 3.6.0 - Adjusted SkipList to 3.6.0 - Introduced a memtable test --- java/Makefile | 1 + .../rocksdb/HashLinkedListMemTableConfig.java | 122 +++++++++++++++++- .../rocksdb/HashSkipListMemTableConfig.java | 3 + java/org/rocksdb/SkipListMemTableConfig.java | 38 +++++- java/org/rocksdb/VectorMemTableConfig.java | 4 + java/org/rocksdb/test/MemTableTest.java | 107 +++++++++++++++ java/rocksjni/memtablejni.cc | 34 +++-- 7 files changed, 296 insertions(+), 13 deletions(-) create mode 100644 java/org/rocksdb/test/MemTableTest.java diff --git a/java/Makefile b/java/Makefile index 697df5175..504060bc3 100644 --- a/java/Makefile +++ b/java/Makefile @@ -42,6 +42,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java index 381a16f49..78a4e8661 100644 --- a/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ b/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -15,9 +15,21 @@ package org.rocksdb; */ public class HashLinkedListMemTableConfig extends MemTableConfig { public static final long DEFAULT_BUCKET_COUNT = 50000; + public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0; + public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096; + public static final boolean + DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH = true; + public static final int DEFAUL_THRESHOLD_USE_SKIPLIST = 256; + /** + * HashLinkedListMemTableConfig constructor + */ public HashLinkedListMemTableConfig() { bucketCount_ = DEFAULT_BUCKET_COUNT; + hugePageTlbSize_ = DEFAULT_HUGE_PAGE_TLB_SIZE; + bucketEntriesLoggingThreshold_ = DEFAULT_BUCKET_ENTRIES_LOG_THRES; + ifLogBucketDistWhenFlush_ = DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH; + thresholdUseSkiplist_ = DEFAUL_THRESHOLD_USE_SKIPLIST; } /** @@ -42,13 +54,119 @@ public class HashLinkedListMemTableConfig extends MemTableConfig { return 
bucketCount_; } + /** + * Set the size of huge tlb or allocate the hashtable bytes from + * malloc if {@code size <= 0}. + * + * The user needs to reserve huge pages for it to be allocated, + * like: {@code sysctl -w vm.nr_hugepages=20} + * + * See linux documentation/vm/hugetlbpage.txt
        + * + * @param size if set to {@code <= 0} hashtable bytes from malloc + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig setHugePageTlbSize(long size) { + hugePageTlbSize_ = size; + return this; + } + + /** + * Returns the size value of hugePageTlbSize. + * + * @return the hugePageTlbSize. + */ + public long hugePageTlbSize() { + return hugePageTlbSize_; + } + + /** + * If number of entries in one bucket exceeds that setting, log + * about it. + * + * @param threshold - number of entries in a single bucket before + * logging starts. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setBucketEntriesLoggingThreshold(int threshold) { + bucketEntriesLoggingThreshold_ = threshold; + return this; + } + + /** + * Returns the maximum number of entries in one bucket before + * logging starts. + * + * @return maximum number of entries in one bucket before logging + * starts. + */ + public int bucketEntriesLoggingThreshold() { + return bucketEntriesLoggingThreshold_; + } + + /** + * If true the distrubition of number of entries will be logged. + * + * @param logDistribution - boolean parameter indicating if number + * of entry distribution shall be logged. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setIfLogBucketDistWhenFlush(boolean logDistribution) { + ifLogBucketDistWhenFlush_ = logDistribution; + return this; + } + + /** + * Returns information about logging the distribution of + * number of entries on flush. + * + * @return if distrubtion of number of entries shall be logged. + */ + public boolean ifLogBucketDistWhenFlush() { + return ifLogBucketDistWhenFlush_; + } + + /** + * Set maximum number of entries in one bucket. Exceeding this val + * leads to a switch from LinkedList to SkipList. + * + * @param threshold maximum number of entries before SkipList is + * used. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setThresholdUseSkiplist(int threshold) { + thresholdUseSkiplist_ = threshold; + return this; + } + + /** + * Returns entries per bucket threshold before LinkedList is + * replaced by SkipList usage for that bucket. + * + * @return entries per bucket threshold before SkipList is used. 
+ */ + public int thresholdUseSkiplist() { + return thresholdUseSkiplist_; + } + @Override protected long newMemTableFactoryHandle() throws RocksDBException { - return newMemTableFactoryHandle(bucketCount_); + return newMemTableFactoryHandle(bucketCount_, hugePageTlbSize_, + bucketEntriesLoggingThreshold_, ifLogBucketDistWhenFlush_, + thresholdUseSkiplist_); } - private native long newMemTableFactoryHandle(long bucketCount) + private native long newMemTableFactoryHandle(long bucketCount, + long hugePageTlbSize, int bucketEntriesLoggingThreshold, + boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist) throws RocksDBException; private long bucketCount_; + private long hugePageTlbSize_; + private int bucketEntriesLoggingThreshold_; + private boolean ifLogBucketDistWhenFlush_; + private int thresholdUseSkiplist_; } diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java index 100f16c82..ad2120f18 100644 --- a/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -18,6 +18,9 @@ public class HashSkipListMemTableConfig extends MemTableConfig { public static final int DEFAULT_BRANCHING_FACTOR = 4; public static final int DEFAULT_HEIGHT = 4; + /** + * HashSkipListMemTableConfig constructor + */ public HashSkipListMemTableConfig() { bucketCount_ = DEFAULT_BUCKET_COUNT; branchingFactor_ = DEFAULT_BRANCHING_FACTOR; diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/org/rocksdb/SkipListMemTableConfig.java index 7f9f5cb5f..d26fd9d32 100644 --- a/java/org/rocksdb/SkipListMemTableConfig.java +++ b/java/org/rocksdb/SkipListMemTableConfig.java @@ -4,12 +4,46 @@ package org.rocksdb; * The config for skip-list memtable representation. */ public class SkipListMemTableConfig extends MemTableConfig { + + public static final long DEFAULT_LOOKAHEAD = 0; + + /** + * SkipListMemTableConfig constructor + */ public SkipListMemTableConfig() { + lookahead_ = DEFAULT_LOOKAHEAD; + } + + /** + * Sets lookahead for SkipList + * + * @param lookahead If non-zero, each iterator's seek operation + * will start the search from the previously visited record + * (doing at most 'lookahead' steps). This is an + * optimization for the access pattern including many + * seeks with consecutive keys. + * @return the current instance of SkipListMemTableConfig + */ + public SkipListMemTableConfig setLookahead(long lookahead) { + lookahead_ = lookahead; + return this; } + /** + * Returns the currently set lookahead value. 
+ * + * @return lookahead value + */ + public long lookahead() { + return lookahead_; + } + + @Override protected long newMemTableFactoryHandle() { - return newMemTableFactoryHandle0(); + return newMemTableFactoryHandle0(lookahead_); } - private native long newMemTableFactoryHandle0(); + private native long newMemTableFactoryHandle0(long lookahead); + + private long lookahead_; } diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/org/rocksdb/VectorMemTableConfig.java index b7a413f19..ba1be3e77 100644 --- a/java/org/rocksdb/VectorMemTableConfig.java +++ b/java/org/rocksdb/VectorMemTableConfig.java @@ -5,6 +5,10 @@ package org.rocksdb; */ public class VectorMemTableConfig extends MemTableConfig { public static final int DEFAULT_RESERVED_SIZE = 0; + + /** + * VectorMemTableConfig constructor + */ public VectorMemTableConfig() { reservedSize_ = DEFAULT_RESERVED_SIZE; } diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/org/rocksdb/test/MemTableTest.java new file mode 100644 index 000000000..0d1e4d54a --- /dev/null +++ b/java/org/rocksdb/test/MemTableTest.java @@ -0,0 +1,107 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +public class MemTableTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + Options options = new Options(); + // Test HashSkipListMemTableConfig + HashSkipListMemTableConfig memTableConfig = + new HashSkipListMemTableConfig(); + assert(memTableConfig.bucketCount() == 1000000); + memTableConfig.setBucketCount(2000000); + assert(memTableConfig.bucketCount() == 2000000); + assert(memTableConfig.height() == 4); + memTableConfig.setHeight(5); + assert(memTableConfig.height() == 5); + assert(memTableConfig.branchingFactor() == 4); + memTableConfig.setBranchingFactor(6); + assert(memTableConfig.branchingFactor() == 6); + try { + options.setMemTableConfig(memTableConfig); + } catch (RocksDBException e) { + assert(false); + } + memTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // Test SkipList + options = new Options(); + SkipListMemTableConfig skipMemTableConfig = + new SkipListMemTableConfig(); + assert(skipMemTableConfig.lookahead() == 0); + skipMemTableConfig.setLookahead(20); + assert(skipMemTableConfig.lookahead() == 20); + try { + options.setMemTableConfig(skipMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + skipMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // Test HashLinkedListMemTableConfig + options = new Options(); + HashLinkedListMemTableConfig hashLinkedListMemTableConfig = + new HashLinkedListMemTableConfig(); + assert(hashLinkedListMemTableConfig.bucketCount() == 50000); + hashLinkedListMemTableConfig.setBucketCount(100000); + assert(hashLinkedListMemTableConfig.bucketCount() == 100000); + assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 0); + hashLinkedListMemTableConfig.setHugePageTlbSize(1); + assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 1); + assert(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold() == 4096); + hashLinkedListMemTableConfig. + setBucketEntriesLoggingThreshold(200); + assert(hashLinkedListMemTableConfig. 
+ bucketEntriesLoggingThreshold() == 200); + assert(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush() == true); + hashLinkedListMemTableConfig. + setIfLogBucketDistWhenFlush(false); + assert(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush() == false); + assert(hashLinkedListMemTableConfig. + thresholdUseSkiplist() == 256); + hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); + assert(hashLinkedListMemTableConfig. + thresholdUseSkiplist() == 29); + try { + options.setMemTableConfig(hashLinkedListMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + hashLinkedListMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + // test VectorMemTableConfig + options = new Options(); + VectorMemTableConfig vectorMemTableConfig = + new VectorMemTableConfig(); + assert(vectorMemTableConfig.reservedSize() == 0); + vectorMemTableConfig.setReservedSize(123); + assert(vectorMemTableConfig.reservedSize() == 123); + try { + options.setMemTableConfig(vectorMemTableConfig); + } catch (RocksDBException e) { + assert(false); + } + vectorMemTableConfig = null; + options.dispose(); + System.gc(); + System.runFinalization(); + System.out.println("Mem-table test passed"); + } +} diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index 4be03d491..fe83885c2 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -34,16 +34,26 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( /* * Class: org_rocksdb_HashLinkedListMemTableConfig * Method: newMemTableFactoryHandle - * Signature: (J)J + * Signature: (JJIZI)J */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject jobj, jlong jbucket_count) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); - if (s.ok()) { + JNIEnv* env, jobject jobj, jlong jbucket_count, jlong jhuge_page_tlb_size, + jint jbucket_entries_logging_threshold, + jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) { + rocksdb::Status statusBucketCount = + rocksdb::check_if_jlong_fits_size_t(jbucket_count); + rocksdb::Status statusHugePageTlb = + rocksdb::check_if_jlong_fits_size_t(jhuge_page_tlb_size); + if (statusBucketCount.ok() && statusHugePageTlb.ok()) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - static_cast(jbucket_count))); + static_cast(jbucket_count), + static_cast(jhuge_page_tlb_size), + static_cast(jbucket_entries_logging_threshold), + static_cast(jif_log_bucket_dist_when_flash), + static_cast(jthreshold_use_skiplist))); } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + rocksdb::RocksDBExceptionJni::ThrowNew(env, + !statusBucketCount.ok()?statusBucketCount:statusHugePageTlb); return 0; } @@ -66,9 +76,15 @@ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( /* * Class: org_rocksdb_SkipListMemTableConfig * Method: newMemTableFactoryHandle0 - * Signature: ()J + * Signature: (J)J */ jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0( - JNIEnv* env, jobject jobj) { - return reinterpret_cast(new rocksdb::SkipListFactory()); + JNIEnv* env, jobject jobj, jlong jlookahead) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jlookahead); + if (s.ok()) { + return reinterpret_cast(new rocksdb::SkipListFactory( + static_cast(jlookahead))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } From 1eb545721df80a186887eb0b83c7b006401652c5 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 21 Oct 
2014 21:17:45 +0200 Subject: [PATCH 292/829] Fix incorrectly merged Java - Makefile --- java/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/java/Makefile b/java/Makefile index 697df5175..fd15dd34e 100644 --- a/java/Makefile +++ b/java/Makefile @@ -49,6 +49,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DirectComparatorTest + @rm -rf /tmp/rocksdbjni_* db_bench: java javac org/rocksdb/benchmark/*.java From bd4fbaee3753a7405b3f3d4248b6b43a93ea760c Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 21 Oct 2014 21:18:33 +0200 Subject: [PATCH 293/829] Fixed cross platform build after introducing Java-7 dependencies --- java/crossbuild/build-linux-centos.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index 55f179b62..5730b1533 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # install all required packages for rocksdb that are available through yum ARCH=$(uname -i) -sudo yum -y install java-1.6.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel +sudo yum -y install java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel # install gcc/g++ 4.7 via CERN (http://linux.web.cern.ch/linux/devtoolset/) sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo @@ -12,7 +12,7 @@ tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-1.1 ./configure export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib # set java home so we can build rocksdb jars -export JAVA_HOME=/usr/lib/jvm/java-1.6.0 +export JAVA_HOME=/usr/lib/jvm/java-1.7.0 # build rocksdb cd /rocksdb From 574028679b9f19d504a3695989c711fa5d73fe80 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 23 Oct 2014 15:34:21 -0700 Subject: [PATCH 294/829] dynamic max_sequential_skip_in_iterations Summary: This is not a critical options. 
Making it dynamic so that we can remove more reference to cfd->options() Test Plan: unit test Reviewers: yhchiang, sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24957 --- db/column_family.h | 2 +- db/db_impl.cc | 45 ++++++++++++++-------------------- db/db_impl_readonly.cc | 13 +++++----- db/db_test.cc | 50 ++++++++++++++++++++++++++++++++++++++ db/forward_iterator.cc | 51 +++++++++++++++++++++++---------------- db/forward_iterator.h | 6 ++--- util/mutable_cf_options.h | 10 ++++++-- util/options_helper.cc | 15 ++++++++++-- 8 files changed, 130 insertions(+), 62 deletions(-) diff --git a/db/column_family.h b/db/column_family.h index 9c415c2a8..96b08c52e 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -166,7 +166,7 @@ class ColumnFamilyData { bool IsDropped() const { return dropped_; } // thread-safe - int NumberLevels() const { return options_.num_levels; } + int NumberLevels() const { return ioptions_.num_levels; } void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 6a2daad7d..0e47774a7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3828,16 +3828,16 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // not supported in lite version return nullptr; #else - auto iter = new ForwardIterator(this, read_options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations, - read_options.iterate_upper_bound); + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); - SuperVersion* sv = nullptr; - sv = cfd->GetReferencedSuperVersion(&mutex_); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = read_options.snapshot != nullptr @@ -3889,7 +3889,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // that they are likely to be in the same cache line and/or page. 
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), - snapshot, cfd->options()->max_sequential_skip_in_iterations, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_options.iterate_upper_bound); Iterator* internal_iter = @@ -3908,19 +3908,6 @@ Status DBImpl::NewIterators( std::vector* iterators) { iterators->clear(); iterators->reserve(column_families.size()); - SequenceNumber latest_snapshot = 0; - std::vector super_versions; - super_versions.reserve(column_families.size()); - - if (!read_options.tailing) { - mutex_.Lock(); - latest_snapshot = versions_->LastSequence(); - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - super_versions.push_back(cfd->GetSuperVersion()->Ref()); - } - mutex_.Unlock(); - } if (read_options.tailing) { #ifdef ROCKSDB_LITE @@ -3929,17 +3916,21 @@ Status DBImpl::NewIterators( #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, read_options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); iterators->push_back( NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - cfd->options()->max_sequential_skip_in_iterations)); + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations)); } #endif } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { - auto cfh = reinterpret_cast(column_families[i]); - auto cfd = cfh->cfd(); + auto* cfd = reinterpret_cast( + column_families[i])->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = read_options.snapshot != nullptr @@ -3949,9 +3940,9 @@ Status DBImpl::NewIterators( ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - cfd->options()->max_sequential_skip_in_iterations); + sv->mutable_cf_options.max_sequential_skip_in_iterations); Iterator* internal_iter = NewInternalIterator( - read_options, cfd, super_versions[i], db_iter->GetArena()); + read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 31ebdbedd..c98693d38 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -53,7 +53,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ? reinterpret_cast( read_options.snapshot)->number_ : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); + super_version->mutable_cf_options.max_sequential_skip_in_iterations); auto internal_iter = NewInternalIterator( read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -72,16 +72,17 @@ Status DBImplReadOnly::NewIterators( SequenceNumber latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - auto db_iter = NewArenaWrappedDbIterator( + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr ? 
reinterpret_cast( read_options.snapshot)->number_ : latest_snapshot), - cfd->options()->max_sequential_skip_in_iterations); - auto internal_iter = NewInternalIterator( - read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + sv->mutable_cf_options.max_sequential_skip_in_iterations); + auto* internal_iter = NewInternalIterator( + read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } diff --git a/db/db_test.cc b/db/db_test.cc index 06d0241ee..1ee8b58af 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8889,6 +8889,56 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); } +TEST(DBTest, DynamicMiscOptions) { + // Test max_sequential_skip_in_iterations + Options options; + options.env = env_; + options.create_if_missing = true; + options.max_sequential_skip_in_iterations = 16; + options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(&options); + + auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { + int key0 = key_start; + int key1 = key_start + 1; + int key2 = key_start + 2; + Random rnd(301); + ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + } + ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(key1)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key1)), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key2)), 0); + ASSERT_EQ(num_reseek, + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + }; + // No reseek + assert_reseek_count(100, 0); + + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "4"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // Trigger reseek + assert_reseek_count(200, 1); + + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "16"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // No reseek + assert_reseek_count(300, 1); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 684045e05..b2e4bd067 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -114,25 +114,29 @@ class LevelIterator : public Iterator { }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd) + ColumnFamilyData* cfd, SuperVersion* current_sv) : db_(db), read_options_(read_options), cfd_(cfd), prefix_extractor_(cfd->options()->prefix_extractor.get()), user_comparator_(cfd->user_comparator()), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), - sv_(nullptr), + sv_(current_sv), mutable_iter_(nullptr), current_(nullptr), valid_(false), is_prev_set_(false), - is_prev_inclusive_(false) {} + is_prev_inclusive_(false) { + if (sv_) { + RebuildIterators(false); + } +} ForwardIterator::~ForwardIterator() { - Cleanup(); + Cleanup(true); } -void ForwardIterator::Cleanup() { +void ForwardIterator::Cleanup(bool release_sv) { if (mutable_iter_ != nullptr) { mutable_iter_->~Iterator(); } @@ -149,15 +153,17 @@ void ForwardIterator::Cleanup() { } level_iters_.clear(); - if (sv_ != nullptr && sv_->Unref()) { - DBImpl::DeletionState deletion_state; - db_->mutex_.Lock(); - 
sv_->Cleanup(); - db_->FindObsoleteFiles(deletion_state, false, true); - db_->mutex_.Unlock(); - delete sv_; - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (release_sv) { + if (sv_ != nullptr && sv_->Unref()) { + DBImpl::DeletionState deletion_state; + db_->mutex_.Lock(); + sv_->Cleanup(); + db_->FindObsoleteFiles(deletion_state, false, true); + db_->mutex_.Unlock(); + delete sv_; + if (deletion_state.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(deletion_state); + } } } } @@ -169,7 +175,7 @@ bool ForwardIterator::Valid() const { void ForwardIterator::SeekToFirst() { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); + RebuildIterators(true); } else if (status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -179,7 +185,7 @@ void ForwardIterator::SeekToFirst() { void ForwardIterator::Seek(const Slice& internal_key) { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); + RebuildIterators(true); } else if (status_.IsIncomplete()) { ResetIncompleteIterators(); } @@ -188,6 +194,7 @@ void ForwardIterator::Seek(const Slice& internal_key) { void ForwardIterator::SeekInternal(const Slice& internal_key, bool seek_to_first) { + assert(mutable_iter_); // mutable seek_to_first ? mutable_iter_->SeekToFirst() : mutable_iter_->Seek(internal_key); @@ -338,7 +345,7 @@ void ForwardIterator::Next() { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); - RebuildIterators(); + RebuildIterators(true); SeekInternal(old_key, false); if (!valid_ || key().compare(old_key) != 0) { return; @@ -412,11 +419,13 @@ Status ForwardIterator::status() const { return Status::OK(); } -void ForwardIterator::RebuildIterators() { +void ForwardIterator::RebuildIterators(bool refresh_sv) { // Clean up - Cleanup(); - // New - sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + Cleanup(refresh_sv); + if (refresh_sv) { + // New + sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + } mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); const auto& l0_files = sv_->current->files_[0]; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 4d3761ee1..537dc1352 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -51,7 +51,7 @@ typedef std::priority_queue max_bytes_for_level_multiplier_additional; + // Misc options + uint64_t max_sequential_skip_in_iterations; + // Derived options // Per-level target file size. 
std::vector max_file_size; diff --git a/util/options_helper.cc b/util/options_helper.cc index 2a56a1ccf..372a7171f 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -150,6 +150,17 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, return true; } +template +bool ParseMiscOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "max_sequential_skip_in_iterations") { + new_options->max_sequential_skip_in_iterations = ParseUint64(value); + } else { + return false; + } + return true; +} + bool GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, @@ -160,6 +171,7 @@ bool GetMutableOptionsFromStrings( for (const auto& o : options_map) { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { } else { return false; } @@ -228,6 +240,7 @@ bool GetColumnFamilyOptionsFromMap( try { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { } else if (o.first == "min_write_buffer_number_to_merge") { new_options->min_write_buffer_number_to_merge = ParseInt(o.second); } else if (o.first == "compression") { @@ -286,8 +299,6 @@ bool GetColumnFamilyOptionsFromMap( } else if (o.first == "compaction_options_fifo") { new_options->compaction_options_fifo.max_table_files_size = ParseUint64(o.second); - } else if (o.first == "max_sequential_skip_in_iterations") { - new_options->max_sequential_skip_in_iterations = ParseUint64(o.second); } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else if (o.first == "inplace_update_num_locks") { From 1fee591e74222784e6cb91783ab0d69f2804d413 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 23 Oct 2014 15:35:10 -0700 Subject: [PATCH 295/829] comments for DynamicCompactionOptions test Summary: as title Test Plan: n/a Reviewers: yhchiang, sdong, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24963 --- db/db_test.cc | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 1ee8b58af..a68e5686c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8702,8 +8702,9 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForFlushMemTable(); }; - // Write 3 files that have the same key range, trigger compaction and - // result in one L1 file + // Write 3 files that have the same key range. + // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file gen_l0_kb(0, 64, 1); ASSERT_EQ(NumTableFilesAtLevel(0), 1); gen_l0_kb(0, 64, 1); @@ -8718,6 +8719,10 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_GE(metadata[0].size, k64KB - k4KB); // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. 
ASSERT_TRUE(dbfull()->SetOptions({ {"level0_file_num_compaction_trigger", "2"}, {"target_file_size_base", std::to_string(k32KB) } @@ -8733,8 +8738,13 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_EQ(2U, metadata.size()); ASSERT_LE(metadata[0].size, k32KB + k4KB); ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k256KB) } })); @@ -8751,7 +8761,9 @@ TEST(DBTest, DynamicCompactionOptions) { SizeAtLevel(2) < 4 * k256KB * 1.2); // Test max_bytes_for_level_multiplier and - // max_bytes_for_level_base (reduce) + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_multiplier", "2"}, {"max_bytes_for_level_base", std::to_string(k128KB) } @@ -8767,7 +8779,10 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(SizeAtLevel(2) < 2 * k128KB * 1.2); ASSERT_TRUE(SizeAtLevel(3) < 4 * k128KB * 1.2); - // Clean up memtable and L0 + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. We should see put timeout after 8 memtable flushes + // since level0_stop_writes_trigger = 8 dbfull()->CompactRange(nullptr, nullptr); // Block compaction SleepingBackgroundTask sleeping_task_low1; @@ -8788,7 +8803,9 @@ TEST(DBTest, DynamicCompactionOptions) { sleeping_task_low1.WakeUp(); sleeping_task_low1.WaitUntilDone(); - // Test: stop trigger (reduce) + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see timeout after 6 memtable flushes. ASSERT_TRUE(dbfull()->SetOptions({ {"level0_stop_writes_trigger", "6"} })); @@ -8810,6 +8827,10 @@ TEST(DBTest, DynamicCompactionOptions) { sleeping_task_low2.WaitUntilDone(); // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. ASSERT_TRUE(dbfull()->SetOptions({ {"disable_auto_compactions", "true"} })); @@ -8824,6 +8845,8 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForCompact(); ASSERT_EQ(NumTableFilesAtLevel(0), 4); + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. ASSERT_TRUE(dbfull()->SetOptions({ {"disable_auto_compactions", "false"} })); @@ -8838,8 +8861,10 @@ TEST(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForCompact(); ASSERT_LT(NumTableFilesAtLevel(0), 4); - // Test for hard_rate_limit, change max_bytes_for_level_base to make level - // size big + // Test for hard_rate_limit. + // First change max_bytes_for_level_base to a big value and populate + // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction + // at the same time, we should see some level with score greater than 2. 
ASSERT_TRUE(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k256KB) } })); @@ -8869,7 +8894,9 @@ TEST(DBTest, DynamicCompactionOptions) { SizeAtLevel(2) / k64KB > 4 || SizeAtLevel(3) / k64KB > 8); - // Enfoce hard rate limit, L0 score is not regulated by this limit + // Enfoce hard rate limit. Now set hard_rate_limit to 2, + // we should start to see put delay (1000 us) and timeout as a result + // (L0 score is not regulated by this limit). ASSERT_TRUE(dbfull()->SetOptions({ {"hard_rate_limit", "2"} })); @@ -8881,7 +8908,7 @@ TEST(DBTest, DynamicCompactionOptions) { wo.timeout_hint_us = 500; ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).IsTimedOut()); - // Bump up limit + // Lift the limit and no timeout ASSERT_TRUE(dbfull()->SetOptions({ {"hard_rate_limit", "100"} })); From 122f98e0b9814ee54edb74a7c39eebdf1ead41ec Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 23 Oct 2014 15:37:14 -0700 Subject: [PATCH 296/829] dynamic max_mem_compact_level Summary: as title Test Plan: unit test Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25347 --- db/db_impl.cc | 8 ++++++-- db/db_test.cc | 39 ++++++++++++++++++++++++++++++++++++++ db/version_set.cc | 6 +++--- util/mutable_cf_options.cc | 6 ++++++ util/mutable_cf_options.h | 3 +++ util/options_helper.cc | 4 ++-- 6 files changed, 59 insertions(+), 7 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 0e47774a7..334b6df0f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1863,12 +1863,16 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->max_mem_compaction_level; + MutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.max_mem_compaction_level; } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->level0_stop_writes_trigger; + MutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.level0_stop_writes_trigger; } Status DBImpl::Flush(const FlushOptions& flush_options, diff --git a/db/db_test.cc b/db/db_test.cc index a68e5686c..cfd9dcd9b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8914,6 +8914,45 @@ TEST(DBTest, DynamicCompactionOptions) { })); dbfull()->TEST_FlushMemTable(true); ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); + + // Test max_mem_compaction_level. 
+ // Destory DB and start from scratch + options.max_background_compactions = 1; + options.max_background_flushes = 0; + options.max_mem_compaction_level = 2; + DestroyAndReopen(&options); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_mem_compaction_level", "1"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_TRUE(dbfull()->SetOptions({ + {"max_mem_compaction_level", "0"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); } TEST(DBTest, DynamicMiscOptions) { diff --git a/db/version_set.cc b/db/version_set.cc index 0819196fb..65c36c715 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -895,7 +895,7 @@ void Version::ComputeCompactionScore( } if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { score = static_cast(total_size) / - cfd_->options()->compaction_options_fifo.max_table_files_size; + cfd_->ioptions()->compaction_options_fifo.max_table_files_size; } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; @@ -1051,8 +1051,8 @@ int Version::PickLevelForMemTableOutput( InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); InternalKey limit(largest_user_key, 0, static_cast(0)); std::vector overlaps; - int max_mem_compact_level = cfd_->options()->max_mem_compaction_level; - while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + while (mutable_cf_options.max_mem_compaction_level > 0 && + level < mutable_cf_options.max_mem_compaction_level) { if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 1b3197b18..5ae26ac81 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -92,6 +92,8 @@ void MutableCFOptions::Dump(Logger* log) const { max_successive_merges); Log(log, " filter_deletes: %d", filter_deletes); + Log(log, " inplace_update_num_locks: %zu", + inplace_update_num_locks); Log(log, " disable_auto_compactions: %d", disable_auto_compactions); Log(log, " soft_rate_limit: %lf", @@ -126,6 +128,10 @@ void MutableCFOptions::Dump(Logger* log) const { } result.resize(result.size() - 2); Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str()); + Log(log, " max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log, " max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); } } // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index c6b312e1f..a738e7978 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -38,6 +38,7 @@ struct 
MutableCFOptions { max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), + max_mem_compaction_level(options.max_mem_compaction_level), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations) { @@ -65,6 +66,7 @@ struct MutableCFOptions { target_file_size_multiplier(0), max_bytes_for_level_base(0), max_bytes_for_level_multiplier(0), + max_mem_compaction_level(0), max_sequential_skip_in_iterations(0) {} @@ -108,6 +110,7 @@ struct MutableCFOptions { uint64_t max_bytes_for_level_base; int max_bytes_for_level_multiplier; std::vector max_bytes_for_level_multiplier_additional; + int max_mem_compaction_level; // Misc options uint64_t max_sequential_skip_in_iterations; diff --git a/util/options_helper.cc b/util/options_helper.cc index 372a7171f..4fef52299 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -144,6 +144,8 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, start = end + 1; } } + } else if (name == "max_mem_compaction_level") { + new_options->max_mem_compaction_level = ParseInt(value); } else { return false; } @@ -283,8 +285,6 @@ bool GetColumnFamilyOptionsFromMap( ParseInt(o.second.substr(start, o.second.size() - start)); } else if (o.first == "num_levels") { new_options->num_levels = ParseInt(o.second); - } else if (o.first == "max_mem_compaction_level") { - new_options->max_mem_compaction_level = ParseInt(o.second); } else if (o.first == "purge_redundant_kvs_while_flush") { new_options->purge_redundant_kvs_while_flush = ParseBoolean(o.first, o.second); From b794194adef63437a9248dc6bcdb5c38a7e335dd Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 23 Oct 2014 15:37:51 -0700 Subject: [PATCH 297/829] Remove java build from travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b52e5acf7..8f1bcb0ae 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 && OPT=-DTRAVIS make rocksdbjava jtest +script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 notifications: email: false From 720c1c056d0b1afde0bb22347fe7965627b3eb11 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 23 Oct 2014 15:41:37 -0700 Subject: [PATCH 298/829] fix erro during merge Summary: as title Test Plan: make release --- util/mutable_cf_options.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 5ae26ac81..1bc8a5b7d 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -92,8 +92,6 @@ void MutableCFOptions::Dump(Logger* log) const { max_successive_merges); Log(log, " filter_deletes: %d", filter_deletes); - Log(log, " inplace_update_num_locks: %zu", - inplace_update_num_locks); Log(log, " disable_auto_compactions: %d", disable_auto_compactions); Log(log, " soft_rate_limit: %lf", From 724fba2b397396978bbe9533c5be81564ffe090e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 23 Oct 2014 15:40:20 -0700 Subject: [PATCH 299/829] Improve the log in Universal Compaction to include more debug information. 
Summary: Previously, the log for Universal Compaction does not include the current number of files in case the compaction is triggered by the number of files. This diff includes the number of files in the log. Test Plan: make --- db/compaction_picker.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 84bd95839..63d621c50 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -553,8 +553,8 @@ Compaction* UniversalCompactionPicker::PickCompaction( if ((c = PickCompactionUniversalReadAmp( mutable_cf_options, version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", - version->cfd_->GetName().c_str()); + LogToBuffer(log_buffer, "[%s] Universal: compacting for file num -- %u\n", + version->cfd_->GetName().c_str(), num_files); } } } From 240ed0cd7b68f3dbaf48fb07e9e861b9fd653fe2 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 23 Oct 2014 15:44:45 -0700 Subject: [PATCH 300/829] Fix uninitialized parameter caused by D24513 Summary: D24513 introduced a bug that a variable is not initialized. It also causes valgrind issue. Test Plan: Run tests used to fail valgrind and make sure it passes Reviewers: yhchiang, ljin, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25569 --- db/internal_stats.h | 1 + 1 file changed, 1 insertion(+) diff --git a/db/internal_stats.h b/db/internal_stats.h index 84fd10289..5caa33415 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -279,6 +279,7 @@ class InternalStats { write_with_wal(0), write_other(0), write_self(0), + num_keys_written(0), seconds_up(0) {} } db_stats_snapshot_; From 001ce64dc7659c65569ffb1c440e26cd23db3c94 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Oct 2014 10:11:57 -0700 Subject: [PATCH 301/829] Use chrono for timing Summary: Since we depend on C++11, we might as well use it for timing, instead of this platform-depended code. Test Plan: Ran autovector_test, which reports time and confirmed that output is similar to master Reviewers: ljin, sdong, yhchiang, rven, dhruba Reviewed By: dhruba Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D25587 --- util/env_posix.cc | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index cf917e874..177932bcd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include #include #include #include @@ -1350,25 +1351,13 @@ class PosixEnv : public Env { } virtual uint64_t NowMicros() { - struct timeval tv; - // TODO(kailiu) MAC DON'T HAVE THIS - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count(); } virtual uint64_t NowNanos() { -#ifdef OS_LINUX - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif __MACH__ - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); -#endif - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count(); } virtual void SleepForMicroseconds(int micros) { From 965d9d50b8cbb413de5e834b5b83ddbb682d0f1d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Oct 2014 11:58:15 -0700 Subject: [PATCH 302/829] Fix timing --- util/env_posix.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 177932bcd..76ba4a6bd 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1351,8 +1351,8 @@ class PosixEnv : public Env { } virtual uint64_t NowMicros() { - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); } virtual uint64_t NowNanos() { From b8ce5264875427fc1f9e94e290c90fd77e200646 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 13 Oct 2014 20:48:44 +0200 Subject: [PATCH 303/829] [RocksJava] Support Snapshots Summary: Snapshots integration into RocksJava. 
Added support for the following functionalities: - getSnapshot - releaseSnapshot - ReadOptions support to set a Snapshot - ReadOptions support to retrieve Snapshot - SnapshotTest Test Plan: make rocksdbjava make jtest Differential Revision: https://reviews.facebook.net/D24801 --- java/Makefile | 1 + java/org/rocksdb/ReadOptions.java | 33 ++++++++++ java/org/rocksdb/RocksDB.java | 35 +++++++++++ java/org/rocksdb/Snapshot.java | 24 ++++++++ java/org/rocksdb/test/SnapshotTest.java | 82 +++++++++++++++++++++++++ java/rocksjni/options.cc | 25 +++++++- java/rocksjni/rocksjni.cc | 22 +++++++ 7 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 java/org/rocksdb/Snapshot.java create mode 100644 java/org/rocksdb/test/SnapshotTest.java diff --git a/java/Makefile b/java/Makefile index ea6b274f6..d490da4e5 100644 --- a/java/Makefile +++ b/java/Makefile @@ -46,6 +46,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.SnapshotTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java index 3590a1a87..5db616eba 100644 --- a/java/org/rocksdb/ReadOptions.java +++ b/java/org/rocksdb/ReadOptions.java @@ -80,6 +80,39 @@ public class ReadOptions extends RocksObject { private native void setFillCache( long handle, boolean fillCache); + /** + *

If "snapshot" is non-nullptr, read as of the supplied snapshot + * (which must belong to the DB that is being read and which must + * not have been released). If "snapshot" is nullptr, use an implicit + * snapshot of the state at the beginning of this read operation. + * Default: null
        + * + * @param snapshot {@link Snapshot} instance + * @return the reference to the current ReadOptions. + */ + public ReadOptions setSnapshot(Snapshot snapshot) { + assert(isInitialized()); + setSnapshot(nativeHandle_, snapshot.nativeHandle_); + return this; + } + private native void setSnapshot(long handle, long snapshotHandle); + + /** + * Returns the currently assigned Snapshot instance. + * + * @return the Snapshot assigned to this instance. If no Snapshot + * is assigned null. + */ + public Snapshot snapshot() { + assert(isInitialized()); + long snapshotHandle = snapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; + } + private native long snapshot(long handle); + /** * Specify to create a tailing iterator -- a special iterator that has a * view of the complete database (i.e. it can also be used to read newly diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index d10c235dc..676f636d4 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -891,6 +891,38 @@ public class RocksDB extends RocksObject { return new RocksIterator(iterator0(nativeHandle_)); } + + /** + *

Return a handle to the current DB state. Iterators created with + * this handle will all observe a stable snapshot of the current DB + * state. The caller must call ReleaseSnapshot(result) when the + * snapshot is no longer needed. + * + * nullptr will be returned if the DB fails to take a snapshot or does + * not support snapshot.
        + * + * @return Snapshot + */ + public Snapshot getSnapshot() { + long snapshotHandle = getSnapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; + } + + /** + * Release a previously acquired snapshot. The caller must not + * use "snapshot" after this call. + * + * @param snapshot {@link Snapshot} instance + */ + public void releaseSnapshot(final Snapshot snapshot) { + if (snapshot != null) { + releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); + } + } + /** * Return a heap-allocated iterator over the contents of the database. * The result of newIterator() is initially invalid (caller must @@ -1052,6 +1084,9 @@ public class RocksDB extends RocksObject { protected native long iterator0(long handle, long cfHandle); protected native long[] iterators(long handle, List columnFamilyNames) throws RocksDBException; + protected native long getSnapshot(long nativeHandle); + protected native void releaseSnapshot( + long nativeHandle, long snapshotHandle); private native void disposeInternal(long handle); private native long createColumnFamily(long handle, String name) throws RocksDBException; diff --git a/java/org/rocksdb/Snapshot.java b/java/org/rocksdb/Snapshot.java new file mode 100644 index 000000000..5817a8b44 --- /dev/null +++ b/java/org/rocksdb/Snapshot.java @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Snapshot of database + */ +public class Snapshot extends RocksObject { + Snapshot(long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + } + + /** + * Dont release C++ Snapshot pointer. The pointer + * to the snapshot is released by the database + * instance. + */ + @Override protected void disposeInternal() { + } +} diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java new file mode 100644 index 000000000..7ff1637e3 --- /dev/null +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -0,0 +1,82 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.test; + +import java.util.ArrayList; +import java.util.List; + +import org.rocksdb.ColumnFamilyHandle; +import org.rocksdb.Options; +import org.rocksdb.ReadOptions; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.Snapshot; +import org.rocksdb.WriteBatch; +import org.rocksdb.WriteOptions; + + +public class SnapshotTest +{ + static final String DB_PATH = "/tmp/rocksdbjni_snapshot_test"; + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args){ + RocksDB db = null; + Options options = new Options(); + options.setCreateIfMissing(true); + try { + db = RocksDB.open(options, DB_PATH); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + ReadOptions readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + // retrieve key value pair + assert(new String(db.get("key".getBytes())) + .equals("value")); + // retrieve key value pair created before + // the snapshot was made + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // add new key/value pair + db.put("newkey".getBytes(), "newvalue".getBytes()); + // using no snapshot the latest db entries + // will be taken into account + assert(new String(db.get("newkey".getBytes())) + .equals("newvalue")); + // snapshopot was created before newkey + assert(db.get(readOptions, "newkey".getBytes()) + == null); + // Retrieve snapshot from read options + Snapshot sameSnapshot = readOptions.snapshot(); + readOptions.setSnapshot(sameSnapshot); + // results must be the same with new Snapshot + // instance using the same native pointer + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // update key value pair to newvalue + db.put("key".getBytes(), "newvalue".getBytes()); + // read with previously created snapshot will + // read previous version of key value pair + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // read for newkey using the snapshot must be + // null + assert(db.get(readOptions, "newkey".getBytes()) + == null); + // release Snapshot + db.releaseSnapshot(snapshot); + // Close database + db.close(); + }catch (RocksDBException e){ + e.printStackTrace(); + assert(false); + } + System.out.println("Passed SnapshotTest"); + } +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index ceb4ce031..3576a8c1e 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1774,8 +1774,6 @@ void Java_org_rocksdb_ReadOptions_setTailing( static_cast(jtailing); } -///////////////////////////////////////////////////////////////////// -// rocksdb::ComparatorOptions /* * Class: org_rocksdb_ComparatorOptions * Method: newComparatorOptions @@ -1819,3 +1817,26 @@ void Java_org_rocksdb_ComparatorOptions_disposeInternal( delete reinterpret_cast(jhandle); rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); } + +/* + * Class: org_rocksdb_ReadOptions + * Method: setSnapshot + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setSnapshot( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) { + reinterpret_cast(jhandle)->snapshot = + reinterpret_cast(jsnapshot); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: snapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_snapshot( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto& snapshot = + reinterpret_cast(jhandle)->snapshot; + return reinterpret_cast(snapshot); +} 
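A minimal usage sketch of the snapshot API added in this patch, modeled on the SnapshotTest above; the class name, database path, and values are illustrative placeholders, not part of the patch itself.

    import org.rocksdb.Options;
    import org.rocksdb.ReadOptions;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.Snapshot;

    public class SnapshotUsageSketch {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) {
        Options options = new Options();
        options.setCreateIfMissing(true);
        try {
          RocksDB db = RocksDB.open(options, "/tmp/rocksdbjni_snapshot_sketch");
          db.put("key".getBytes(), "v1".getBytes());

          // Freeze the current DB state.
          Snapshot snapshot = db.getSnapshot();
          ReadOptions readOptions = new ReadOptions();
          readOptions.setSnapshot(snapshot);

          // A later write is visible to plain reads, but not through the snapshot.
          db.put("key".getBytes(), "v2".getBytes());
          assert(new String(db.get("key".getBytes())).equals("v2"));
          assert(new String(db.get(readOptions, "key".getBytes())).equals("v1"));

          // The caller must release the snapshot when it is no longer needed.
          db.releaseSnapshot(snapshot);
          db.close();
        } catch (RocksDBException e) {
          e.printStackTrace();
        }
      }
    }
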
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index fa9a66a7d..5ba797d7d 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1031,6 +1031,28 @@ void Java_org_rocksdb_RocksDB_dropColumnFamily( } } +/* + * Method: getSnapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_RocksDB_getSnapshot( + JNIEnv* env, jobject jdb, jlong db_handle) { + auto db = reinterpret_cast(db_handle); + const rocksdb::Snapshot* snapshot = db->GetSnapshot(); + return reinterpret_cast(snapshot); +} + +/* + * Method: releaseSnapshot + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_releaseSnapshot( + JNIEnv* env, jobject jdb, jlong db_handle, jlong snapshot_handle) { + auto db = reinterpret_cast(db_handle); + auto snapshot = reinterpret_cast(snapshot_handle); + db->ReleaseSnapshot(snapshot); +} + /* * Class: org_rocksdb_RocksDB * Method: getProperty0 From a1bae76c87c871213b6cb2c00bf7b2a36c2a648d Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 26 Oct 2014 13:27:43 +0100 Subject: [PATCH 304/829] Integrated changes due to review bei ankgup87 --- java/org/rocksdb/ReadOptions.java | 6 +++++- java/org/rocksdb/test/SnapshotTest.java | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java index 5db616eba..aa6977e98 100644 --- a/java/org/rocksdb/ReadOptions.java +++ b/java/org/rocksdb/ReadOptions.java @@ -92,7 +92,11 @@ public class ReadOptions extends RocksObject { */ public ReadOptions setSnapshot(Snapshot snapshot) { assert(isInitialized()); - setSnapshot(nativeHandle_, snapshot.nativeHandle_); + if (snapshot != null) { + setSnapshot(nativeHandle_, snapshot.nativeHandle_); + } else { + setSnapshot(nativeHandle_, 0l); + } return this; } private native void setSnapshot(long handle, long snapshotHandle); diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index 7ff1637e3..67d0a83ef 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -69,6 +69,11 @@ public class SnapshotTest // null assert(db.get(readOptions, "newkey".getBytes()) == null); + // setting null to snapshot in ReadOptions leads + // to no Snapshot being used. + readOptions.setSnapshot(null); + assert(new String(db.get(readOptions, + "newkey".getBytes())).equals("newvalue")); // release Snapshot db.releaseSnapshot(snapshot); // Close database From f1841985e4b70a9973222cd796b361c61dfacdca Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 27 Oct 2014 12:10:13 -0700 Subject: [PATCH 305/829] dynamic inplace_update options Summary: Make inplace_update_support and inplace_update_num_locks dynamic. 
inplace_callback becomes immutable We are almost free of references to cfd->options() in db_impl Test Plan: unit test Reviewers: igor, yhchiang, rven, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25293 --- db/column_family.cc | 5 ++-- db/column_family.h | 2 +- db/db_impl.cc | 16 ++++------ db/memtable.cc | 45 +++++++++++++++-------------- db/memtable.h | 11 +++---- db/repair.cc | 2 +- db/version_set.cc | 3 +- db/write_batch.cc | 17 +++++------ db/write_batch_test.cc | 2 +- include/rocksdb/immutable_options.h | 8 +++++ table/table_test.cc | 6 ++-- util/mutable_cf_options.h | 3 ++ util/options.cc | 2 ++ util/options_helper.cc | 8 ++--- 14 files changed, 70 insertions(+), 60 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index a728a3fd5..0e83e98ab 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -388,12 +388,13 @@ const EnvOptions* ColumnFamilyData::soptions() const { void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } -void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) { +void ColumnFamilyData::CreateNewMemtable( + const MutableCFOptions& mutable_cf_options) { assert(current_ != nullptr); if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, ioptions_, moptions); + mem_ = new MemTable(internal_comparator_, ioptions_, mutable_cf_options); mem_->Ref(); } diff --git a/db/column_family.h b/db/column_family.h index 96b08c52e..b37b684fa 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -198,7 +198,7 @@ class ColumnFamilyData { Version* dummy_versions() { return dummy_versions_; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); - void CreateNewMemtable(const MemTableOptions& moptions); + void CreateNewMemtable(const MutableCFOptions& mutable_cf_options); TableCache* table_cache() const { return table_cache_.get(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index 334b6df0f..40b94acab 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1228,8 +1228,7 @@ Status DBImpl::Recover( if (!s.ok()) { // Clear memtables if recovery failed for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); } } } @@ -1360,8 +1359,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // file-systems cause the DB::Open() to fail. 
return status; } - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); } } } @@ -1398,8 +1396,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // Recovery failed break; } - cfd->CreateNewMemtable(MemTableOptions( - *cfd->GetLatestMutableCFOptions(), *cfd->options())); + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); } // write MANIFEST with update @@ -2749,7 +2746,7 @@ Status DBImpl::ProcessKeyValueCompaction( ColumnFamilyData* cfd = compact->compaction->column_family_data(); MergeHelper merge( cfd->user_comparator(), cfd->ioptions()->merge_operator, - db_options_.info_log.get(), cfd->options()->min_partial_merge_operands, + db_options_.info_log.get(), cfd->ioptions()->min_partial_merge_operands, false /* internal key corruption is expected */); auto compaction_filter = cfd->ioptions()->compaction_filter; std::unique_ptr compaction_filter_from_factory = nullptr; @@ -4281,9 +4278,8 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), - *cfd->ioptions(), MemTableOptions(mutable_cf_options, - *cfd->options())); + new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(), + mutable_cf_options); new_superversion = new SuperVersion(); } } diff --git a/db/memtable.cc b/db/memtable.cc index b9b99a684..8d9d99d7e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -32,7 +32,8 @@ namespace rocksdb { MemTableOptions::MemTableOptions( - const MutableCFOptions& mutable_cf_options, const Options& options) + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) : write_buffer_size(mutable_cf_options.write_buffer_size), arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), @@ -40,21 +41,23 @@ MemTableOptions::MemTableOptions( mutable_cf_options.memtable_prefix_bloom_probes), memtable_prefix_bloom_huge_page_tlb_size( mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), - inplace_update_support(options.inplace_update_support), - inplace_update_num_locks(options.inplace_update_num_locks), - inplace_callback(options.inplace_callback), + inplace_update_support(ioptions.inplace_update_support), + inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), + inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), - filter_deletes(mutable_cf_options.filter_deletes) {} + filter_deletes(mutable_cf_options.filter_deletes), + statistics(ioptions.statistics), + merge_operator(ioptions.merge_operator), + info_log(ioptions.info_log) {} MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableCFOptions& ioptions, - const MemTableOptions& moptions) + const MutableCFOptions& mutable_cf_options) : comparator_(cmp), - ioptions_(ioptions), - moptions_(moptions), + moptions_(ioptions, mutable_cf_options), refs_(0), - kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)), - arena_(moptions.arena_block_size), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + arena_(moptions_.arena_block_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)), num_entries_(0), @@ -63,20 +66,20 @@ MemTable::MemTable(const InternalKeyComparator& cmp, file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - 
locks_(moptions.inplace_update_support ? moptions.inplace_update_num_locks - : 0), + locks_(moptions_.inplace_update_support ? + moptions_.inplace_update_num_locks : 0), prefix_extractor_(ioptions.prefix_extractor), should_flush_(ShouldFlushNow()), flush_scheduled_(false) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. assert(!should_flush_); - if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( &arena_, - moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality, - moptions.memtable_prefix_bloom_probes, nullptr, - moptions.memtable_prefix_bloom_huge_page_tlb_size, + moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality, + moptions_.memtable_prefix_bloom_probes, nullptr, + moptions_.memtable_prefix_bloom_huge_page_tlb_size, ioptions.info_log)); } } @@ -454,10 +457,10 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.status = s; saver.mem = this; saver.merge_context = merge_context; - saver.merge_operator = ioptions_.merge_operator; - saver.logger = ioptions_.info_log; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; - saver.statistics = ioptions_.statistics; + saver.statistics = moptions_.statistics; table_->Get(key, &saver, SaveValue); } @@ -578,12 +581,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq, memcpy(p, prev_buffer, new_prev_size); } } - RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED); + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); - RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN); + RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { diff --git a/db/memtable.h b/db/memtable.h index ce6cce7f6..96af1e90a 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -32,8 +32,8 @@ class MergeContext; struct MemTableOptions { explicit MemTableOptions( - const MutableCFOptions& mutable_cf_options, - const Options& options); + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); size_t write_buffer_size; size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -47,6 +47,9 @@ struct MemTableOptions { std::string* merged_value); size_t max_successive_merges; bool filter_deletes; + Statistics* statistics; + MergeOperator* merge_operator; + Logger* info_log; }; class MemTable { @@ -64,7 +67,7 @@ class MemTable { // is zero and the caller must call Ref() at least once. 
explicit MemTable(const InternalKeyComparator& comparator, const ImmutableCFOptions& ioptions, - const MemTableOptions& moptions); + const MutableCFOptions& mutable_cf_options); ~MemTable(); @@ -199,7 +202,6 @@ class MemTable { const Arena& TEST_GetArena() const { return arena_; } - const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; } const MemTableOptions* GetMemTableOptions() const { return &moptions_; } private: @@ -211,7 +213,6 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; - const ImmutableCFOptions& ioptions_; const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; diff --git a/db/repair.cc b/db/repair.cc index 80fb92bd9..10628c544 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -220,7 +220,7 @@ class Repairer { Slice record; WriteBatch batch; MemTable* mem = new MemTable(icmp_, ioptions_, - MemTableOptions(MutableCFOptions(options_, ioptions_), options_)); + MutableCFOptions(options_, ioptions_)); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); mem->Ref(); int counter = 0; diff --git a/db/version_set.cc b/db/version_set.cc index 65c36c715..ea52d95bf 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2926,8 +2926,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( AppendVersion(new_cfd, v); // GetLatestMutableCFOptions() is safe here without mutex since the // cfd is not available to client - new_cfd->CreateNewMemtable(MemTableOptions( - *new_cfd->GetLatestMutableCFOptions(), *new_cfd->options())); + new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions()); new_cfd->SetLogNumber(edit->log_number_); return new_cfd; } diff --git a/db/write_batch.cc b/db/write_batch.cc index b8d0322d8..6e15ec5c0 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -349,13 +349,12 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); auto* moptions = mem->GetMemTableOptions(); if (!moptions->inplace_update_support) { mem->Add(sequence_, kTypeValue, key, value); } else if (moptions->inplace_callback == nullptr) { mem->Update(sequence_, key, value); - RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED); + RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED); } else { if (mem->UpdateCallback(sequence_, key, value)) { } else { @@ -382,11 +381,11 @@ class MemTableInserter : public WriteBatch::Handler { if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. 
mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } } } @@ -406,7 +405,6 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); auto* moptions = mem->GetMemTableOptions(); bool perform_merge = false; @@ -441,16 +439,16 @@ class MemTableInserter : public WriteBatch::Handler { Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = ioptions->merge_operator; + auto merge_operator = moptions->merge_operator; assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; if (!merge_operator->FullMerge(key, &get_value_slice, operands, - &new_value, ioptions->info_log)) { + &new_value, moptions->info_log)) { // Failed to merge! - RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES); + RecordTick(moptions->statistics, NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; @@ -477,7 +475,6 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - auto* ioptions = mem->GetImmutableOptions(); auto* moptions = mem->GetMemTableOptions(); if (!dont_filter_deletes_ && moptions->filter_deletes) { SnapshotImpl read_from_snapshot; @@ -490,7 +487,7 @@ class MemTableInserter : public WriteBatch::Handler { cf_handle = db_->DefaultColumnFamily(); } if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES); + RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES); return Status::OK(); } } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index cb4048214..d24b2e068 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -29,7 +29,7 @@ static std::string PrintContents(WriteBatch* b) { options.memtable_factory = factory; ImmutableCFOptions ioptions(options); MemTable* mem = new MemTable(cmp, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + MutableCFOptions(options, ioptions)); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index 2dd50f756..49a136c07 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -5,6 +5,7 @@ #pragma once +#include #include #include "rocksdb/options.h" @@ -36,6 +37,13 @@ struct ImmutableCFOptions { CompactionFilterFactoryV2* compaction_filter_factory_v2; + bool inplace_update_support; + + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + Logger* info_log; Statistics* statistics; diff --git a/table/table_test.cc b/table/table_test.cc index 5e1bbe4cf..362905eea 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -439,7 +439,7 @@ class MemTableConstructor: public Constructor { options.memtable_factory = table_factory_; ImmutableCFOptions ioptions(options); memtable_ = new MemTable(internal_comparator_, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + MutableCFOptions(options, ioptions)); memtable_->Ref(); } ~MemTableConstructor() { @@ -455,7 +455,7 @@ class MemTableConstructor: public Constructor { options.memtable_factory = table_factory_; ImmutableCFOptions mem_ioptions(options); memtable_ = 
new MemTable(internal_comparator_, mem_ioptions, - MemTableOptions(MutableCFOptions(options, mem_ioptions), options)); + MutableCFOptions(options, mem_ioptions)); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -1879,7 +1879,7 @@ TEST(MemTableTest, Simple) { options.memtable_factory = table_factory; ImmutableCFOptions ioptions(options); MemTable* memtable = new MemTable(cmp, ioptions, - MemTableOptions(MutableCFOptions(options, ioptions), options)); + MutableCFOptions(options, ioptions)); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index a738e7978..831b0d786 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -22,6 +22,7 @@ struct MutableCFOptions { options.memtable_prefix_bloom_huge_page_tlb_size), max_successive_merges(options.max_successive_merges), filter_deletes(options.filter_deletes), + inplace_update_num_locks(options.inplace_update_num_locks), disable_auto_compactions(options.disable_auto_compactions), soft_rate_limit(options.soft_rate_limit), hard_rate_limit(options.hard_rate_limit), @@ -53,6 +54,7 @@ struct MutableCFOptions { memtable_prefix_bloom_huge_page_tlb_size(0), max_successive_merges(0), filter_deletes(false), + inplace_update_num_locks(0), disable_auto_compactions(false), soft_rate_limit(0), hard_rate_limit(0), @@ -94,6 +96,7 @@ struct MutableCFOptions { size_t memtable_prefix_bloom_huge_page_tlb_size; size_t max_successive_merges; bool filter_deletes; + size_t inplace_update_num_locks; // Compaction related options bool disable_auto_compactions; diff --git a/util/options.cc b/util/options.cc index b5dc98317..03ffb0a6d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -42,6 +42,8 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) compaction_filter(options.compaction_filter), compaction_filter_factory(options.compaction_filter_factory.get()), compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()), + inplace_update_support(options.inplace_update_support), + inplace_callback(options.inplace_callback), info_log(options.info_log.get()), statistics(options.statistics.get()), env(options.env), diff --git a/util/options_helper.cc b/util/options_helper.cc index 4fef52299..9b95150c5 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -94,6 +94,8 @@ bool ParseMemtableOptions(const std::string& name, const std::string& value, new_options->filter_deletes = ParseBoolean(name, value); } else if (name == "max_write_buffer_number") { new_options->max_write_buffer_number = ParseInt(value); + } else if (name == "inplace_update_num_locks") { + new_options->inplace_update_num_locks = ParseInt64(value); } else { return false; } @@ -299,14 +301,12 @@ bool GetColumnFamilyOptionsFromMap( } else if (o.first == "compaction_options_fifo") { new_options->compaction_options_fifo.max_table_files_size = ParseUint64(o.second); - } else if (o.first == "inplace_update_support") { - new_options->inplace_update_support = ParseBoolean(o.first, o.second); - } else if (o.first == "inplace_update_num_locks") { - new_options->inplace_update_num_locks = ParseInt64(o.second); } else if (o.first == "bloom_locality") { new_options->bloom_locality = ParseUint32(o.second); } else if (o.first == "min_partial_merge_operands") { new_options->min_partial_merge_operands = ParseUint32(o.second); + } else if (o.first == "inplace_update_support") { + new_options->inplace_update_support = ParseBoolean(o.first, 
o.second); } else { return false; } From 714c63c5840c8ec85b226db5750bd6723aa38561 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 27 Oct 2014 12:11:16 -0700 Subject: [PATCH 306/829] db_stress for dynamic options Summary: Allow SetOptions() during db_stress test Test Plan: make crash_test Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D25497 --- tools/db_crashtest.py | 1 + tools/db_stress.cc | 161 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 8d0b4f5f7..77bd6ef27 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -98,6 +98,7 @@ def main(argv): --filter_deletes=%s --memtablerep=prefix_hash --prefix_size=7 + --set_options_one_in=10000 """ % (ops_per_thread, threads, write_buf_size, diff --git a/tools/db_stress.cc b/tools/db_stress.cc index b5c79bf3b..2cdf241bb 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -191,6 +191,9 @@ DEFINE_int32(clear_column_family_one_in, 1000000, "it again. If N == 0, never drop/create column families. " "When test_batches_snapshots is true, this flag has no effect"); +DEFINE_int32(set_options_one_in, 0, + "With a chance of 1/N, change some random options"); + DEFINE_int64(cache_size, 2 * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); @@ -372,7 +375,7 @@ static bool ValidatePrefixSize(const char* flagname, int32_t value) { return true; } DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep"); -static const bool FLAGS_prefix_size_dummy = +static const bool FLAGS_prefix_size_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge " @@ -787,8 +790,129 @@ class StressTest { delete db_; } + bool BuildOptionsTable() { + if (FLAGS_set_options_one_in <= 0) { + return true; + } + options_table_ = { + {"write_buffer_size", + { + std::to_string(FLAGS_write_buffer_size), + std::to_string(FLAGS_write_buffer_size * 2), + std::to_string(FLAGS_write_buffer_size * 4) + } + }, + {"max_write_buffer_number", + { + std::to_string(FLAGS_max_write_buffer_number), + std::to_string(FLAGS_max_write_buffer_number * 2), + std::to_string(FLAGS_max_write_buffer_number * 4) + } + }, + {"arena_block_size", + { + std::to_string(Options().arena_block_size), + std::to_string(FLAGS_write_buffer_size / 4), + std::to_string(FLAGS_write_buffer_size / 8), + } + }, + {"memtable_prefix_bloom_bits", {"0", "8", "10"}}, + {"memtable_prefix_bloom_probes", {"4", "5", "6"}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + { + "0", + std::to_string(2 * 1024 * 1024) + } + }, + {"max_successive_merges", {"0", "2", "4"}}, + {"filter_deletes", {"0", "1"}}, + {"inplace_update_num_locks", {"100", "200", "300"}}, + // TODO(ljin): enable test for this option + // {"disable_auto_compactions", {"100", "200", "300"}}, + {"soft_rate_limit", {"0", "0.5", "0.9"}}, + {"hard_rate_limit", {"0", "1.1", "2.0"}}, + {"level0_file_num_compaction_trigger", + { + std::to_string(FLAGS_level0_file_num_compaction_trigger), + std::to_string(FLAGS_level0_file_num_compaction_trigger + 2), + std::to_string(FLAGS_level0_file_num_compaction_trigger + 4), + } + }, + {"level0_slowdown_writes_trigger", + { + std::to_string(FLAGS_level0_slowdown_writes_trigger), + std::to_string(FLAGS_level0_slowdown_writes_trigger + 2), + 
std::to_string(FLAGS_level0_slowdown_writes_trigger + 4), + } + }, + {"level0_stop_writes_trigger", + { + std::to_string(FLAGS_level0_stop_writes_trigger), + std::to_string(FLAGS_level0_stop_writes_trigger + 2), + std::to_string(FLAGS_level0_stop_writes_trigger + 4), + } + }, + {"max_grandparent_overlap_factor", + { + std::to_string(Options().max_grandparent_overlap_factor - 5), + std::to_string(Options().max_grandparent_overlap_factor), + std::to_string(Options().max_grandparent_overlap_factor + 5), + } + }, + {"expanded_compaction_factor", + { + std::to_string(Options().expanded_compaction_factor - 5), + std::to_string(Options().expanded_compaction_factor), + std::to_string(Options().expanded_compaction_factor + 5), + } + }, + {"source_compaction_factor", + { + std::to_string(Options().source_compaction_factor), + std::to_string(Options().source_compaction_factor * 2), + std::to_string(Options().source_compaction_factor * 4), + } + }, + {"target_file_size_base", + { + std::to_string(FLAGS_target_file_size_base), + std::to_string(FLAGS_target_file_size_base * 2), + std::to_string(FLAGS_target_file_size_base * 4), + } + }, + {"target_file_size_multiplier", + { + std::to_string(FLAGS_target_file_size_multiplier), + "1", + "2", + } + }, + {"max_bytes_for_level_base", + { + std::to_string(FLAGS_max_bytes_for_level_base / 2), + std::to_string(FLAGS_max_bytes_for_level_base), + std::to_string(FLAGS_max_bytes_for_level_base * 2), + } + }, + {"max_bytes_for_level_multiplier", + { + std::to_string(FLAGS_max_bytes_for_level_multiplier), + "1", + "2", + } + }, + {"max_mem_compaction_level", {"0", "1", "2"}}, + {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, + }; + for (const auto& iter : options_table_) { + options_index_.push_back(iter.first); + } + return true; + } + bool Run() { PrintEnv(); + BuildOptionsTable(); Open(); SharedState shared(this); uint32_t n = shared.GetNumThreads(); @@ -1169,6 +1293,33 @@ class StressTest { return s; } + bool SetOptions(ThreadState* thread) { + assert(FLAGS_set_options_one_in > 0); + std::unordered_map opts; + std::string name = options_index_[ + thread->rand.Next() % options_index_.size()]; + int value_idx = thread->rand.Next() % options_table_[name].size(); + if (name == "soft_rate_limit" || name == "hard_rate_limit") { + opts["soft_rate_limit"] = options_table_["soft_rate_limit"][value_idx]; + opts["hard_rate_limit"] = options_table_["hard_rate_limit"][value_idx]; + } else if (name == "level0_file_num_compaction_trigger" || + name == "level0_slowdown_writes_trigger" || + name == "level0_stop_writes_trigger") { + opts["level0_file_num_compaction_trigger"] = + options_table_["level0_file_num_compaction_trigger"][value_idx]; + opts["level0_slowdown_writes_trigger"] = + options_table_["level0_slowdown_writes_trigger"][value_idx]; + opts["level0_stop_writes_trigger"] = + options_table_["level0_stop_writes_trigger"][value_idx]; + } else { + opts[name] = options_table_[name][value_idx]; + } + + int rand_cf_idx = thread->rand.Next() % FLAGS_column_families; + auto cfh = column_families_[rand_cf_idx]; + return db_->SetOptions(cfh, opts); + } + void OperateDb(ThreadState* thread) { ReadOptions read_opts(FLAGS_verify_checksum, true); WriteOptions write_opts; @@ -1205,6 +1356,12 @@ class StressTest { } } + // Change Options + if (FLAGS_set_options_one_in > 0 && + thread->rand.OneIn(FLAGS_set_options_one_in)) { + SetOptions(thread); + } + if (!FLAGS_test_batches_snapshots && FLAGS_clear_column_family_one_in != 0 && FLAGS_column_families > 1) { if 
(thread->rand.OneIn(FLAGS_clear_column_family_one_in)) { @@ -1751,6 +1908,8 @@ class StressTest { std::vector column_family_names_; std::atomic new_column_family_name_; int num_times_reopened_; + std::unordered_map> options_table_; + std::vector options_index_; }; } // namespace rocksdb From 679a9671febc301e953627744177c0df4a851104 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 27 Oct 2014 21:32:21 +0100 Subject: [PATCH 307/829] RocksJava Fix after MutableCFOptions change. --- java/rocksjni/write_batch.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 10937db14..13bff26db 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -294,8 +294,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( options.memtable_factory = factory; rocksdb::MemTable* mem = new rocksdb::MemTable( cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options, - rocksdb::ImmutableCFOptions(options)), options)); + rocksdb::MutableCFOptions(options, + rocksdb::ImmutableCFOptions(options))); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); From 48842ab316c81c30068e1b3a6f3161219424cee4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 27 Oct 2014 14:50:21 -0700 Subject: [PATCH 308/829] Deprecate AtomicPointer Summary: RocksDB already depends on C++11, so we might as well all the goodness that C++11 provides. This means that we don't need AtomicPointer anymore. The less things in port/, the easier it will be to port to other platforms. Test Plan: make check + careful visual review verifying that NoBarried got memory_order_relaxed, while Acquire/Release methods got memory_order_acquire and memory_order_release Reviewers: rven, yhchiang, ljin, sdong Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D27543 --- build_tools/build_detect_platform | 9 -- build_tools/fbcode.gcc471.sh | 2 +- build_tools/fbcode.gcc481.sh | 2 +- db/db_bench.cc | 4 +- db/db_impl.cc | 23 +++-- db/db_impl.h | 2 +- db/db_test.cc | 119 +++++++++++----------- db/memtable_list.cc | 10 +- db/memtable_list.h | 6 +- db/skiplist.h | 21 ++-- db/skiplist_test.cc | 23 ++--- port/atomic_pointer.h | 157 ------------------------------ port/port_example.h | 29 ------ port/port_posix.h | 1 - tools/db_repl_stress.cc | 10 +- util/env_test.cc | 25 ++--- util/hash_linklist_rep.cc | 61 ++++++------ util/hash_skiplist_rep.cc | 16 +-- 18 files changed, 167 insertions(+), 353 deletions(-) delete mode 100644 port/atomic_pointer.h diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c026782f6..92839ad4f 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -188,15 +188,6 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Also don't need any compilation tests if compiling on fbcode true else - # If -std=c++0x works, use . Otherwise use port_posix.h. - $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < - int main() {} -EOF - if [ "$?" 
= 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" - fi - # Test whether fallocate is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh index c971cda5b..b5d886730 100644 --- a/build_tools/fbcode.gcc471.sh +++ b/build_tools/fbcode.gcc471.sh @@ -54,7 +54,7 @@ RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2" EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index 5426e3f9a..386ad509b 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -70,7 +70,7 @@ RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" diff --git a/db/db_bench.cc b/db/db_bench.cc index f0fe5e02e..1a379e948 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1533,13 +1533,13 @@ class Benchmark { void AcquireLoad(ThreadState* thread) { int dummy; - port::AtomicPointer ap(&dummy); + std::atomic ap; int count = 0; void *ptr = nullptr; thread->stats.AddMessage("(each op is 1000 loads)"); while (count < 100000) { for (int i = 0; i < 1000; i++) { - ptr = ap.Acquire_Load(); + ptr = ap.load(std::memory_order_acquire); } count++; thread->stats.FinishedOps(nullptr, nullptr, 1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 40b94acab..dc5fc2394 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -326,7 +326,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) stats_(db_options_.statistics.get()), db_lock_(nullptr), mutex_(options.use_adaptive_mutex), - shutting_down_(nullptr), + shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), log_empty_(true), @@ -388,7 +388,7 @@ DBImpl::~DBImpl() { } // Wait for background work to finish - shutting_down_.Release_Store(this); // Any non-nullptr value is ok + shutting_down_.store(true, std::memory_order_release); while (bg_compaction_scheduled_ || bg_flush_scheduled_) { bg_cv_.Wait(); } @@ -1615,7 +1615,8 @@ Status DBImpl::FlushMemTableToOutputFile( Status s = WriteLevel0Table(cfd, mutable_cf_options, mems, edit, &file_number, log_buffer); - if (s.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { + if (s.ok() && + (shutting_down_.load(std::memory_order_acquire) || cfd->IsDropped())) { s = Status::ShutdownInProgress( "Database shutdown or Column family drop during flush"); } @@ -2014,7 +2015,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { bg_schedule_needed_ = false; if (bg_work_gate_closed_) { // gate closed for backgrond work - } else if (shutting_down_.Acquire_Load()) { + } else if 
(shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions } else { bool is_flush_pending = false; @@ -2129,7 +2130,7 @@ void DBImpl::BackgroundCallFlush() { MutexLock l(&mutex_); Status s; - if (!shutting_down_.Acquire_Load()) { + if (!shutting_down_.load(std::memory_order_acquire)) { s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in @@ -2196,7 +2197,7 @@ void DBImpl::BackgroundCallCompaction() { MutexLock l(&mutex_); assert(bg_compaction_scheduled_); Status s; - if (!shutting_down_.Acquire_Load()) { + if (!shutting_down_.load(std::memory_order_acquire)) { s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in @@ -2700,7 +2701,7 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, // flush thread will take care of this return 0; } - if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { + if (cfd->imm()->imm_flush_needed.load(std::memory_order_relaxed)) { const uint64_t imm_start = env_->NowMicros(); mutex_.Lock(); if (cfd->imm()->IsFlushPending()) { @@ -2762,7 +2763,7 @@ Status DBImpl::ProcessKeyValueCompaction( int64_t key_drop_newer_entry = 0; int64_t key_drop_obsolete = 0; int64_t loop_cnt = 0; - while (input->Valid() && !shutting_down_.Acquire_Load() && + while (input->Valid() && !shutting_down_.load(std::memory_order_acquire) && !cfd->IsDropped()) { if (++loop_cnt > 1000) { if (key_drop_user > 0) { @@ -3222,7 +3223,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, shared_ptr backup_input( versions_->MakeInputIterator(compact->compaction)); backup_input->SeekToFirst(); - while (backup_input->Valid() && !shutting_down_.Acquire_Load() && + while (backup_input->Valid() && + !shutting_down_.load(std::memory_order_acquire) && !cfd->IsDropped()) { // FLUSH preempts compaction // TODO(icanadi) this currently only checks if flush is necessary on @@ -3356,7 +3358,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, log_buffer); } // checking for compaction filter v2 - if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { + if (status.ok() && + (shutting_down_.load(std::memory_order_acquire) || cfd->IsDropped())) { status = Status::ShutdownInProgress( "Database shutdown or Column family drop during compaction"); } diff --git a/db/db_impl.h b/db/db_impl.h index 2d5cfe6c2..f730d6ba4 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -488,7 +488,7 @@ class DBImpl : public DB { // State below is protected by mutex_ port::Mutex mutex_; - port::AtomicPointer shutting_down_; + std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't diff --git a/db/db_test.cc b/db/db_test.cc index cfd9dcd9b..3ded0ec97 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -121,25 +121,25 @@ static std::string Key(int i) { class SpecialEnv : public EnvWrapper { public: // sstable Sync() calls are blocked while this pointer is non-nullptr. - port::AtomicPointer delay_sstable_sync_; + std::atomic delay_sstable_sync_; // Drop writes on the floor while this pointer is non-nullptr. - port::AtomicPointer drop_writes_; + std::atomic drop_writes_; // Simulate no-space errors while this pointer is non-nullptr. 
- port::AtomicPointer no_space_; + std::atomic no_space_; // Simulate non-writable file system while this pointer is non-nullptr - port::AtomicPointer non_writable_; + std::atomic non_writable_; // Force sync of manifest files to fail while this pointer is non-nullptr - port::AtomicPointer manifest_sync_error_; + std::atomic manifest_sync_error_; // Force write to manifest files to fail while this pointer is non-nullptr - port::AtomicPointer manifest_write_error_; + std::atomic manifest_write_error_; // Force write to log files to fail while this pointer is non-nullptr - port::AtomicPointer log_write_error_; + std::atomic log_write_error_; bool count_random_reads_; anon::AtomicCounter random_read_counter_; @@ -154,15 +154,15 @@ class SpecialEnv : public EnvWrapper { std::atomic sync_counter_; explicit SpecialEnv(Env* base) : EnvWrapper(base) { - delay_sstable_sync_.Release_Store(nullptr); - drop_writes_.Release_Store(nullptr); - no_space_.Release_Store(nullptr); - non_writable_.Release_Store(nullptr); + delay_sstable_sync_.store(false, std::memory_order_release); + drop_writes_.store(false, std::memory_order_release); + no_space_.store(false, std::memory_order_release); + non_writable_.store(false, std::memory_order_release); count_random_reads_ = false; count_sequential_reads_ = false; - manifest_sync_error_.Release_Store(nullptr); - manifest_write_error_.Release_Store(nullptr); - log_write_error_.Release_Store(nullptr); + manifest_sync_error_.store(false, std::memory_order_release); + manifest_write_error_.store(false, std::memory_order_release); + log_write_error_.store(false, std::memory_order_release); bytes_written_ = 0; sync_counter_ = 0; } @@ -180,10 +180,10 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { - if (env_->drop_writes_.Acquire_Load() != nullptr) { + if (env_->drop_writes_.load(std::memory_order_acquire)) { // Drop writes on the floor return Status::OK(); - } else if (env_->no_space_.Acquire_Load() != nullptr) { + } else if (env_->no_space_.load(std::memory_order_acquire)) { return Status::IOError("No space left on device"); } else { env_->bytes_written_ += data.size(); @@ -194,7 +194,7 @@ class SpecialEnv : public EnvWrapper { Status Flush() { return base_->Flush(); } Status Sync() { ++env_->sync_counter_; - while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { + while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { env_->SleepForMicroseconds(100000); } return base_->Sync(); @@ -211,7 +211,7 @@ class SpecialEnv : public EnvWrapper { ManifestFile(SpecialEnv* env, unique_ptr&& b) : env_(env), base_(std::move(b)) { } Status Append(const Slice& data) { - if (env_->manifest_write_error_.Acquire_Load() != nullptr) { + if (env_->manifest_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); } else { return base_->Append(data); @@ -221,7 +221,7 @@ class SpecialEnv : public EnvWrapper { Status Flush() { return base_->Flush(); } Status Sync() { ++env_->sync_counter_; - if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { + if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated sync error"); } else { return base_->Sync(); @@ -236,7 +236,7 @@ class SpecialEnv : public EnvWrapper { LogFile(SpecialEnv* env, unique_ptr&& b) : env_(env), base_(std::move(b)) { } Status Append(const Slice& data) { - if (env_->log_write_error_.Acquire_Load() != nullptr) { + if (env_->log_write_error_.load(std::memory_order_acquire)) { 
return Status::IOError("simulated writer error"); } else { return base_->Append(data); @@ -250,7 +250,7 @@ class SpecialEnv : public EnvWrapper { } }; - if (non_writable_.Acquire_Load() != nullptr) { + if (non_writable_.load(std::memory_order_acquire)) { return Status::IOError("simulated write error"); } @@ -1211,7 +1211,8 @@ TEST(DBTest, Empty) { handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("1", num); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); Put(1, "k1", std::string(100000, 'x')); // Fill memtable ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); @@ -1223,7 +1224,8 @@ TEST(DBTest, Empty) { ASSERT_EQ("1", num); ASSERT_EQ("v1", Get(1, "foo")); - env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( @@ -1539,12 +1541,14 @@ TEST(DBTest, GetFromImmutableLayer) { ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); Put(1, "k1", std::string(100000, 'x')); // Fill memtable Put(1, "k2", std::string(100000, 'y')); // Trigger flush ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); - env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); } while (ChangeOptions()); } @@ -5776,7 +5780,8 @@ TEST(DBTest, DropWrites) { ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); const int num_files = CountFiles(); - env_->drop_writes_.Release_Store(env_); // Force out-of-space errors + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); for (int i = 0; i < 5; i++) { for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { @@ -5788,7 +5793,7 @@ TEST(DBTest, DropWrites) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("5", property_value); - env_->drop_writes_.Release_Store(nullptr); + env_->drop_writes_.store(false, std::memory_order_release); ASSERT_LT(CountFiles(), num_files + 3); // Check that compaction attempts slept after errors @@ -5805,7 +5810,8 @@ TEST(DBTest, DropWritesFlush) { Reopen(&options); ASSERT_OK(Put("foo", "v1")); - env_->drop_writes_.Release_Store(env_); // Force out-of-space errors + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); std::string property_value; // Background error count is 0 now. 
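A condensed sketch of the conversion pattern applied throughout this patch, using one of the SpecialEnv flags as an example: pointer-valued AtomicPointer flags become std::atomic<bool>, Release_Store/Acquire_Load map to store/load with memory_order_release/memory_order_acquire, and the NoBarrier_* variants map to memory_order_relaxed. The function names below are illustrative only.

    #include <atomic>

    // was: port::AtomicPointer drop_writes_;   (non-nullptr meant "active")
    std::atomic<bool> drop_writes_{false};

    void ForceOutOfSpaceErrors() {
      // was: drop_writes_.Release_Store(env_);
      drop_writes_.store(true, std::memory_order_release);
    }

    bool ShouldDropWrite() {
      // was: drop_writes_.Acquire_Load() != nullptr
      return drop_writes_.load(std::memory_order_acquire);
    }
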
@@ -5829,7 +5835,7 @@ TEST(DBTest, DropWritesFlush) { } ASSERT_EQ("1", property_value); - env_->drop_writes_.Release_Store(nullptr); + env_->drop_writes_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } @@ -5848,12 +5854,13 @@ TEST(DBTest, NoSpaceCompactRange) { ASSERT_OK(Flush()); } - env_->no_space_.Release_Store(env_); // Force out-of-space errors + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); Status s = db_->CompactRange(nullptr, nullptr); ASSERT_TRUE(s.IsIOError()); - env_->no_space_.Release_Store(nullptr); + env_->no_space_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } @@ -5864,7 +5871,8 @@ TEST(DBTest, NonWritableFileSystem) { options.env = env_; Reopen(&options); ASSERT_OK(Put("foo", "v1")); - env_->non_writable_.Release_Store(env_); // Force errors for new files + // Force errors for new files + env_->non_writable_.store(true, std::memory_order_release); std::string big(100000, 'x'); int errors = 0; for (int i = 0; i < 20; i++) { @@ -5874,7 +5882,7 @@ TEST(DBTest, NonWritableFileSystem) { } } ASSERT_GT(errors, 0); - env_->non_writable_.Release_Store(nullptr); + env_->non_writable_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } @@ -5888,7 +5896,7 @@ TEST(DBTest, ManifestWriteError) { // We iterate twice. In the second iteration, everything is the // same except the log record never makes it to the MANIFEST file. for (int iter = 0; iter < 2; iter++) { - port::AtomicPointer* error_type = (iter == 0) + std::atomic* error_type = (iter == 0) ? &env_->manifest_sync_error_ : &env_->manifest_write_error_; @@ -5909,12 +5917,12 @@ TEST(DBTest, ManifestWriteError) { ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level // Merging compaction (will fail) - error_type->Release_Store(env_); + error_type->store(true, std::memory_order_release); dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data - error_type->Release_Store(nullptr); + error_type->store(false, std::memory_order_release); Reopen(&options); ASSERT_EQ("bar", Get("foo")); } @@ -5938,10 +5946,10 @@ TEST(DBTest, PutFailsParanoid) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error - env_->log_write_error_.Release_Store(env_); + env_->log_write_error_.store(true, std::memory_order_release); s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); - env_->log_write_error_.Release_Store(nullptr); + env_->log_write_error_.store(false, std::memory_order_release); s = Put(1, "foo3", "bar3"); // the next put should fail, too ASSERT_TRUE(!s.ok()); @@ -5956,10 +5964,10 @@ TEST(DBTest, PutFailsParanoid) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error - env_->log_write_error_.Release_Store(env_); + env_->log_write_error_.store(true, std::memory_order_release); s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); - env_->log_write_error_.Release_Store(nullptr); + env_->log_write_error_.store(false, std::memory_order_release); s = Put(1, "foo3", "bar3"); // the next put should NOT fail ASSERT_TRUE(s.ok()); @@ -6005,7 +6013,7 @@ TEST(DBTest, BloomFilter) { Flush(1); // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.Release_Store(env_); + env_->delay_sstable_sync_.store(true, std::memory_order_release); // Lookup present keys. Should rarely read from small sstable. 
env_->random_read_counter_.Reset(); @@ -6026,7 +6034,7 @@ TEST(DBTest, BloomFilter) { fprintf(stderr, "%d missing => %d reads\n", N, reads); ASSERT_LE(reads, 3*N/100); - env_->delay_sstable_sync_.Release_Store(nullptr); + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); } @@ -7047,9 +7055,9 @@ static const int kNumKeys = 1000; struct MTState { DBTest* test; - port::AtomicPointer stop; - port::AtomicPointer counter[kNumThreads]; - port::AtomicPointer thread_done[kNumThreads]; + std::atomic stop; + std::atomic counter[kNumThreads]; + std::atomic thread_done[kNumThreads]; }; struct MTThread { @@ -7061,12 +7069,12 @@ static void MTThreadBody(void* arg) { MTThread* t = reinterpret_cast(arg); int id = t->id; DB* db = t->state->test->db_; - uintptr_t counter = 0; + int counter = 0; fprintf(stderr, "... starting thread %d\n", id); Random rnd(1000 + id); char valbuf[1500]; - while (t->state->stop.Acquire_Load() == nullptr) { - t->state->counter[id].Release_Store(reinterpret_cast(counter)); + while (t->state->stop.load(std::memory_order_acquire) == false) { + t->state->counter[id].store(counter, std::memory_order_release); int key = rnd.Uniform(kNumKeys); char keybuf[20]; @@ -7126,8 +7134,7 @@ static void MTThreadBody(void* arg) { ASSERT_EQ(k, key); ASSERT_GE(w, 0); ASSERT_LT(w, kNumThreads); - ASSERT_LE((unsigned int)c, reinterpret_cast( - t->state->counter[w].Acquire_Load())); + ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); ASSERT_EQ(cf, i); if (i == 0) { unique_id = u; @@ -7141,7 +7148,7 @@ static void MTThreadBody(void* arg) { } counter++; } - t->state->thread_done[id].Release_Store(t); + t->state->thread_done[id].store(true, std::memory_order_release); fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); } @@ -7157,10 +7164,10 @@ TEST(DBTest, MultiThreaded) { // Initialize state MTState mt; mt.test = this; - mt.stop.Release_Store(0); + mt.stop.store(false, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].Release_Store(0); - mt.thread_done[id].Release_Store(0); + mt.counter[id].store(0, std::memory_order_release); + mt.thread_done[id].store(false, std::memory_order_release); } // Start threads @@ -7175,9 +7182,9 @@ TEST(DBTest, MultiThreaded) { env_->SleepForMicroseconds(kTestSeconds * 1000000); // Stop the threads and wait for them to finish - mt.stop.Release_Store(&mt); + mt.stop.store(true, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].Acquire_Load() == nullptr) { + while (mt.thread_done[id].load(std::memory_order_acquire) == false) { env_->SleepForMicroseconds(100000); } } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index bd48f1f47..69325c748 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -114,7 +114,7 @@ void MemTableListVersion::Remove(MemTable* m) { bool MemTableList::IsFlushPending() const { if ((flush_requested_ && num_flush_not_started_ >= 1) || (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { - assert(imm_flush_needed.NoBarrier_Load() != nullptr); + assert(imm_flush_needed.load(std::memory_order_relaxed)); return true; } return false; @@ -129,7 +129,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { assert(!m->flush_completed_); num_flush_not_started_--; if (num_flush_not_started_ == 0) { - imm_flush_needed.Release_Store(nullptr); + imm_flush_needed.store(false, std::memory_order_release); } m->flush_in_progress_ = true; // flushing will start very soon ret->push_back(m); @@ -155,7 +155,7 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, num_flush_not_started_++; } pending_outputs->erase(file_number); - imm_flush_needed.Release_Store(reinterpret_cast(1)); + imm_flush_needed.store(true, std::memory_order_release); } // Record a successful flush in the manifest file @@ -236,7 +236,7 @@ Status MemTableList::InstallMemtableFlushResults( num_flush_not_started_++; pending_outputs->erase(m->file_number_); m->file_number_ = 0; - imm_flush_needed.Release_Store((void *)1); + imm_flush_needed.store(true, std::memory_order_release); } ++mem_id; } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && @@ -259,7 +259,7 @@ void MemTableList::Add(MemTable* m) { m->MarkImmutable(); num_flush_not_started_++; if (num_flush_not_started_ == 1) { - imm_flush_needed.Release_Store((void *)1); + imm_flush_needed.store(true, std::memory_order_release); } } diff --git a/db/memtable_list.h b/db/memtable_list.h index d93c7df92..5e16be5cb 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -78,12 +78,12 @@ class MemTableList { public: // A list of memtables. explicit MemTableList(int min_write_buffer_number_to_merge) - : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), + : imm_flush_needed(false), + min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), current_(new MemTableListVersion()), num_flush_not_started_(0), commit_in_progress_(false), flush_requested_(false) { - imm_flush_needed.Release_Store(nullptr); current_->Ref(); } ~MemTableList() {} @@ -92,7 +92,7 @@ class MemTableList { // so that background threads can detect non-nullptr pointer to // determine whether there is anything more to start flushing. 
- port::AtomicPointer imm_flush_needed; + std::atomic imm_flush_needed; // Returns the total number of memtables in the list int size() const; diff --git a/db/skiplist.h b/db/skiplist.h index 751f7c3ec..68c8bceba 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -115,15 +115,14 @@ class SkipList { // Modified only by Insert(). Read racily by readers, but stale // values are ok. - port::AtomicPointer max_height_; // Height of the entire list + std::atomic max_height_; // Height of the entire list // Used for optimizing sequential insert patterns Node** prev_; int32_t prev_height_; inline int GetMaxHeight() const { - return static_cast( - reinterpret_cast(max_height_.NoBarrier_Load())); + return max_height_.load(std::memory_order_relaxed); } // Read/written only by Insert(). @@ -169,35 +168,35 @@ struct SkipList::Node { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return reinterpret_cast(next_[n].Acquire_Load()); + return (next_[n].load(std::memory_order_acquire)); } void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_[n].Release_Store(x); + next_[n].store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next(int n) { assert(n >= 0); - return reinterpret_cast(next_[n].NoBarrier_Load()); + return next_[n].load(std::memory_order_relaxed); } void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - next_[n].NoBarrier_Store(x); + next_[n].store(x, std::memory_order_relaxed); } private: // Array of length equal to the node height. next_[0] is lowest level link. - port::AtomicPointer next_[1]; + std::atomic next_[1]; }; template typename SkipList::Node* SkipList::NewNode(const Key& key, int height) { char* mem = arena_->AllocateAligned( - sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + sizeof(Node) + sizeof(std::atomic) * (height - 1)); return new (mem) Node(key); } @@ -364,7 +363,7 @@ SkipList::SkipList(const Comparator cmp, Arena* arena, compare_(cmp), arena_(arena), head_(NewNode(0 /* any key will do */, max_height)), - max_height_(reinterpret_cast(1)), + max_height_(1), prev_height_(1), rnd_(0xdeadbeef) { assert(kMaxHeight_ > 0); @@ -402,7 +401,7 @@ void SkipList::Insert(const Key& key) { // the loop below. In the former case the reader will // immediately drop to the next level since nullptr sorts after all // keys. In the latter case the reader will use the new node. 
- max_height_.NoBarrier_Store(reinterpret_cast(height)); + max_height_.store(height, std::memory_order_relaxed); } x = NewNode(key, height); diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index b87ddcbb0..48323b244 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -191,13 +191,11 @@ class ConcurrentTest { // Per-key generation struct State { - port::AtomicPointer generation[K]; - void Set(int k, intptr_t v) { - generation[k].Release_Store(reinterpret_cast(v)); - } - intptr_t Get(int k) { - return reinterpret_cast(generation[k].Acquire_Load()); + std::atomic generation[K]; + void Set(int k, int v) { + generation[k].store(v, std::memory_order_release); } + int Get(int k) { return generation[k].load(std::memory_order_acquire); } State() { for (unsigned int k = 0; k < K; k++) { @@ -221,7 +219,7 @@ class ConcurrentTest { // REQUIRES: External synchronization void WriteStep(Random* rnd) { const uint32_t k = rnd->Next() % K; - const intptr_t g = current_.Get(k) + 1; + const int g = current_.Get(k) + 1; const Key key = MakeKey(k, g); list_.Insert(key); current_.Set(k, g); @@ -303,7 +301,7 @@ class TestState { public: ConcurrentTest t_; int seed_; - port::AtomicPointer quit_flag_; + std::atomic quit_flag_; enum ReaderState { STARTING, @@ -312,10 +310,7 @@ class TestState { }; explicit TestState(int s) - : seed_(s), - quit_flag_(nullptr), - state_(STARTING), - state_cv_(&mu_) {} + : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {} void Wait(ReaderState s) { mu_.Lock(); @@ -343,7 +338,7 @@ static void ConcurrentReader(void* arg) { Random rnd(state->seed_); int64_t reads = 0; state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { + while (!state->quit_flag_.load(std::memory_order_acquire)) { state->t_.ReadStep(&rnd); ++reads; } @@ -365,7 +360,7 @@ static void RunConcurrent(int run) { for (int i = 0; i < kSize; i++) { state.t_.WriteStep(&rnd); } - state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do + state.quit_flag_.store(true, std::memory_order_release); state.Wait(TestState::DONE); } } diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h deleted file mode 100644 index db3580bde..000000000 --- a/port/atomic_pointer.h +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// AtomicPointer provides storage for a lock-free pointer. -// Platform-dependent implementation of AtomicPointer: -// - If the platform provides a cheap barrier, we use it with raw pointers -// - If cstdatomic is present (on newer versions of gcc, it is), we use -// a cstdatomic-based AtomicPointer. However we prefer the memory -// barrier based version, because at least on a gcc 4.4 32-bit build -// on linux, we have encountered a buggy -// implementation. Also, some implementations are much -// slower than a memory-barrier based implementation (~16ns for -// based acquire-load vs. ~1ns for a barrier based -// acquire-load). 
-// This code is based on atomicops-internals-* in Google's perftools: -// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase - -#ifndef PORT_ATOMIC_POINTER_H_ -#define PORT_ATOMIC_POINTER_H_ - -#include -#ifdef ROCKSDB_ATOMIC_PRESENT -#include -#endif -#ifdef OS_WIN -#include -#endif -#ifdef OS_MACOSX -#include -#endif - -#if defined(_M_X64) || defined(__x86_64__) -#define ARCH_CPU_X86_FAMILY 1 -#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) -#define ARCH_CPU_X86_FAMILY 1 -#elif defined(__ARMEL__) -#define ARCH_CPU_ARM_FAMILY 1 -#endif - -namespace rocksdb { -namespace port { - -// Define MemoryBarrier() if available -// Windows on x86 -#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) -// windows.h already provides a MemoryBarrier(void) macro -// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Gcc on x86 -#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) -inline void MemoryBarrier() { - // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on - // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. - __asm__ __volatile__("" : : : "memory"); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Sun Studio -#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC) -inline void MemoryBarrier() { - // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on - // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. - asm volatile("" : : : "memory"); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Mac OS -#elif defined(OS_MACOSX) -inline void MemoryBarrier() { - OSMemoryBarrier(); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// ARM Linux -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) -typedef void (*LinuxKernelMemoryBarrierFunc)(void); -// The Linux ARM kernel provides a highly optimized device-specific memory -// barrier function at a fixed memory address that is mapped in every -// user-level process. -// -// This beats using CPU-specific instructions which are, on single-core -// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more -// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking -// shows that the extra function call cost is completely negligible on -// multi-core devices. 
-// -inline void MemoryBarrier() { - (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)(); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -#endif - -// AtomicPointer built using platform-specific MemoryBarrier() -#if defined(ROCKSDB_HAVE_MEMORY_BARRIER) -class AtomicPointer { - private: - void* rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(p) {} - inline void* NoBarrier_Load() const { return rep_; } - inline void NoBarrier_Store(void* v) { rep_ = v; } - inline void* Acquire_Load() const { - void* result = rep_; - MemoryBarrier(); - return result; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } -}; - -// AtomicPointer based on -#elif defined(ROCKSDB_ATOMIC_PRESENT) -class AtomicPointer { - private: - std::atomic rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - -// We have neither MemoryBarrier(), nor -#else -#error Please implement AtomicPointer for this platform. - -#endif - -#undef ROCKSDB_HAVE_MEMORY_BARRIER -#undef ARCH_CPU_X86_FAMILY -#undef ARCH_CPU_ARM_FAMILY - -} // namespace port -} // namespace rocksdb - -#endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port_example.h b/port/port_example.h index f124abb06..ba14618fa 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -75,35 +75,6 @@ typedef intptr_t OnceType; #define LEVELDB_ONCE_INIT 0 extern void InitOnce(port::OnceType*, void (*initializer)()); -// A type that holds a pointer that can be read or written atomically -// (i.e., without word-tearing.) -class AtomicPointer { - private: - intptr_t rep_; - public: - // Initialize to arbitrary value - AtomicPointer(); - - // Initialize to hold v - explicit AtomicPointer(void* v) : rep_(v) { } - - // Read and return the stored pointer with the guarantee that no - // later memory access (read or write) by this thread can be - // reordered ahead of this read. - void* Acquire_Load() const; - - // Set v as the stored pointer with the guarantee that no earlier - // memory access (read or write) by this thread can be reordered - // after this store. - void Release_Store(void* v); - - // Read the stored pointer with no ordering guarantees. - void* NoBarrier_Load() const; - - // Set va as the stored pointer with no ordering guarantees. - void NoBarrier_Store(void* v); -}; - // ------------------ Compression ------------------- // Store the snappy compression of "input[0,input_length-1]" in *output. 
diff --git a/port/port_posix.h b/port/port_posix.h index 2e3c868b3..dae8f7219 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -55,7 +55,6 @@ #include #include #include "rocksdb/options.h" -#include "port/atomic_pointer.h" #ifndef PLATFORM_IS_LITTLE_ENDIAN #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 5970bb684..fbe426573 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -58,7 +58,7 @@ static void DataPumpThreadBody(void* arg) { } struct ReplicationThread { - port::AtomicPointer stop; + std::atomic stop; DB* db; volatile size_t no_read; }; @@ -68,11 +68,11 @@ static void ReplicationThreadBody(void* arg) { DB* db = t->db; unique_ptr iter; SequenceNumber currentSeqNum = 1; - while (t->stop.Acquire_Load() != nullptr) { + while (!t->stop.load(std::memory_order_acquire)) { iter.reset(); Status s; while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) { - if (t->stop.Acquire_Load() == nullptr) { + if (t->stop.load(std::memory_order_acquire)) { return; } } @@ -129,11 +129,11 @@ int main(int argc, const char** argv) { ReplicationThread replThread; replThread.db = db; replThread.no_read = 0; - replThread.stop.Release_Store(env); // store something to make it non-null. + replThread.stop.store(false, std::memory_order_release); env->StartThread(ReplicationThreadBody, &replThread); while(replThread.no_read < FLAGS_num_inserts); - replThread.stop.Release_Store(nullptr); + replThread.stop.store(true, std::memory_order_release); if (replThread.no_read < dataPump.no_records) { // no. read should be => than inserted. fprintf(stderr, "No. of Record's written and read not same\nRead : %zu" diff --git a/util/env_test.cc b/util/env_test.cc index 1779f1aa0..be542e9af 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -44,30 +44,31 @@ class EnvPosixTest { }; static void SetBool(void* ptr) { - reinterpret_cast(ptr)->NoBarrier_Store(ptr); + reinterpret_cast*>(ptr) + ->store(true, std::memory_order_relaxed); } TEST(EnvPosixTest, RunImmediately) { - port::AtomicPointer called (nullptr); + std::atomic called(false); env_->Schedule(&SetBool, &called); Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called.NoBarrier_Load() != nullptr); + ASSERT_TRUE(called.load(std::memory_order_relaxed)); } TEST(EnvPosixTest, RunMany) { - port::AtomicPointer last_id (nullptr); + std::atomic last_id(0); struct CB { - port::AtomicPointer* last_id_ptr; // Pointer to shared slot - uintptr_t id; // Order# for the execution of this callback + std::atomic* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback - CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { } + CB(std::atomic* p, int i) : last_id_ptr(p), id(i) {} static void Run(void* v) { CB* cb = reinterpret_cast(v); - void* cur = cb->last_id_ptr->NoBarrier_Load(); - ASSERT_EQ(cb->id-1, reinterpret_cast(cur)); - cb->last_id_ptr->Release_Store(reinterpret_cast(cb->id)); + int cur = cb->last_id_ptr->load(std::memory_order_relaxed); + ASSERT_EQ(cb->id - 1, cur); + cb->last_id_ptr->store(cb->id, std::memory_order_release); } }; @@ -82,8 +83,8 @@ TEST(EnvPosixTest, RunMany) { env_->Schedule(&CB::Run, &cb4); Env::Default()->SleepForMicroseconds(kDelayMicros); - void* cur = last_id.Acquire_Load(); - ASSERT_EQ(4U, reinterpret_cast(cur)); + int cur = last_id.load(std::memory_order_acquire); + ASSERT_EQ(4, cur); } struct State { diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 
8e3dc5826..8e5f4025d 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -8,12 +8,12 @@ #include "util/hash_linklist_rep.h" #include +#include #include "rocksdb/memtablerep.h" #include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" -#include "port/atomic_pointer.h" #include "util/histogram.h" #include "util/murmurhash.h" #include "db/memtable.h" @@ -24,7 +24,7 @@ namespace { typedef const char* Key; typedef SkipList MemtableSkipList; -typedef port::AtomicPointer Pointer; +typedef std::atomic Pointer; // A data structure used as the header of a link list of a hash bucket. struct BucketHeader { @@ -34,7 +34,9 @@ struct BucketHeader { explicit BucketHeader(void* n, uint32_t count) : next(n), num_entries(count) {} - bool IsSkipListBucket() { return next.NoBarrier_Load() == this; } + bool IsSkipListBucket() { + return next.load(std::memory_order_relaxed) == this; + } }; // A data structure used as the header of a skip list of a hash bucket. @@ -55,24 +57,23 @@ struct Node { Node* Next() { // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return reinterpret_cast(next_.Acquire_Load()); + return next_.load(std::memory_order_acquire); } void SetNext(Node* x) { // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_.Release_Store(x); + next_.store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. Node* NoBarrier_Next() { - return reinterpret_cast(next_.NoBarrier_Load()); + return next_.load(std::memory_order_relaxed); } - void NoBarrier_SetNext(Node* x) { - next_.NoBarrier_Store(x); - } + void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } private: - port::AtomicPointer next_; + std::atomic next_; + public: char key[0]; }; @@ -174,7 +175,7 @@ class HashLinkListRep : public MemTableRep { // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. 
- port::AtomicPointer* buckets_; + Pointer* buckets_; const uint32_t threshold_use_skiplist_; @@ -203,7 +204,7 @@ class HashLinkListRep : public MemTableRep { } Pointer* GetBucket(size_t i) const { - return static_cast(buckets_[i].Acquire_Load()); + return static_cast(buckets_[i].load(std::memory_order_acquire)); } Pointer* GetBucket(const Slice& slice) const { @@ -467,13 +468,13 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, logger_(logger), bucket_entries_logging_threshold_(bucket_entries_logging_threshold), if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { - char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size, + char* mem = arena_->AllocateAligned(sizeof(Pointer) * bucket_size, huge_page_tlb_size, logger); - buckets_ = new (mem) port::AtomicPointer[bucket_size]; + buckets_ = new (mem) Pointer[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { - buckets_[i].NoBarrier_Store(nullptr); + buckets_[i].store(nullptr, std::memory_order_relaxed); } } @@ -492,7 +493,7 @@ SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( if (first_next_pointer == nullptr) { return nullptr; } - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Single entry bucket return nullptr; } @@ -502,8 +503,8 @@ SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( assert(header->num_entries > threshold_use_skiplist_); auto* skip_list_bucket_header = reinterpret_cast(header); - assert(skip_list_bucket_header->Counting_header.next.NoBarrier_Load() == - header); + assert(skip_list_bucket_header->Counting_header.next.load( + std::memory_order_relaxed) == header); return skip_list_bucket_header; } assert(header->num_entries <= threshold_use_skiplist_); @@ -514,7 +515,7 @@ Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { if (first_next_pointer == nullptr) { return nullptr; } - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Single entry bucket return reinterpret_cast(first_next_pointer); } @@ -522,7 +523,8 @@ Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { BucketHeader* header = reinterpret_cast(first_next_pointer); if (!header->IsSkipListBucket()) { assert(header->num_entries <= threshold_use_skiplist_); - return reinterpret_cast(header->next.NoBarrier_Load()); + return reinterpret_cast( + header->next.load(std::memory_order_relaxed)); } assert(header->num_entries > threshold_use_skiplist_); return nullptr; @@ -534,19 +536,20 @@ void HashLinkListRep::Insert(KeyHandle handle) { Slice internal_key = GetLengthPrefixedSlice(x->key); auto transformed = GetPrefix(internal_key); auto& bucket = buckets_[GetHash(transformed)]; - Pointer* first_next_pointer = static_cast(bucket.NoBarrier_Load()); + Pointer* first_next_pointer = + static_cast(bucket.load(std::memory_order_relaxed)); if (first_next_pointer == nullptr) { // Case 1. empty bucket // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. x->NoBarrier_SetNext(nullptr); - bucket.Release_Store(x); + bucket.store(x, std::memory_order_release); return; } BucketHeader* header = nullptr; - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Case 2. only one entry in the bucket // Need to convert to a Counting bucket and turn to case 4. 
Node* first = reinterpret_cast(first_next_pointer); @@ -557,7 +560,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { // think the node is a bucket header. auto* mem = arena_->AllocateAligned(sizeof(BucketHeader)); header = new (mem) BucketHeader(first, 1); - bucket.Release_Store(header); + bucket.store(header, std::memory_order_release); } else { header = reinterpret_cast(first_next_pointer); if (header->IsSkipListBucket()) { @@ -585,7 +588,8 @@ void HashLinkListRep::Insert(KeyHandle handle) { // Case 3. number of entries reaches the threshold so need to convert to // skip list. LinkListIterator bucket_iter( - this, reinterpret_cast(first_next_pointer->NoBarrier_Load())); + this, reinterpret_cast( + first_next_pointer->load(std::memory_order_relaxed))); auto mem = arena_->AllocateAligned(sizeof(SkipListBucketHeader)); SkipListBucketHeader* new_skip_list_header = new (mem) SkipListBucketHeader(compare_, arena_, header->num_entries + 1); @@ -599,11 +603,12 @@ void HashLinkListRep::Insert(KeyHandle handle) { // insert the new entry skip_list.Insert(x->key); // Set the bucket - bucket.Release_Store(new_skip_list_header); + bucket.store(new_skip_list_header, std::memory_order_release); } else { // Case 5. Need to insert to the sorted linked list without changing the // header. - Node* first = reinterpret_cast(header->next.NoBarrier_Load()); + Node* first = + reinterpret_cast(header->next.load(std::memory_order_relaxed)); assert(first != nullptr); // Advance counter unless the bucket needs to be advanced to skip list. // In that case, we need to make sure the previous count never exceeds @@ -640,7 +645,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { if (prev) { prev->SetNext(x); } else { - header->next.Release_Store(static_cast(x)); + header->next.store(static_cast(x), std::memory_order_release); } } } diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 1c7a459bd..f410350e7 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -7,12 +7,13 @@ #ifndef ROCKSDB_LITE #include "util/hash_skiplist_rep.h" +#include + #include "rocksdb/memtablerep.h" #include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" -#include "port/atomic_pointer.h" #include "util/murmurhash.h" #include "db/memtable.h" #include "db/skiplist.h" @@ -54,7 +55,7 @@ class HashSkipListRep : public MemTableRep { // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. - port::AtomicPointer* buckets_; + std::atomic* buckets_; // The user-supplied transform whose domain is the user keys. 
const SliceTransform* transform_; @@ -67,7 +68,7 @@ class HashSkipListRep : public MemTableRep { return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; } inline Bucket* GetBucket(size_t i) const { - return static_cast(buckets_[i].Acquire_Load()); + return buckets_[i].load(std::memory_order_acquire); } inline Bucket* GetBucket(const Slice& slice) const { return GetBucket(GetHash(slice)); @@ -229,12 +230,11 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, transform_(transform), compare_(compare), arena_(arena) { - auto mem = - arena->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size); - buckets_ = new (mem) port::AtomicPointer[bucket_size]; + auto mem = arena->AllocateAligned(sizeof(std::atomic) * bucket_size); + buckets_ = new (mem) std::atomic[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { - buckets_[i].NoBarrier_Store(nullptr); + buckets_[i].store(nullptr, std::memory_order_relaxed); } } @@ -249,7 +249,7 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( auto addr = arena_->AllocateAligned(sizeof(Bucket)); bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, skiplist_branching_factor_); - buckets_[hash].Release_Store(static_cast(bucket)); + buckets_[hash].store(bucket, std::memory_order_release); } return bucket; } From 7c303f0e78ab11dd40ae9a016d774a779851ecff Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 27 Oct 2014 15:03:20 -0700 Subject: [PATCH 309/829] Include atomic --- db/skiplist.h | 1 + 1 file changed, 1 insertion(+) diff --git a/db/skiplist.h b/db/skiplist.h index 68c8bceba..4ee4ed714 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -32,6 +32,7 @@ #pragma once #include +#include #include #include "util/arena.h" #include "port/port.h" From c1c68bce439ad7d392501cefffad921552af8d91 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 27 Oct 2014 15:12:20 -0700 Subject: [PATCH 310/829] remove atomic_pointer.h references --- table/table_reader_bench.cc | 1 - tools/db_repl_stress.cc | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 52fa20ec0..e6960f751 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -18,7 +18,6 @@ int main() { #include "rocksdb/table.h" #include "db/db_impl.h" #include "db/dbformat.h" -#include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index fbe426573..ec18ab512 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -12,13 +12,13 @@ int main() { #else #include +#include #include #include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/types.h" -#include "port/atomic_pointer.h" #include "util/testutil.h" // Run a thread to perform Put's. From 56ef2caaa5ebfe855a4b8d76eec4d33cf04e731d Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 27 Oct 2014 20:55:57 +0100 Subject: [PATCH 311/829] [RocksJava] - Hardening RocksIterator RocksIterator will sometimes Sigsegv on dispose. Mainly thats related to dispose order. If the related RocksDB instance is freed beforehand RocksIterator.dispose() will fail. Within this commit there is a major change to RocksIterator. RocksIterator will hold a private reference to the RocksDB instance which created the RocksIterator. So even if RocksDB is freed in the same GC cycle the RocksIterator instances will be freed prior to related RocksDB instances. 
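A minimal sketch of the ownership idea described above, using hypothetical class names rather than the actual RocksJava API: the iterator keeps a strong reference to the database object that created it, so the database cannot be garbage collected (and its native handle freed) while the iterator is still reachable, and dispose only touches native memory while the owner is still initialized.

    // Sketch only: hypothetical names, not the real RocksJava classes.
    class NativeHandleHolder {
      protected long nativeHandle_ = 0;
      protected boolean isInitialized() { return nativeHandle_ != 0; }
    }

    class Db extends NativeHandleHolder {
      Db(long handle) { nativeHandle_ = handle; }
    }

    class DbIterator extends NativeHandleHolder {
      // The strong reference keeps the owning Db reachable, so the GC cannot
      // collect (and free) the Db before this iterator is disposed.
      private final Db owner_;

      DbIterator(Db owner, long handle) {
        assert owner != null;
        owner_ = owner;
        nativeHandle_ = handle;
      }

      void dispose() {
        // Free the native iterator only while the owning Db is still valid.
        if (owner_.isInitialized() && isInitialized()) {
          // JNI call to delete the C++ iterator would go here.
          nativeHandle_ = 0;
        }
      }
    }
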
Another aspect targets the dispose logic if the RocksDB is freed previously and already gc`ed. On dispose of a RocksIterator the dispose logic will check if the RocksDB instance points to an initialized DB. If not the dispose logic will not perform any further action. The crash can be reproduced by using the related test provided within this commit. Related information: This relates to @adamretter`s facebook rocksdb-dev group post about SigSegv on RocksIterator.dispose(). --- java/Makefile | 1 + java/org/rocksdb/RocksDB.java | 7 +-- java/org/rocksdb/RocksIterator.java | 9 +++- java/org/rocksdb/test/RocksIteratorTest.java | 48 ++++++++++++++++++++ 4 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 java/org/rocksdb/test/RocksIteratorTest.java diff --git a/java/Makefile b/java/Makefile index d490da4e5..8c147f9b4 100644 --- a/java/Makefile +++ b/java/Makefile @@ -46,6 +46,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.RocksIteratorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.SnapshotTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 676f636d4..ca26a596d 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -888,7 +888,7 @@ public class RocksDB extends RocksObject { * @return instance of iterator object. */ public RocksIterator newIterator() { - return new RocksIterator(iterator0(nativeHandle_)); + return new RocksIterator(this, iterator0(nativeHandle_)); } @@ -936,7 +936,8 @@ public class RocksDB extends RocksObject { * @return instance of iterator object. 
*/ public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) { - return new RocksIterator(iterator0(nativeHandle_, columnFamilyHandle.nativeHandle_)); + return new RocksIterator(this, iterator0(nativeHandle_, + columnFamilyHandle.nativeHandle_)); } /** @@ -958,7 +959,7 @@ public class RocksDB extends RocksObject { long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList); for (int i=0; i Date: Mon, 27 Oct 2014 15:41:05 -0700 Subject: [PATCH 312/829] Include atomic in env_test --- db/db_bench.cc | 2 +- util/env_test.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 1a379e948..fcc930e67 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1533,7 +1533,7 @@ class Benchmark { void AcquireLoad(ThreadState* thread) { int dummy; - std::atomic ap; + std::atomic ap(&dummy); int count = 0; void *ptr = nullptr; thread->stats.AddMessage("(each op is 1000 loads)"); diff --git a/util/env_test.cc b/util/env_test.cc index be542e9af..3d7a9a4db 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -11,6 +11,7 @@ #include #include +#include #ifdef OS_LINUX #include From b08c39e14f1119ecb462e6264ad0ccf390d1d712 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 27 Oct 2014 23:53:27 +0100 Subject: [PATCH 313/829] [RocksJava] RocksIterator: Assert for valid RocksDB instance & documentation --- java/org/rocksdb/RocksIterator.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index ddaddbf95..98b7f6efb 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -22,6 +22,11 @@ public class RocksIterator extends RocksObject { public RocksIterator(RocksDB rocksDB, long nativeHandle) { super(); nativeHandle_ = nativeHandle; + // rocksDB must point to a valid RocksDB instance. + assert(rocksDB); + // RocksIterator must hold a reference to the related RocksDB instance + // to guarantee that while a GC cycle starts RocksDBIterator instances + // are freed prior to RocksDB instances. rocksDB_ = rocksDB; } @@ -126,7 +131,12 @@ public class RocksIterator extends RocksObject { } /** - * Deletes underlying C++ iterator pointer. + *
<p>Deletes underlying C++ iterator pointer.</p> + * + * <p>Note: the underlying handle can only be safely deleted if the RocksDB + * instance related to a certain RocksIterator is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the RocksDB is initialized + * before freeing the native handle.</p>
        */ @Override protected void disposeInternal() { assert(isInitialized()); From 45e756f04a08a9edfd45f422ed1cd5d78bd782ef Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 00:07:53 +0100 Subject: [PATCH 314/829] [RocksJava] Minor correction to the previous pull request merge --- java/org/rocksdb/RocksIterator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index 98b7f6efb..12377b6df 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -23,7 +23,7 @@ public class RocksIterator extends RocksObject { super(); nativeHandle_ = nativeHandle; // rocksDB must point to a valid RocksDB instance. - assert(rocksDB); + assert(rocksDB != null); // RocksIterator must hold a reference to the related RocksDB instance // to guarantee that while a GC cycle starts RocksDBIterator instances // are freed prior to RocksDB instances. From 5187d896b943930cf44e5f345311c9062806534d Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 09:59:56 -0700 Subject: [PATCH 315/829] unfriend Compaction and CompactionPicker from VersionSet Summary: as title Test Plan: running make all check Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27549 --- db/compaction.cc | 6 +++--- db/compaction_picker.cc | 15 +++++++------- db/version_set.cc | 19 ++++++++++++++--- db/version_set.h | 45 +++++++++++++++++++++++++---------------- 4 files changed, 54 insertions(+), 31 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index f02feeee7..2802044a4 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -41,7 +41,7 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), input_version_(input_version), number_levels_(input_version_->NumberLevels()), - cfd_(input_version_->cfd_), + cfd_(input_version_->cfd()), output_path_id_(output_path_id), output_compression_(output_compression), seek_compaction_(seek_compaction), @@ -119,7 +119,7 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { // Maybe use binary search to find right entry instead of linear search? 
const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; + const std::vector& files = input_version_->LevelFiles(lvl); for (; level_ptrs_[lvl] < files.size(); ) { FileMetaData* f = files[level_ptrs_[lvl]]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { @@ -217,7 +217,7 @@ void Compaction::ReleaseCompactionFiles(Status status) { } void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(start_level_); + input_version_->SetNextCompactionIndex(start_level_, 0); } namespace { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 63d621c50..42887e057 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -331,7 +331,7 @@ Compaction* CompactionPicker::CompactRange( delete c; Log(ioptions_.info_log, "[%s] Could not compact due to expansion failure.\n", - version->cfd_->GetName().c_str()); + version->cfd()->GetName().c_str()); return nullptr; } @@ -455,22 +455,21 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = c->input_version_->files_by_size_[level]; + std::vector& file_size = version->files_by_size_[level]; // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + for (unsigned int i = version->NextCompactionIndex(level); i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = c->input_version_->files_[level][index]; + FileMetaData* f = version->files_[level][index]; // Check to verify files are arranged in descending compensated size. assert((i == file_size.size() - 1) || (i >= Version::number_of_files_to_sort_ - 1) || (f->compensated_file_size >= - c->input_version_->files_[level][file_size[i + 1]]-> - compensated_file_size)); + version->files_[level][file_size[i + 1]]->compensated_file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. @@ -486,7 +485,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Do not pick this file if its parents at level+1 are being compacted. 
// Maybe we can avoid redoing this work in SetupOtherInputs int parent_index = -1; - if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, + if (ParentRangeInCompaction(version, &f->smallest, &f->largest, level, &parent_index)) { continue; } @@ -502,7 +501,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( } // store where to start the iteration in the next call to PickCompaction - version->next_file_to_compact_by_size_[level] = nextIndex; + version->SetNextCompactionIndex(level, nextIndex); return c; } diff --git a/db/version_set.cc b/db/version_set.cc index ea52d95bf..88f66ad51 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -615,6 +615,8 @@ uint64_t Version::GetEstimatedActiveKeys() { void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { + assert(finalized_); + // Merge all level zero files together since they may overlap for (size_t i = 0; i < file_levels_[0].num_files; i++) { const auto& file = file_levels_[0].files[i]; @@ -675,7 +677,8 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, accumulated_raw_value_size_(0), accumulated_num_non_deletions_(0), accumulated_num_deletions_(0), - num_samples_(0) { + num_samples_(0), + finalized_(false) { if (cfd != nullptr && cfd->current() != nullptr) { accumulated_file_size_ = cfd->current()->accumulated_file_size_; accumulated_raw_key_size_ = cfd->current()->accumulated_raw_key_size_; @@ -942,13 +945,20 @@ void Version::ComputeCompactionScore( } namespace { + +// used to sort files by size +struct Fsize { + int index; + FileMetaData* file; +}; + // Compator that is used to sort files based on their size // In normal mode: descending size -bool CompareCompensatedSizeDescending(const Version::Fsize& first, - const Version::Fsize& second) { +bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > second.file->compensated_file_size); } + } // anonymous namespace void Version::UpdateNumNonEmptyLevels() { @@ -1683,6 +1693,9 @@ VersionSet::~VersionSet() { void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { + // Mark v finalized + v->finalized_ = true; + // Make "v" current assert(v->refs_ == 0); Version* current = column_family_data->current(); diff --git a/db/version_set.h b/db/version_set.h index 93e9e0c9d..2c5b3a8a7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -41,7 +41,6 @@ namespace rocksdb { namespace log { class Writer; } class Compaction; -class CompactionPicker; class Iterator; class LogBuffer; class LookupKey; @@ -87,7 +86,6 @@ class Version { // Append to *iters a sequence of iterators that will // yield the contents of this Version when merged together. // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder); @@ -179,8 +177,11 @@ class Version { int NumberLevels() const { return num_levels_; } - // REQUIRES: lock is held - int NumLevelFiles(int level) const { return files_[level].size(); } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int NumLevelFiles(int level) const { + assert(finalized_); + return files_[level].size(); + } // Return the combined file size of all files at the specified level. 
uint64_t NumLevelBytes(int level) const; @@ -242,19 +243,31 @@ class Version { size_t GetMemoryUsageByTableReaders(); - // used to sort files by size - struct Fsize { - int index; - FileMetaData* file; - }; + ColumnFamilyData* cfd() const { return cfd_; } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& LevelFiles(int level) const { + assert(finalized_); + return files_[level]; + } + + // REQUIRES: lock is held + // Set the index that is used to offset into files_by_size_ to find + // the next compaction candidate file. + void SetNextCompactionIndex(int level, int index) { + next_file_to_compact_by_size_[level] = index; + } + + // REQUIRES: lock is held + int NextCompactionIndex(int level) const { + return next_file_to_compact_by_size_[level]; + } private: - friend class Compaction; friend class VersionSet; friend class DBImpl; friend class CompactedDBImpl; friend class ColumnFamilyData; - friend class CompactionPicker; friend class LevelCompactionPicker; friend class UniversalCompactionPicker; friend class FIFOCompactionPicker; @@ -356,13 +369,11 @@ class Version { // the number of samples uint64_t num_samples_; - ~Version(); + // Used to assert APIs that are only safe to use after the version + // is finalized + bool finalized_; - // re-initializes the index that is used to offset into files_by_size_ - // to find the next compaction candidate file. - void ResetNextCompactionIndex(int level) { - next_file_to_compact_by_size_[level] = 0; - } + ~Version(); // No copying allowed Version(const Version&); From a28b3c438887c66544ad4d912f7f41484d114f8b Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 10:00:51 -0700 Subject: [PATCH 316/829] unfriend UniversalCompactionPicker,LevelCompactionPicker and FIFOCompactionPicker from VersionSet Summary: as title Test Plan: make release I will run make all check for all stacked diffs before commit Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27573 --- db/compaction_picker.cc | 121 ++++++++++++++++++++-------------------- db/version_set.cc | 4 +- db/version_set.h | 29 ++++++---- 3 files changed, 83 insertions(+), 71 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 42887e057..6377ebc64 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -372,12 +372,11 @@ Compaction* LevelCompactionPicker::PickCompaction( // // Find the compactions by size on all levels. 
for (int i = 0; i < NumberLevels() - 1; i++) { - assert(i == 0 || - version->compaction_score_[i] <= version->compaction_score_[i - 1]); - level = version->compaction_level_[i]; - if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(mutable_cf_options, version, level, - version->compaction_score_[i]); + double score = version->CompactionScore(i); + assert(i == 0 || score <= version->CompactionScore(i - 1)); + level = version->CompactionScoreLevel(i); + if (score >= 1) { + c = PickCompactionBySize(mutable_cf_options, version, level, score); if (c == nullptr || ExpandWhileOverlapping(c) == false) { delete c; c = nullptr; @@ -455,7 +454,8 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = version->files_by_size_[level]; + const std::vector& file_size = version->FilesBySize(level); + const std::vector& level_files = version->LevelFiles(level); // record the first file that is not yet compacted int nextIndex = -1; @@ -463,13 +463,13 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( for (unsigned int i = version->NextCompactionIndex(level); i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = version->files_[level][index]; + FileMetaData* f = level_files[index]; // Check to verify files are arranged in descending compensated size. assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_ - 1) || + (i >= Version::kNumberFilesToSort - 1) || (f->compensated_file_size >= - version->files_[level][file_size[i + 1]]->compensated_file_size)); + level_files[file_size[i + 1]]->compensated_file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. @@ -512,26 +512,27 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( Compaction* UniversalCompactionPicker::PickCompaction( const MutableCFOptions& mutable_cf_options, Version* version, LogBuffer* log_buffer) { - int level = 0; - double score = version->compaction_score_[0]; + const int kLevel0 = 0; + double score = version->CompactionScore(kLevel0); + const std::vector& level_files = version->LevelFiles(kLevel0); - if ((version->files_[level].size() < + if ((level_files.size() < (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger)) { LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", - version->cfd_->GetName().c_str()); + version->cfd()->GetName().c_str()); return nullptr; } Version::FileSummaryStorage tmp; LogToBuffer(log_buffer, 3072, "[%s] Universal: candidate files(%zu): %s\n", - version->cfd_->GetName().c_str(), version->files_[level].size(), - version->LevelFileSummary(&tmp, 0)); + version->cfd()->GetName().c_str(), level_files.size(), + version->LevelFileSummary(&tmp, kLevel0)); // Check for size amplification first. Compaction* c; if ((c = PickCompactionUniversalSizeAmp( mutable_cf_options, version, score, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", - version->cfd_->GetName().c_str()); + version->cfd()->GetName().c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. 
@@ -541,31 +542,31 @@ Compaction* UniversalCompactionPicker::PickCompaction( mutable_cf_options, version, score, ratio, UINT_MAX, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", - version->cfd_->GetName().c_str()); + version->cfd()->GetName().c_str()); } else { // Size amplification and file size ratios are within configured limits. // If max read amplification is exceeding configured limits, then force // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. - unsigned int num_files = version->files_[level].size() - + unsigned int num_files = level_files.size() - mutable_cf_options.level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( mutable_cf_options, version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for file num -- %u\n", - version->cfd_->GetName().c_str(), num_files); + version->cfd()->GetName().c_str(), num_files); } } } if (c == nullptr) { return nullptr; } - assert(c->inputs_[0].size() > 1); + assert(c->inputs_[kLevel0].size() > 1); // validate that all the chosen files are non overlapping in time FileMetaData* newerfile __attribute__((unused)) = nullptr; - for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { - FileMetaData* f = c->inputs_[0][i]; + for (unsigned int i = 0; i < c->inputs_[kLevel0].size(); i++) { + FileMetaData* f = c->inputs_[kLevel0][i]; assert (f->smallest_seqno <= f->largest_seqno); assert(newerfile == nullptr || newerfile->smallest_seqno > f->largest_seqno); @@ -573,23 +574,22 @@ Compaction* UniversalCompactionPicker::PickCompaction( } // Is the earliest file part of this compaction? - FileMetaData* last_file = c->input_version_->files_[level].back(); - c->bottommost_level_ = c->inputs_[0].files.back() == last_file; + FileMetaData* last_file = level_files.back(); + c->bottommost_level_ = c->inputs_[kLevel0].files.back() == last_file; // update statistics MeasureTime(ioptions_.statistics, - NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[0].size()); + NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[kLevel0].size()); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); + compactions_in_progress_[kLevel0].insert(c); // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = - (c->inputs_[0].size() == c->input_version_->files_[0].size()); + c->is_full_compaction_ = (c->inputs_[kLevel0].size() == level_files.size()); c->mutable_cf_options_ = mutable_cf_options; return c; @@ -634,7 +634,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( const MutableCFOptions& mutable_cf_options, Version* version, double score, unsigned int ratio, unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { - int level = 0; + const int kLevel0 = 0; unsigned int min_merge_width = ioptions_.compaction_options_universal.min_merge_width; @@ -642,7 +642,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( ioptions_.compaction_options_universal.max_merge_width; // The files are sorted from newest first to oldest last. 
- const auto& files = version->files_[level]; + const auto& files = version->LevelFiles(kLevel0); FileMetaData* f = nullptr; bool done = false; @@ -669,7 +669,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } LogToBuffer(log_buffer, "[%s] Universal: file %" PRIu64 "[%d] being compacted, skipping", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), loop); + version->cfd()->GetName().c_str(), f->fd.GetNumber(), loop); f = nullptr; } @@ -681,7 +681,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: Possible candidate file %s[%d].", - version->cfd_->GetName().c_str(), file_num_buf, loop); + version->cfd()->GetName().c_str(), file_num_buf, loop); } // Check if the suceeding files need compaction. @@ -732,7 +732,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( LogToBuffer(log_buffer, "[%s] Universal: Skipping file %" PRIu64 "[%d] with size %" PRIu64 " (compensated size %" PRIu64 ") %d\n", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), i, + version->cfd()->GetName().c_str(), f->fd.GetNumber(), i, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted); } @@ -748,7 +748,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( int ratio_to_compress = ioptions_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { - uint64_t total_size = version->NumLevelBytes(level); + uint64_t total_size = version->NumLevelBytes(kLevel0); uint64_t older_file_size = 0; for (unsigned int i = files.size() - 1; i >= first_index_after; i--) { @@ -766,14 +766,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } uint32_t path_id = GetPathId(ioptions_, estimated_total_size); - Compaction* c = new Compaction( - version, level, level, mutable_cf_options.MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(ioptions_, level, + Compaction* c = new Compaction(version, kLevel0, kLevel0, + mutable_cf_options.MaxFileSizeForLevel(kLevel0), + LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel0, enable_compression)); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { - FileMetaData* f = c->input_version_->files_[level][i]; + FileMetaData* f = files[i]; c->inputs_[0].files.push_back(f); char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, @@ -781,7 +781,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( LogToBuffer(log_buffer, "[%s] Universal: Picking file %s[%d] " "with size %" PRIu64 " (compensated size %" PRIu64 ")\n", - version->cfd_->GetName().c_str(), file_num_buf, i, + version->cfd()->GetName().c_str(), file_num_buf, i, f->fd.GetFileSize(), f->compensated_file_size); } return c; @@ -796,14 +796,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( const MutableCFOptions& mutable_cf_options, Version* version, double score, LogBuffer* log_buffer) { - int level = 0; + const int kLevel = 0; // percentage flexibilty while reducing size amplification uint64_t ratio = ioptions_.compaction_options_universal. max_size_amplification_percent; // The files are sorted from newest first to oldest last. 
- const auto& files = version->files_[level]; + const auto& files = version->LevelFiles(kLevel); unsigned int candidate_count = 0; uint64_t candidate_size = 0; @@ -821,10 +821,11 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: skipping file %s[%d] compacted %s", - version->cfd_->GetName().c_str(), file_num_buf, loop, + version->cfd()->GetName().c_str(), file_num_buf, loop, " cannot be a candidate to reduce size amp.\n"); f = nullptr; } + if (f == nullptr) { return nullptr; // no candidate files } @@ -833,7 +834,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: First candidate file %s[%d] %s", - version->cfd_->GetName().c_str(), file_num_buf, start_index, + version->cfd()->GetName().c_str(), file_num_buf, start_index, " to reduce size amp.\n"); // keep adding up all the remaining files @@ -845,7 +846,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( sizeof(file_num_buf)); LogToBuffer( log_buffer, "[%s] Universal: Possible candidate file %s[%d] %s.", - version->cfd_->GetName().c_str(), file_num_buf, loop, + version->cfd()->GetName().c_str(), file_num_buf, loop, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -865,14 +866,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( log_buffer, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); + version->cfd()->GetName().c_str(), candidate_size, earliest_file_size); return nullptr; } else { LogToBuffer( log_buffer, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); + version->cfd()->GetName().c_str(), candidate_size, earliest_file_size); } assert(start_index < files.size() - 1); @@ -886,17 +887,17 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. 
Compaction* c = - new Compaction(version, level, level, - mutable_cf_options.MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(ioptions_, level)); + new Compaction(version, kLevel, kLevel, + mutable_cf_options.MaxFileSizeForLevel(kLevel), + LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel)); c->score_ = score; for (unsigned int loop = start_index; loop < files.size(); loop++) { - f = c->input_version_->files_[level][loop]; + f = files[loop]; c->inputs_[0].files.push_back(f); LogToBuffer(log_buffer, "[%s] Universal: size amp picking file %" PRIu64 "[%d] " "with size %" PRIu64 " (compensated size %" PRIu64 ")", - version->cfd_->GetName().c_str(), + version->cfd()->GetName().c_str(), f->fd.GetNumber(), loop, f->fd.GetFileSize(), f->compensated_file_size); } @@ -907,18 +908,20 @@ Compaction* FIFOCompactionPicker::PickCompaction( const MutableCFOptions& mutable_cf_options, Version* version, LogBuffer* log_buffer) { assert(version->NumberLevels() == 1); + const int kLevel0 = 0; + const std::vector& level_files = version->LevelFiles(kLevel0); uint64_t total_size = 0; - for (const auto& file : version->files_[0]) { + for (const auto& file : level_files) { total_size += file->compensated_file_size; } if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || - version->files_[0].size() == 0) { + level_files.size() == 0) { // total size not exceeded LogToBuffer(log_buffer, "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 ", max size %" PRIu64 "\n", - version->cfd_->GetName().c_str(), total_size, + version->cfd()->GetName().c_str(), total_size, ioptions_.compaction_options_fifo.max_table_files_size); return nullptr; } @@ -927,15 +930,14 @@ Compaction* FIFOCompactionPicker::PickCompaction( LogToBuffer(log_buffer, "[%s] FIFO compaction: Already executing compaction. 
No need " "to run parallel compactions since compactions are very fast", - version->cfd_->GetName().c_str()); + version->cfd()->GetName().c_str()); return nullptr; } Compaction* c = new Compaction(version, 0, 0, 0, 0, 0, kNoCompression, false, true /* is deletion compaction */); // delete old files (FIFO) - for (auto ritr = version->files_[0].rbegin(); - ritr != version->files_[0].rend(); ++ritr) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { auto f = *ritr; total_size -= f->compensated_file_size; c->inputs_[0].files.push_back(f); @@ -943,7 +945,8 @@ Compaction* FIFOCompactionPicker::PickCompaction( AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), tmp_fsize); + version->cfd()->GetName().c_str(), f->fd.GetNumber(), + tmp_fsize); if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { break; } diff --git a/db/version_set.cc b/db/version_set.cc index 88f66ad51..b121424d7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -991,8 +991,8 @@ void Version::UpdateFilesBySize() { temp[i].file = files[i]; } - // sort the top number_of_files_to_sort_ based on file size - size_t num = Version::number_of_files_to_sort_; + // sort the top kNumberFilesToSort based on file size + size_t num = Version::kNumberFilesToSort; if (num > temp.size()) { num = temp.size(); } diff --git a/db/version_set.h b/db/version_set.h index 2c5b3a8a7..95e38fca4 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -130,6 +130,12 @@ class Version { // See field declaration int MaxCompactionScoreLevel() const { return max_compaction_score_level_; } + // Return level number that has idx'th highest score + int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } + + // Return idx'th highest score + double CompactionScore(int idx) const { return compaction_score_[idx]; } + void GetOverlappingInputs( int level, const InternalKey* begin, // nullptr means before all keys @@ -251,6 +257,12 @@ class Version { return files_[level]; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& FilesBySize(int level) const { + assert(finalized_); + return files_by_size_[level]; + } + // REQUIRES: lock is held // Set the index that is used to offset into files_by_size_ to find // the next compaction candidate file. @@ -263,14 +275,18 @@ class Version { return next_file_to_compact_by_size_[level]; } + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t kNumberFilesToSort = 50; + private: friend class VersionSet; friend class DBImpl; friend class CompactedDBImpl; friend class ColumnFamilyData; - friend class LevelCompactionPicker; - friend class UniversalCompactionPicker; - friend class FIFOCompactionPicker; friend class ForwardIterator; friend class InternalStats; @@ -332,13 +348,6 @@ class Version { // file that is not yet compacted std::vector next_file_to_compact_by_size_; - // Only the first few entries of files_by_size_ are sorted. 
- // There is no need to sort all the files because it is likely - // that on a running system, we need to look at only the first - // few largest files because a new version is created every few - // seconds/minutes (because of concurrent compactions). - static const size_t number_of_files_to_sort_ = 50; - // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). From 834c67d77f81e892ddf03d942e9d1a2e38a9787b Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 10:03:13 -0700 Subject: [PATCH 317/829] rename FileLevel to LevelFilesBrief / unfriend CompactedDBImpl Summary: We have several different types of data structures for file information. FileLevel is kinda of confusing since it only contains file range and fd. Rename it to LevelFilesBrief to make it clear. Unfriend CompactedDBImpl as a by product Test Plan: make release / make all will run full test with all stacked diffs Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27585 --- db/compaction.cc | 3 +- db/compaction.h | 6 +- db/version_edit.h | 4 +- db/version_set.cc | 75 +++++++++++---------- db/version_set.h | 28 +++++--- db/version_set_test.cc | 22 +++--- utilities/compacted_db/compacted_db_impl.cc | 19 +++--- utilities/compacted_db/compacted_db_impl.h | 2 +- 8 files changed, 87 insertions(+), 72 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index 2802044a4..533fe497e 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -86,7 +86,8 @@ Compaction::~Compaction() { void Compaction::GenerateFileLevels() { input_levels_.resize(num_input_levels()); for (int which = 0; which < num_input_levels(); which++) { - DoGenerateFileLevel(&input_levels_[which], inputs_[which].files, &arena_); + DoGenerateLevelFilesBrief( + &input_levels_[which], inputs_[which].files, &arena_); } } diff --git a/db/compaction.h b/db/compaction.h index 7c490946a..5183822e3 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -89,8 +89,8 @@ class Compaction { return &inputs_[compaction_input_level].files; } - // Returns the FileLevel of the specified compaction input level. - FileLevel* input_levels(int compaction_input_level) { + // Returns the LevelFilesBrief of the specified compaction input level. 
+ LevelFilesBrief* input_levels(int compaction_input_level) { return &input_levels_[compaction_input_level]; } @@ -193,7 +193,7 @@ class Compaction { autovector inputs_; // A copy of inputs_, organized more closely in memory - autovector input_levels_; + autovector input_levels_; // State used to check for number of of overlapping grandparent files // (grandparent == "output_level_ + 1") diff --git a/db/version_edit.h b/db/version_edit.h index ef883297a..fbe7e02d1 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -120,10 +120,10 @@ struct FdWithKeyRange { // Data structure to store an array of FdWithKeyRange in one level // Actual data is guaranteed to be stored closely -struct FileLevel { +struct LevelFilesBrief { size_t num_files; FdWithKeyRange* files; - FileLevel() { + LevelFilesBrief() { num_files = 0; files = nullptr; } diff --git a/db/version_set.cc b/db/version_set.cc index b121424d7..ec4df2823 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -46,10 +46,10 @@ namespace rocksdb { namespace { -// Find File in FileLevel data structure +// Find File in LevelFilesBrief data structure // Within an index range defined by left and right int FindFileInRange(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key, uint32_t left, uint32_t right) { @@ -102,7 +102,7 @@ class FilePicker { std::vector* files, const Slice& user_key, const Slice& ikey, - autovector* file_levels, + autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, @@ -114,7 +114,7 @@ class FilePicker { #ifndef NDEBUG files_(files), #endif - file_levels_(file_levels), + level_files_brief_(file_levels), user_key_(user_key), ikey_(ikey), file_indexer_(file_indexer), @@ -124,8 +124,8 @@ class FilePicker { search_ended_ = !PrepareNextLevel(); if (!search_ended_) { // Prefetch Level 0 table data to avoid cache miss if possible. 
- for (unsigned int i = 0; i < (*file_levels_)[0].num_files; ++i) { - auto* r = (*file_levels_)[0].files[i].fd.table_reader; + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; if (r) { r->Prepare(ikey); } @@ -225,9 +225,9 @@ class FilePicker { #ifndef NDEBUG std::vector* files_; #endif - autovector* file_levels_; + autovector* level_files_brief_; bool search_ended_; - FileLevel* curr_file_level_; + LevelFilesBrief* curr_file_level_; unsigned int curr_index_in_curr_level_; unsigned int start_index_in_curr_level_; Slice user_key_; @@ -244,7 +244,7 @@ class FilePicker { bool PrepareNextLevel() { curr_level_++; while (curr_level_ < num_levels_) { - curr_file_level_ = &(*file_levels_)[curr_level_]; + curr_file_level_ = &(*level_files_brief_)[curr_level_]; if (curr_file_level_->num_files == 0) { // When current level is empty, the search bound generated from upper // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is @@ -331,12 +331,12 @@ Version::~Version() { } int FindFile(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key) { return FindFileInRange(icmp, file_level, key, 0, file_level.num_files); } -void DoGenerateFileLevel(FileLevel* file_level, +void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena) { assert(file_level); @@ -383,7 +383,7 @@ static bool BeforeFile(const Comparator* ucmp, bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, bool disjoint_sorted_files, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice* smallest_user_key, const Slice* largest_user_key) { const Comparator* ucmp = icmp.user_comparator(); @@ -425,7 +425,7 @@ bool SomeFileOverlapsRange( class Version::LevelFileNumIterator : public Iterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, - const FileLevel* flevel) + const LevelFilesBrief* flevel) : icmp_(icmp), flevel_(flevel), index_(flevel->num_files), @@ -468,7 +468,7 @@ class Version::LevelFileNumIterator : public Iterator { virtual Status status() const { return Status::OK(); } private: const InternalKeyComparator icmp_; - const FileLevel* flevel_; + const LevelFilesBrief* flevel_; uint32_t index_; mutable FileDescriptor current_value_; }; @@ -582,7 +582,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { size_t Version::GetMemoryUsageByTableReaders() { size_t total_usage = 0; - for (auto& file_level : file_levels_) { + for (auto& file_level : level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( vset_->env_options_, cfd_->internal_comparator(), @@ -618,8 +618,8 @@ void Version::AddIterators(const ReadOptions& read_options, assert(finalized_); // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; + for (size_t i = 0; i < level_files_brief_[0].num_files; i++) { + const auto& file = level_files_brief_[0].files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, false, merge_iter_builder->GetArena())); @@ -629,14 +629,14 @@ void Version::AddIterators(const ReadOptions& read_options, // walks through the non-overlapping files in the level, opening them // lazily. 
for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { + if (level_files_brief_[level].num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]), merge_iter_builder->GetArena())); + &level_files_brief_[level]), merge_iter_builder->GetArena())); } } } @@ -706,8 +706,9 @@ void Version::Get(const ReadOptions& read_options, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, value, value_found, merge_context); - FilePicker fp(files_, user_key, ikey, &file_levels_, num_non_empty_levels_, - &file_indexer_, user_comparator_, internal_comparator_); + FilePicker fp(files_, user_key, ikey, &level_files_brief_, + num_non_empty_levels_, &file_indexer_, user_comparator_, + internal_comparator_); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { *status = table_cache_->Get(read_options, *internal_comparator_, f->fd, @@ -758,10 +759,11 @@ void Version::Get(const ReadOptions& read_options, } } -void Version::GenerateFileLevels() { - file_levels_.resize(num_non_empty_levels_); +void Version::GenerateLevelFilesBrief() { + level_files_brief_.resize(num_non_empty_levels_); for (int level = 0; level < num_non_empty_levels_; level++) { - DoGenerateFileLevel(&file_levels_[level], files_[level], &arena_); + DoGenerateLevelFilesBrief( + &level_files_brief_[level], files_[level], &arena_); } } @@ -772,7 +774,7 @@ void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, UpdateFilesBySize(); UpdateNumNonEmptyLevels(); file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); - GenerateFileLevels(); + GenerateLevelFilesBrief(); } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { @@ -1046,7 +1048,7 @@ bool Version::OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key) { return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0), - file_levels_[level], smallest_user_key, + level_files_brief_[level], smallest_user_key, largest_user_key); } @@ -1109,8 +1111,8 @@ void Version::GetOverlappingInputs(int level, hint_index, file_index); return; } - for (size_t i = 0; i < file_levels_[level].num_files; ) { - FdWithKeyRange* f = &(file_levels_[level].files[i++]); + for (size_t i = 0; i < level_files_brief_[level].num_files; ) { + FdWithKeyRange* f = &(level_files_brief_[level].files[i++]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { @@ -1166,7 +1168,7 @@ void Version::GetOverlappingInputsBinarySearch( while (!foundOverlap && min <= max) { mid = (min + max)/2; - FdWithKeyRange* f = &(file_levels_[level].files[mid]); + FdWithKeyRange* f = &(level_files_brief_[level].files[mid]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); if (user_cmp->Compare(file_limit, user_begin) < 0) { @@ -1194,7 +1196,7 @@ void Version::GetOverlappingInputsBinarySearch( // The midIndex specifies the index of at least one file that // overlaps the specified range. From that file, iterate backward // and forward to find all overlapping files. 
-// Use FileLevel in searching, make it faster +// Use LevelFilesBrief in searching, make it faster void Version::ExtendOverlappingInputs( int level, const Slice& user_begin, @@ -1203,11 +1205,11 @@ void Version::ExtendOverlappingInputs( unsigned int midIndex) { const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); - const FdWithKeyRange* files = file_levels_[level].files; + const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG { // assert that the file at midIndex overlaps with the range - assert(midIndex < file_levels_[level].num_files); + assert(midIndex < level_files_brief_[level].num_files); const FdWithKeyRange* f = &files[midIndex]; const Slice fstart = ExtractUserKey(f->smallest_key); const Slice flimit = ExtractUserKey(f->largest_key); @@ -1234,7 +1236,8 @@ void Version::ExtendOverlappingInputs( } } // check forward from 'mid+1' to higher indices - for (unsigned int i = midIndex+1; i < file_levels_[level].num_files; i++) { + for (unsigned int i = midIndex+1; + i < level_files_brief_[level].num_files; i++) { const FdWithKeyRange* f = &files[i]; const Slice file_start = ExtractUserKey(f->smallest_key); if (user_cmp->Compare(file_start, user_end) <= 0) { @@ -1268,8 +1271,8 @@ bool Version::HasOverlappingUserKey( } const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); - const FileLevel& file_level = file_levels_[level]; - const FdWithKeyRange* files = file_levels_[level].files; + const LevelFilesBrief& file_level = level_files_brief_[level]; + const FdWithKeyRange* files = level_files_brief_[level].files; const size_t kNumFiles = file_level.num_files; // Check the last file in inputs against the file after it @@ -2799,7 +2802,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { for (int which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { if (c->level(which) == 0) { - const FileLevel* flevel = c->input_levels(which); + const LevelFilesBrief* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( read_options, env_options_compactions_, diff --git a/db/version_set.h b/db/version_set.h index 95e38fca4..9a0c6733b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -58,7 +58,7 @@ class MergeIteratorBuilder; // REQUIRES: "file_level.files" contains a sorted list of // non-overlapping files. 
extern int FindFile(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key); // Returns true iff some file in "files" overlaps the user key range @@ -70,14 +70,14 @@ extern int FindFile(const InternalKeyComparator& icmp, extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, bool disjoint_sorted_files, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice* smallest_user_key, const Slice* largest_user_key); -// Generate FileLevel from vector +// Generate LevelFilesBrief from vector // Would copy smallest_key and largest_key data to sequential memory // arena: Arena used to allocate the memory -extern void DoGenerateFileLevel(FileLevel* file_level, +extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); @@ -105,8 +105,8 @@ class Version { const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted); - // Generate file_levels_ from files_ - void GenerateFileLevels(); + // Generate level_files_brief_ from files_ + void GenerateLevelFilesBrief(); // Update scores, pre-calculated variables. It needs to be called before // applying the version to the version set. @@ -183,6 +183,12 @@ class Version { int NumberLevels() const { return num_levels_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int NumNonEmptyLevels() const { + assert(finalized_); + return num_non_empty_levels_; + } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) int NumLevelFiles(int level) const { assert(finalized_); @@ -263,6 +269,10 @@ class Version { return files_by_size_[level]; } + const LevelFilesBrief& GetLevelFilesBrief(int level) const { + return level_files_brief_[level]; + } + // REQUIRES: lock is held // Set the index that is used to offset into files_by_size_ to find // the next compaction candidate file. @@ -285,7 +295,6 @@ class Version { private: friend class VersionSet; friend class DBImpl; - friend class CompactedDBImpl; friend class ColumnFamilyData; friend class ForwardIterator; friend class InternalStats; @@ -321,7 +330,8 @@ class Version { TableCache* table_cache_; const MergeOperator* merge_operator_; - autovector file_levels_; // A copy of list of files per level + // A short brief metadata of files per level + autovector level_files_brief_; Logger* info_log_; Statistics* db_statistics_; int num_levels_; // Number of levels @@ -329,7 +339,7 @@ class Version { // is guaranteed to be empty. 
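
For orientation, the renamed helpers declared above are used together exactly as the old FileLevel ones were: DoGenerateLevelFilesBrief() flattens a level's FileMetaData into arena-backed FdWithKeyRange entries, and FindFile() binary-searches that flat array. The following is a condensed sketch, not part of the patch, with `level_files`, `icmp` and `target_ikey` assumed to exist (a sorted std::vector<FileMetaData*> for one level, the column family's InternalKeyComparator, and an internal-key-encoded Slice respectively); it mirrors the GenerateLevelFilesBriefTest cases in this diff.

    // Assumed inputs (not defined in the patch):
    //   level_files : sorted std::vector<FileMetaData*> for one level
    //   icmp        : the column family's InternalKeyComparator
    //   target_ikey : internal-key-encoded Slice to look up
    Arena arena;
    LevelFilesBrief brief;
    DoGenerateLevelFilesBrief(&brief, level_files, &arena);

    // FindFile returns the index of the first file whose largest key is
    // not before target_ikey, or brief.num_files if no such file exists.
    int idx = FindFile(icmp, brief, target_ikey);
    if (idx < static_cast<int>(brief.num_files)) {
      const FdWithKeyRange& f = brief.files[idx];
      // f.fd, f.smallest_key and f.largest_key were copied into contiguous
      // arena memory, which is what makes the "brief" form cheap to scan.
    }
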
FileIndexer file_indexer_; VersionSet* vset_; // VersionSet to which this Version belongs - Arena arena_; // Used to allocate space for file_levels_ + Arena arena_; // Used to allocate space for level_files_brief_ Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 402762efa..9920a9e05 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -14,15 +14,15 @@ namespace rocksdb { -class GenerateFileLevelTest { +class GenerateLevelFilesBriefTest { public: std::vector files_; - FileLevel file_level_; + LevelFilesBrief file_level_; Arena arena_; - GenerateFileLevelTest() { } + GenerateLevelFilesBriefTest() { } - ~GenerateFileLevelTest() { + ~GenerateLevelFilesBriefTest() { for (unsigned int i = 0; i < files_.size(); i++) { delete files_[i]; } @@ -49,33 +49,33 @@ class GenerateFileLevelTest { } }; -TEST(GenerateFileLevelTest, Empty) { - DoGenerateFileLevel(&file_level_, files_, &arena_); +TEST(GenerateLevelFilesBriefTest, Empty) { + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(0u, file_level_.num_files); ASSERT_EQ(0, Compare()); } -TEST(GenerateFileLevelTest, Single) { +TEST(GenerateLevelFilesBriefTest, Single) { Add("p", "q"); - DoGenerateFileLevel(&file_level_, files_, &arena_); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(1u, file_level_.num_files); ASSERT_EQ(0, Compare()); } -TEST(GenerateFileLevelTest, Multiple) { +TEST(GenerateLevelFilesBriefTest, Multiple) { Add("150", "200"); Add("200", "250"); Add("300", "350"); Add("400", "450"); - DoGenerateFileLevel(&file_level_, files_, &arena_); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(4u, file_level_.num_files); ASSERT_EQ(0, Compare()); } class FindLevelFileTest { public: - FileLevel file_level_; + LevelFilesBrief file_level_; bool disjoint_sorted_files_; Arena arena_; diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 775033e2a..a253153ae 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -104,27 +104,28 @@ Status CompactedDBImpl::Init(const Options& options) { } version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); + const LevelFilesBrief& l0 = version_->GetLevelFilesBrief(0); // L0 should not have files - if (version_->file_levels_[0].num_files > 1) { + if (l0.num_files > 1) { return Status::NotSupported("L0 contain more than 1 file"); } - if (version_->file_levels_[0].num_files == 1) { - if (version_->num_non_empty_levels_ > 1) { + if (l0.num_files == 1) { + if (version_->NumNonEmptyLevels() > 1) { return Status::NotSupported("Both L0 and other level contain files"); } - files_ = version_->file_levels_[0]; + files_ = l0; return Status::OK(); } - for (int i = 1; i < version_->num_non_empty_levels_ - 1; ++i) { - if (version_->file_levels_[i].num_files > 0) { + for (int i = 1; i < version_->NumNonEmptyLevels() - 1; ++i) { + if (version_->GetLevelFilesBrief(i).num_files > 0) { return Status::NotSupported("Other levels also contain files"); } } - int level = version_->num_non_empty_levels_ - 1; - if (version_->file_levels_[level].num_files > 0) { - files_ = version_->file_levels_[version_->num_non_empty_levels_ - 1]; + int level = version_->NumNonEmptyLevels() - 1; + if (version_->GetLevelFilesBrief(level).num_files > 0) { + files_ = 
version_->GetLevelFilesBrief(level); return Status::OK(); } return Status::NotSupported("no file exists"); diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h index 49aca53b1..e1ac92dc4 100644 --- a/utilities/compacted_db/compacted_db_impl.h +++ b/utilities/compacted_db/compacted_db_impl.h @@ -86,7 +86,7 @@ class CompactedDBImpl : public DBImpl { ColumnFamilyData* cfd_; Version* version_; const Comparator* user_comparator_; - FileLevel files_; + LevelFilesBrief files_; // No copying allowed CompactedDBImpl(const CompactedDBImpl&); From f981e081393a440f086482dd5ef23473abbdbe04 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 10:04:38 -0700 Subject: [PATCH 318/829] unfriend ColumnFamilyData from VersionSet Summary: as title Test Plan: make release will run full test on all stacked diffs before committing Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27591 --- db/column_family.cc | 5 +++-- db/version_set.cc | 3 +++ db/version_set.h | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 0e83e98ab..c47bbb12f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -304,8 +304,9 @@ ColumnFamilyData::~ColumnFamilyData() { if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->next_ == dummy_versions_); - delete dummy_versions_; + assert(dummy_versions_->TEST_Next() == dummy_versions_); + bool deleted __attribute__((unused)) = dummy_versions_->Unref(); + assert(deleted); } if (mem_ != nullptr) { diff --git a/db/version_set.cc b/db/version_set.cc index ec4df2823..eab417a28 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2933,6 +2933,9 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( assert(edit->is_column_family_add_); Version* dummy_versions = new Version(nullptr, this); + // Ref() dummy version once so that later we can call Unref() to delete it + // by avoiding calling "delete" explicitly (~Version is private) + dummy_versions->Ref(); auto new_cfd = column_family_set_->CreateColumnFamily( edit->column_family_name_, edit->column_family_, dummy_versions, cf_options); diff --git a/db/version_set.h b/db/version_set.h index 9a0c6733b..8c20937d3 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -292,10 +292,14 @@ class Version { // seconds/minutes (because of concurrent compactions). static const size_t kNumberFilesToSort = 50; + // Return the next Version in the linked list. 
Used for debug only + Version* TEST_Next() const { + return next_; + } + private: friend class VersionSet; friend class DBImpl; - friend class ColumnFamilyData; friend class ForwardIterator; friend class InternalStats; From eb357af58c972d5ccbb84739f9aab77897a6f817 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 10:08:41 -0700 Subject: [PATCH 319/829] unfriend ForwardIterator from VersionSet Summary: as title Test Plan: make release will run full test on all stacked diffs Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27597 --- db/file_indexer.cc | 6 ++--- db/file_indexer.h | 6 ++--- db/forward_iterator.cc | 50 +++++++++++++++++++++--------------------- db/version_set.h | 7 +++++- 4 files changed, 37 insertions(+), 32 deletions(-) diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 56691bde5..ca2ef9bc8 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -17,17 +17,17 @@ namespace rocksdb { FileIndexer::FileIndexer(const Comparator* ucmp) : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} -uint32_t FileIndexer::NumLevelIndex() { +uint32_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } -uint32_t FileIndexer::LevelIndexSize(uint32_t level) { +uint32_t FileIndexer::LevelIndexSize(uint32_t level) const { return next_level_index_[level].num_index; } void FileIndexer::GetNextLevelIndex( const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound) { + const int cmp_largest, int32_t* left_bound, int32_t* right_bound) const { assert(level > 0); // Last level, no hint diff --git a/db/file_indexer.h b/db/file_indexer.h index 127b3ee46..0c5dea92e 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -42,9 +42,9 @@ class FileIndexer { public: explicit FileIndexer(const Comparator* ucmp); - uint32_t NumLevelIndex(); + uint32_t NumLevelIndex() const; - uint32_t LevelIndexSize(uint32_t level); + uint32_t LevelIndexSize(uint32_t level) const; // Return a file index range in the next level to search for a key based on // smallest and largest key comparision for the current file specified by @@ -52,7 +52,7 @@ class FileIndexer { // be valid and fit in the vector size. void GetNextLevelIndex( const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound); + const int cmp_largest, int32_t* left_bound, int32_t* right_bound) const; void UpdateIndex(Arena* arena, const uint32_t num_levels, std::vector* const files); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index b2e4bd067..cd9299aa4 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -219,15 +219,15 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!seek_to_first) { user_key = ExtractUserKey(internal_key); } - auto* files = sv_->current->files_; - for (uint32_t i = 0; i < files[0].size(); ++i) { + const std::vector& l0 = sv_->current->LevelFiles(0); + for (uint32_t i = 0; i < l0.size(); ++i) { if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { // If the target key passes over the larget key, we are sure Next() // won't go over this file. 
if (user_comparator_->Compare(user_key, - files[0][i]->largest.user_key()) > 0) { + l0[i]->largest.user_key()) > 0) { continue; } l0_iters_[i]->Seek(internal_key); @@ -248,64 +248,63 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, int32_t search_left_bound = 0; int32_t search_right_bound = FileIndexer::kLevelMaxIndex; for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { - if (files[level].empty()) { + const std::vector& level_files = + sv_->current->LevelFiles(level); + if (level_files.empty()) { search_left_bound = 0; search_right_bound = FileIndexer::kLevelMaxIndex; continue; } assert(level_iters_[level - 1] != nullptr); uint32_t f_idx = 0; + const auto& indexer = sv_->current->GetIndexer(); if (!seek_to_first) { - // TODO(ljin): remove before committing - // f_idx = FindFileInRange( - // files[level], internal_key, 0, files[level].size()); - if (search_left_bound == search_right_bound) { f_idx = search_left_bound; } else if (search_left_bound < search_right_bound) { f_idx = FindFileInRange( - files[level], internal_key, search_left_bound, + level_files, internal_key, search_left_bound, search_right_bound == FileIndexer::kLevelMaxIndex ? - files[level].size() : search_right_bound); + level_files.size() : search_right_bound); } else { // search_left_bound > search_right_bound // There are only 2 cases this can happen: // (1) target key is smaller than left most file // (2) target key is larger than right most file - assert(search_left_bound == (int32_t)files[level].size() || + assert(search_left_bound == (int32_t)level_files.size() || search_right_bound == -1); if (search_right_bound == -1) { assert(search_left_bound == 0); f_idx = 0; } else { - sv_->current->file_indexer_.GetNextLevelIndex( - level, files[level].size() - 1, + indexer.GetNextLevelIndex( + level, level_files.size() - 1, 1, 1, &search_left_bound, &search_right_bound); continue; } } // Prepare hints for the next level - if (f_idx < files[level].size()) { + if (f_idx < level_files.size()) { int cmp_smallest = user_comparator_->Compare( - user_key, files[level][f_idx]->smallest.user_key()); + user_key, level_files[f_idx]->smallest.user_key()); int cmp_largest = -1; if (cmp_smallest >= 0) { cmp_smallest = user_comparator_->Compare( - user_key, files[level][f_idx]->smallest.user_key()); + user_key, level_files[f_idx]->smallest.user_key()); } - sv_->current->file_indexer_.GetNextLevelIndex(level, f_idx, + indexer.GetNextLevelIndex(level, f_idx, cmp_smallest, cmp_largest, &search_left_bound, &search_right_bound); } else { - sv_->current->file_indexer_.GetNextLevelIndex( - level, files[level].size() - 1, + indexer.GetNextLevelIndex( + level, level_files.size() - 1, 1, 1, &search_left_bound, &search_right_bound); } } // Seek - if (f_idx < files[level].size()) { + if (f_idx < level_files.size()) { level_iters_[level - 1]->SetFileIndex(f_idx); seek_to_first ? 
level_iters_[level - 1]->SeekToFirst() : level_iters_[level - 1]->Seek(internal_key); @@ -428,7 +427,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); - const auto& l0_files = sv_->current->files_[0]; + const auto& l0_files = sv_->current->LevelFiles(0); l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( @@ -436,11 +435,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } level_iters_.reserve(sv_->current->NumberLevels() - 1); for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { - if (sv_->current->files_[level].empty()) { + const auto& level_files = sv_->current->LevelFiles(level); + if (level_files.empty()) { level_iters_.push_back(nullptr); } else { - level_iters_.push_back(new LevelIterator(cfd_, read_options_, - sv_->current->files_[level])); + level_iters_.push_back( + new LevelIterator(cfd_, read_options_, level_files)); } } @@ -449,7 +449,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } void ForwardIterator::ResetIncompleteIterators() { - const auto& l0_files = sv_->current->files_[0]; + const auto& l0_files = sv_->current->LevelFiles(0); for (uint32_t i = 0; i < l0_iters_.size(); ++i) { assert(i < l0_files.size()); if (!l0_iters_[i]->status().IsIncomplete()) { diff --git a/db/version_set.h b/db/version_set.h index 8c20937d3..ee3bbcbf2 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -297,10 +297,15 @@ class Version { return next_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const FileIndexer& GetIndexer() const { + assert(finalized_); + return file_indexer_; + } + private: friend class VersionSet; friend class DBImpl; - friend class ForwardIterator; friend class InternalStats; class LevelFileNumIterator; From 7e12ae5a211977857cd17da7206414a979c3d020 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 26 Oct 2014 23:23:06 +0100 Subject: [PATCH 320/829] [RocksJava] - BackupInfos & Restore-/BackupableDB enhancements Summary: - BackupableDB deleteBackup method - BackupableDB purgeOldBackups bugfix - BackupInfos now available in Restorable-/BackupableDB - Extended BackupableDBTest to cover more of the currently implemented functionality. Test Plan: make rocksdbjava make jtest Differential Revision: https://reviews.facebook.net/D27027 --- java/org/rocksdb/BackupInfo.java | 67 +++++++++++++++++++++ java/org/rocksdb/BackupableDB.java | 38 ++++++++++-- java/org/rocksdb/RestoreBackupableDB.java | 27 +++++++-- java/org/rocksdb/test/BackupableDBTest.java | 64 +++++++++++++++++++- java/rocksjni/backupablejni.cc | 32 +++++++++- java/rocksjni/comparatorjnicallback.cc | 1 - java/rocksjni/portal.h | 45 ++++++++++++++ java/rocksjni/restorejni.cc | 14 +++++ 8 files changed, 277 insertions(+), 11 deletions(-) create mode 100644 java/org/rocksdb/BackupInfo.java diff --git a/java/org/rocksdb/BackupInfo.java b/java/org/rocksdb/BackupInfo.java new file mode 100644 index 000000000..407445473 --- /dev/null +++ b/java/org/rocksdb/BackupInfo.java @@ -0,0 +1,67 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +/** + * Instances of this class describe a Backup made by + * {@link org.rocksdb.BackupableDB}. + */ +public class BackupInfo { + + /** + * Package private constructor used to create instances + * of BackupInfo by {@link org.rocksdb.BackupableDB} and + * {@link org.rocksdb.RestoreBackupableDB}. + * + * @param backupId id of backup + * @param timestamp timestamp of backup + * @param size size of backup + * @param numberFiles number of files related to this backup. + */ + BackupInfo(int backupId, long timestamp, long size, + int numberFiles) { + backupId_ = backupId; + timestamp_ = timestamp; + size_ = size; + numberFiles_ = numberFiles; + } + + /** + * + * @return the backup id. + */ + public int backupId() { + return backupId_; + } + + /** + * + * @return the timestamp of the backup. + */ + public long timestamp() { + return timestamp_; + } + + /** + * + * @return the size of the backup + */ + public long size() { + return size_; + } + + /** + * + * @return the number of files of this backup. + */ + public int numberFiles() { + return numberFiles_; + } + + private int backupId_; + private long timestamp_; + private long size_; + private int numberFiles_; +} diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 24e1c2e33..0c1ef328f 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * A subclass of RocksDB which supports backup-related operations. * @@ -43,8 +45,10 @@ public class BackupableDB extends RocksDB { * * @param flushBeforeBackup if true, then all data will be flushed * before creating backup. + * @throws org.rocksdb.RocksDBException */ - public void createNewBackup(boolean flushBeforeBackup) { + public void createNewBackup(boolean flushBeforeBackup) + throws RocksDBException { createNewBackup(nativeHandle_, flushBeforeBackup); } @@ -52,11 +56,32 @@ public class BackupableDB extends RocksDB { * Deletes old backups, keeping latest numBackupsToKeep alive. * * @param numBackupsToKeep Number of latest backups to keep. + * @throws org.rocksdb.RocksDBException */ - public void purgeOldBackups(int numBackupsToKeep) { + public void purgeOldBackups(int numBackupsToKeep) + throws RocksDBException { purgeOldBackups(nativeHandle_, numBackupsToKeep); } + /** + * Deletes a specific backup. + * + * @param backupId of backup to delete. + * @throws org.rocksdb.RocksDBException + */ + public void deleteBackup(long backupId) throws RocksDBException { + deleteBackup0(nativeHandle_, backupId); + } + + /** + * Returns a list of {@link BackupInfo} instances, which describe + * already made backups. + * + * @return List of {@link BackupInfo} instances. + */ + public List getBackupInfos() { + return getBackupInfo(nativeHandle_); + } /** * Close the BackupableDB instance and release resource. 
@@ -85,6 +110,11 @@ public class BackupableDB extends RocksDB { } protected native void open(long rocksDBHandle, long backupDBOptionsHandle); - protected native void createNewBackup(long handle, boolean flag); - protected native void purgeOldBackups(long handle, int numBackupsToKeep); + protected native void createNewBackup(long handle, boolean flag) + throws RocksDBException; + protected native void purgeOldBackups(long handle, int numBackupsToKeep) + throws RocksDBException; + private native void deleteBackup0(long nativeHandle, long backupId) + throws RocksDBException; + protected native List getBackupInfo(long handle); } diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index 5bc8dfbec..7b2296898 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * This class is used to access information about backups and restore from them. * @@ -65,6 +67,7 @@ public class RestoreBackupableDB extends RocksObject { * Deletes old backups, keeping latest numBackupsToKeep alive. * * @param numBackupsToKeep of latest backups to keep + * @throws org.rocksdb.RocksDBException */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { purgeOldBackups0(nativeHandle_, numBackupsToKeep); @@ -74,11 +77,22 @@ public class RestoreBackupableDB extends RocksObject { * Deletes a specific backup. * * @param backupId of backup to delete. + * @throws org.rocksdb.RocksDBException */ public void deleteBackup(long backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); } + /** + * Returns a list of {@link BackupInfo} instances, which describe + * already made backups. + * + * @return List of {@link BackupInfo} instances. + */ + public List getBackupInfos() { + return getBackupInfo(nativeHandle_); + } + /** * Release the memory allocated for the current instance * in the c++ side. 
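
Under the hood these Java methods delegate through JNI to the C++ BackupableDB / RestoreBackupableDB calls shown further down in backupablejni.cc and restorejni.cc. A minimal C++ sketch of the same lifecycle follows; it is not part of the patch, the include path and the BackupableDB/BackupableDBOptions constructor shapes are assumptions for this tree, and Status checks are elided.

    #include <vector>
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/backupable_db.h"  // header location assumed

    rocksdb::DB* db = nullptr;
    rocksdb::Options options;
    options.create_if_missing = true;
    rocksdb::DB::Open(options, "/tmp/backup_example_db", &db);

    // Wrap the DB so its contents can be backed up to a separate directory.
    rocksdb::BackupableDB* bdb = new rocksdb::BackupableDB(
        db, rocksdb::BackupableDBOptions("/tmp/backup_example_backups"));

    bdb->CreateNewBackup(true /* flush before backup */);

    std::vector<rocksdb::BackupInfo> infos;
    bdb->GetBackupInfo(&infos);  // what getBackupInfos() surfaces in Java
    for (const auto& info : infos) {
      // info.backup_id, info.timestamp, info.size, info.number_files
      // are the four fields BackupInfoJni marshals into org.rocksdb.BackupInfo.
    }

    if (!infos.empty()) {
      bdb->DeleteBackup(infos.front().backup_id);  // deleteBackup(int) in Java
    }
    bdb->PurgeOldBackups(2);  // keep only the two most recent backups
    delete bdb;               // release the backup wrapper
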
@@ -90,10 +104,15 @@ public class RestoreBackupableDB extends RocksObject { private native long newRestoreBackupableDB(long options); private native void restoreDBFromBackup0(long nativeHandle, long backupId, - String dbDir, String walDir, long restoreOptions) throws RocksDBException; + String dbDir, String walDir, long restoreOptions) + throws RocksDBException; private native void restoreDBFromLatestBackup0(long nativeHandle, - String dbDir, String walDir, long restoreOptions) throws RocksDBException; - private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep); - private native void deleteBackup0(long nativeHandle, long backupId); + String dbDir, String walDir, long restoreOptions) + throws RocksDBException; + private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep) + throws RocksDBException; + private native void deleteBackup0(long nativeHandle, long backupId) + throws RocksDBException; + protected native List getBackupInfo(long handle); private native void dispose(long nativeHandle); } diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index ee4509697..2115e9ca9 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -7,6 +7,8 @@ package org.rocksdb.test; import org.rocksdb.*; +import java.util.List; + public class BackupableDBTest { static final String db_path = "/tmp/rocksdbjni_backupable_db_test"; static final String backup_path = "/tmp/rocksdbjni_backupable_db_backup_test"; @@ -21,14 +23,34 @@ public class BackupableDBTest { BackupableDBOptions bopt = new BackupableDBOptions(backup_path, false, true, false, true, 0, 0); BackupableDB bdb = null; - + List backupInfos; + List restoreInfos; try { bdb = BackupableDB.open(opt, bopt, db_path); bdb.put("abc".getBytes(), "def".getBytes()); bdb.put("ghi".getBytes(), "jkl".getBytes()); + + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 0); + bdb.createNewBackup(true); + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 1); + + // Retrieving backup infos twice shall not + // lead to different results + List tmpBackupInfo = bdb.getBackupInfos(); + assert(tmpBackupInfo.get(0).backupId() == + backupInfos.get(0).backupId()); + assert(tmpBackupInfo.get(0).timestamp() == + backupInfos.get(0).timestamp()); + assert(tmpBackupInfo.get(0).size() == + backupInfos.get(0).size()); + assert(tmpBackupInfo.get(0).numberFiles() == + backupInfos.get(0).numberFiles()); + // delete record after backup bdb.remove("abc".getBytes()); byte[] value = bdb.get("abc".getBytes()); @@ -38,8 +60,26 @@ public class BackupableDBTest { // restore from backup RestoreOptions ropt = new RestoreOptions(false); RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); + + // getting backup infos from restorable db should + // lead to the same infos as from backupable db + restoreInfos = rdb.getBackupInfos(); + assert(restoreInfos.size() == backupInfos.size()); + assert(restoreInfos.get(0).backupId() == + backupInfos.get(0).backupId()); + assert(restoreInfos.get(0).timestamp() == + backupInfos.get(0).timestamp()); + assert(restoreInfos.get(0).size() == + backupInfos.get(0).size()); + assert(restoreInfos.get(0).numberFiles() == + backupInfos.get(0).numberFiles()); + rdb.restoreDBFromLatestBackup(db_path, db_path, ropt); + // do nothing because there is only one backup + rdb.purgeOldBackups(1); + restoreInfos = rdb.getBackupInfos(); + assert(restoreInfos.size() == 1); rdb.dispose(); ropt.dispose(); @@ 
-48,6 +88,28 @@ public class BackupableDBTest { value = bdb.get("abc".getBytes()); assert(new String(value).equals("def")); + bdb.createNewBackup(false); + // after new backup there must be two backup infos + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 2); + // deleting the backup must be possible using the + // id provided by backup infos + bdb.deleteBackup(backupInfos.get(1).backupId()); + // after deletion there should only be one info + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 1); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 4); + // purge everything and keep two + bdb.purgeOldBackups(2); + // backup infos need to be two + backupInfos = bdb.getBackupInfos(); + assert(backupInfos.size() == 2); + assert(backupInfos.get(0).backupId() == 4); + assert(backupInfos.get(1).backupId() == 5); System.out.println("Backup and restore test passed"); } catch (RocksDBException e) { System.err.format("[ERROR]: %s%n", e); diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 2aa1d0b1d..a5107af57 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "include/org_rocksdb_BackupableDB.h" #include "include/org_rocksdb_BackupableDBOptions.h" @@ -53,7 +54,7 @@ void Java_org_rocksdb_BackupableDB_createNewBackup( * Signature: (JI)V */ void Java_org_rocksdb_BackupableDB_purgeOldBackups( - JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jnumBackupsToKeep) { + JNIEnv* env, jobject jbdb, jlong jhandle, jint jnumBackupsToKeep) { rocksdb::Status s = reinterpret_cast(jhandle)-> PurgeOldBackups(jnumBackupsToKeep); @@ -62,6 +63,35 @@ void Java_org_rocksdb_BackupableDB_purgeOldBackups( } } +/* + * Class: org_rocksdb_BackupableDB + * Method: deleteBackup0 + * Signature: (JJ)V + */ +void Java_org_rocksdb_BackupableDB_deleteBackup0(JNIEnv* env, + jobject jobj, jlong jhandle, jlong jbackup_id) { + auto rdb = reinterpret_cast(jhandle); + rocksdb::Status s = rdb->DeleteBackup(jbackup_id); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_BackupableDB + * Method: getBackupInfo + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_BackupableDB_getBackupInfo( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_infos; + reinterpret_cast(jhandle)-> + GetBackupInfo(&backup_infos); + return rocksdb::BackupInfoListJni::getBackupInfo(env, + backup_infos); +} + /////////////////////////////////////////////////////////////////////////// // BackupDBOptions diff --git a/java/rocksjni/comparatorjnicallback.cc b/java/rocksjni/comparatorjnicallback.cc index 1be363541..fcae16c7c 100644 --- a/java/rocksjni/comparatorjnicallback.cc +++ b/java/rocksjni/comparatorjnicallback.cc @@ -15,7 +15,6 @@ BaseComparatorJniCallback::BaseComparatorJniCallback( const ComparatorJniCallbackOptions* copt) : mtx_compare(new port::Mutex(copt->use_adaptive_mutex)), mtx_findShortestSeparator(new port::Mutex(copt->use_adaptive_mutex)) { - // Note: Comparator methods may be accessed by multiple threads, // so we ref the jvm not the env const jint rs = env->GetJavaVM(&m_jvm); diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 32452ae0b..5a56fe639 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "rocksdb/db.h" 
#include "rocksdb/filter_policy.h" @@ -591,6 +592,50 @@ class ListJni { } }; +class BackupInfoJni { + public: + // Get the java class id of org.rocksdb.BackupInfo. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/BackupInfo"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp, + uint64_t size, uint32_t number_files) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", + "(IJJI)V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid, + backup_id, timestamp, size, number_files); + } +}; + +class BackupInfoListJni { + public: + static jobject getBackupInfo(JNIEnv* env, + std::vector backup_infos) { + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jbackup_info_handle_list = env->NewObject(jclazz, mid, + backup_infos.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != backup_infos.size(); i++) { + rocksdb::BackupInfo backup_info = backup_infos[i]; + jobject obj = rocksdb::BackupInfoJni::construct0(env, + backup_info.backup_id, + backup_info.timestamp, + backup_info.size, + backup_info.number_files); + env->CallBooleanMethod(jbackup_info_handle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + return jbackup_info_handle_list; + } +}; + class JniUtil { public: /** diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index bd1734010..2e833d3be 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -131,6 +131,20 @@ void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env, } } +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: getBackupInfo + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_RestoreBackupableDB_getBackupInfo( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_infos; + reinterpret_cast(jhandle)-> + GetBackupInfo(&backup_infos); + return rocksdb::BackupInfoListJni::getBackupInfo(env, + backup_infos); +} + /* * Class: org_rocksdb_RestoreBackupableDB * Method: dispose From f7c9730696025e88b71b0ca88e81b9b1ce736995 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 18:42:30 +0100 Subject: [PATCH 321/829] [RocksJava] Integrated review comments Parameter types for BackupID are now aligned to int. --- java/org/rocksdb/BackupableDB.java | 4 ++-- java/org/rocksdb/RestoreBackupableDB.java | 4 ++-- java/rocksjni/backupablejni.cc | 4 ++-- java/rocksjni/restorejni.cc | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 0c1ef328f..7fa37abab 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -69,7 +69,7 @@ public class BackupableDB extends RocksDB { * @param backupId of backup to delete. 
* @throws org.rocksdb.RocksDBException */ - public void deleteBackup(long backupId) throws RocksDBException { + public void deleteBackup(int backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); } @@ -114,7 +114,7 @@ public class BackupableDB extends RocksDB { throws RocksDBException; protected native void purgeOldBackups(long handle, int numBackupsToKeep) throws RocksDBException; - private native void deleteBackup0(long nativeHandle, long backupId) + private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; protected native List getBackupInfo(long handle); } diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index 7b2296898..207383e43 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -79,7 +79,7 @@ public class RestoreBackupableDB extends RocksObject { * @param backupId of backup to delete. * @throws org.rocksdb.RocksDBException */ - public void deleteBackup(long backupId) throws RocksDBException { + public void deleteBackup(int backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); } @@ -111,7 +111,7 @@ public class RestoreBackupableDB extends RocksObject { throws RocksDBException; private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep) throws RocksDBException; - private native void deleteBackup0(long nativeHandle, long backupId) + private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; protected native List getBackupInfo(long handle); private native void dispose(long nativeHandle); diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index a5107af57..41390c5bc 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -66,10 +66,10 @@ void Java_org_rocksdb_BackupableDB_purgeOldBackups( /* * Class: org_rocksdb_BackupableDB * Method: deleteBackup0 - * Signature: (JJ)V + * Signature: (JI)V */ void Java_org_rocksdb_BackupableDB_deleteBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jlong jbackup_id) { + jobject jobj, jlong jhandle, jint jbackup_id) { auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->DeleteBackup(jbackup_id); diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 2e833d3be..a180dec1b 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -119,10 +119,10 @@ void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env, /* * Class: org_rocksdb_RestoreBackupableDB * Method: deleteBackup0 - * Signature: (JJ)V + * Signature: (JI)V */ void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jlong jbackup_id) { + jobject jobj, jlong jhandle, jint jbackup_id) { auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->DeleteBackup(jbackup_id); From efa2fb33b0b89a46f07c2c9a5fe430f60840d066 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 11:42:22 -0700 Subject: [PATCH 322/829] make LevelFileNumIterator and LevelFileIteratorState anonymous Summary: No need to expose them in .h Test Plan: make release Reviewers: igor, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27645 --- db/version_set.cc | 14 +++++++++----- db/version_set.h | 3 --- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index eab417a28..b47578a4a 100644 --- a/db/version_set.cc +++ 
b/db/version_set.cc @@ -417,12 +417,14 @@ bool SomeFileOverlapsRange( return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]); } +namespace { + // An internal iterator. For a given version/level pair, yields // information about the files in the level. For a given entry, key() // is the largest key that occurs in the file, and value() is an // 16-byte value containing the file number and file size, both // encoded using EncodeFixed64. -class Version::LevelFileNumIterator : public Iterator { +class LevelFileNumIterator : public Iterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, const LevelFilesBrief* flevel) @@ -473,7 +475,7 @@ class Version::LevelFileNumIterator : public Iterator { mutable FileDescriptor current_value_; }; -class Version::LevelFileIteratorState : public TwoLevelIteratorState { +class LevelFileIteratorState : public TwoLevelIteratorState { public: LevelFileIteratorState(TableCache* table_cache, const ReadOptions& read_options, const EnvOptions& env_options, @@ -509,6 +511,8 @@ class Version::LevelFileIteratorState : public TwoLevelIteratorState { bool for_compaction_; }; +} // anonymous namespace + Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) { @@ -2811,12 +2815,12 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } } else { // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( + list[num++] = NewTwoLevelIterator(new LevelFileIteratorState( cfd->table_cache(), read_options, env_options_, cfd->internal_comparator(), true /* for_compaction */, false /* prefix enabled */), - new Version::LevelFileNumIterator(cfd->internal_comparator(), - c->input_levels(which))); + new LevelFileNumIterator(cfd->internal_comparator(), + c->input_levels(which))); } } } diff --git a/db/version_set.h b/db/version_set.h index ee3bbcbf2..5a11a2f1c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -308,9 +308,6 @@ class Version { friend class DBImpl; friend class InternalStats; - class LevelFileNumIterator; - class LevelFileIteratorState; - bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, const Slice& internal_prefix) const; From a39e931e50f881abe01fd77dd35ec7ede12f44c3 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 28 Oct 2014 11:54:33 -0700 Subject: [PATCH 323/829] FlushProcess Summary: Abstract out FlushProcess and take it out of DBImpl. This also includes taking DeletionState outside of DBImpl. Currently this diff is only doing the refactoring. Future work includes: 1. Decoupling flush_process.cc, make it depend on less state 2. 
Write flush_process_test, which will mock out everything that FlushProcess depends on and test it in isolation Test Plan: make check Reviewers: rven, yhchiang, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27561 --- Makefile | 6 +- db/column_family.cc | 9 +- db/db_filesnapshot.cc | 6 +- db/db_impl.cc | 410 ++++++++++++----------------------------- db/db_impl.h | 130 +++---------- db/flush_job.cc | 223 ++++++++++++++++++++++ db/flush_job.h | 86 +++++++++ db/flush_job_test.cc | 113 ++++++++++++ db/forward_iterator.cc | 9 +- db/job_context.h | 87 +++++++++ db/memtable_list.cc | 31 ++-- db/memtable_list.h | 8 +- port/stack_trace.cc | 2 +- 13 files changed, 689 insertions(+), 431 deletions(-) create mode 100644 db/flush_job.cc create mode 100644 db/flush_job.h create mode 100644 db/flush_job_test.cc create mode 100644 db/job_context.h diff --git a/Makefile b/Makefile index d9a8feffa..12dbba153 100644 --- a/Makefile +++ b/Makefile @@ -143,7 +143,8 @@ TESTS = \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ cuckoo_table_db_test \ - write_batch_with_index_test + write_batch_with_index_test \ + flush_job_test TOOLS = \ sst_dump \ @@ -412,6 +413,9 @@ ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/column_family.cc b/db/column_family.cc index c47bbb12f..0127d10ad 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -20,6 +20,7 @@ #include #include "db/db_impl.h" +#include "db/job_context.h" #include "db/version_set.h" #include "db/internal_stats.h" #include "db/compaction_picker.h" @@ -71,15 +72,15 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { if (cfd_ != nullptr) { - DBImpl::DeletionState deletion_state; + JobContext job_context; mutex_->Lock(); if (cfd_->Unref()) { delete cfd_; } - db_->FindObsoleteFiles(deletion_state, false, true); + db_->FindObsoleteFiles(&job_context, false, true); mutex_->Unlock(); - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(job_context); } } } diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 9f05b8d30..89fe9c983 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -42,7 +42,7 @@ Status DBImpl::DisableFileDeletions() { } Status DBImpl::EnableFileDeletions(bool force) { - DeletionState deletion_state; + JobContext job_context; bool should_purge_files = false; { MutexLock l(&mutex_); @@ -55,7 +55,7 @@ Status DBImpl::EnableFileDeletions(bool force) { if (disable_delete_obsolete_files_ == 0) { Log(db_options_.info_log, "File Deletions Enabled"); should_purge_files = true; - FindObsoleteFiles(deletion_state, true); + FindObsoleteFiles(&job_context, true); } else { Log(db_options_.info_log, "File Deletions Enable, but not really enabled. 
Counter: %d", @@ -63,7 +63,7 @@ Status DBImpl::EnableFileDeletions(bool force) { } } if (should_purge_files) { - PurgeObsoleteFiles(deletion_state); + PurgeObsoleteFiles(job_context); } LogFlush(db_options_.info_log); return Status::OK(); diff --git a/db/db_impl.cc b/db/db_impl.cc index dc5fc2394..c53a4bd92 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -27,6 +27,7 @@ #include #include "db/builder.h" +#include "db/flush_job.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/filename.h" @@ -412,12 +413,12 @@ DBImpl::~DBImpl() { // result, all "live" files can get deleted by accident. However, corrupted // manifest is recoverable by RepairDB(). if (opened_successfully_) { - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); + JobContext job_context; + FindObsoleteFiles(&job_context, true); // manifest number starting from 2 - deletion_state.manifest_file_number = 1; - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } } @@ -531,8 +532,7 @@ void DBImpl::MaybeDumpStats() { // force = false -- don't force the full scan, except every // db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan -void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, - bool force, +void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan) { mutex_.AssertHeld(); @@ -558,16 +558,16 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, } // get obsolete files - versions_->GetObsoleteFiles(&deletion_state.sst_delete_files); + versions_->GetObsoleteFiles(&job_context->sst_delete_files); // store the current filenum, lognum, etc - deletion_state.manifest_file_number = versions_->ManifestFileNumber(); - deletion_state.pending_manifest_file_number = + job_context->manifest_file_number = versions_->ManifestFileNumber(); + job_context->pending_manifest_file_number = versions_->PendingManifestFileNumber(); - deletion_state.log_number = versions_->MinLogNumber(); - deletion_state.prev_log_number = versions_->PrevLogNumber(); + job_context->log_number = versions_->MinLogNumber(); + job_context->prev_log_number = versions_->PrevLogNumber(); - if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) { + if (!doing_the_full_scan && !job_context->HaveSomethingToDelete()) { // avoid filling up sst_live if we're sure that we // are not going to do the full scan and that we don't have // anything to delete at the moment @@ -576,11 +576,9 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // don't delete live files for (auto pair : pending_outputs_) { - deletion_state.sst_live.emplace_back(pair.first, pair.second, 0); + job_context->sst_live.emplace_back(pair.first, pair.second, 0); } - /* deletion_state.sst_live.insert(pending_outputs_.begin(), - pending_outputs_.end());*/ - versions_->AddLiveFiles(&deletion_state.sst_live); + versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { for (uint32_t path_id = 0; @@ -592,7 +590,7 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, &files); // Ignore errors for (std::string file : files) { // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes - deletion_state.candidate_files.emplace_back("/" + file, path_id); + job_context->candidate_files.emplace_back("/" + file, path_id); } } @@ -601,7 +599,7 @@ void DBImpl::FindObsoleteFiles(DeletionState& 
deletion_state, std::vector log_files; env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { - deletion_state.candidate_files.emplace_back(log_file, 0); + job_context->candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir @@ -610,15 +608,15 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // Ignore errors env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { - deletion_state.candidate_files.emplace_back(log_file, 0); + job_context->candidate_files.emplace_back(log_file, 0); } } } } namespace { -bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, - const rocksdb::DBImpl::CandidateFileInfo& second) { +bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, + const JobContext::CandidateFileInfo& second) { if (first.file_name > second.file_name) { return true; } else if (first.file_name < second.file_name) { @@ -633,7 +631,7 @@ bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, // belong to live files are posibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. -void DBImpl::PurgeObsoleteFiles(DeletionState& state) { +void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // we'd better have sth to delete assert(state.HaveSomethingToDelete()); @@ -647,15 +645,14 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { // Now, convert live list to an unordered map, WITHOUT mutex held; // set is slow. std::unordered_map sst_live_map; - for (FileDescriptor& fd : state.sst_live) { + for (const FileDescriptor& fd : state.sst_live) { sst_live_map[fd.GetNumber()] = &fd; } - auto& candidate_files = state.candidate_files; - candidate_files.reserve( - candidate_files.size() + - state.sst_delete_files.size() + - state.log_delete_files.size()); + auto candidate_files = state.candidate_files; + candidate_files.reserve(candidate_files.size() + + state.sst_delete_files.size() + + state.log_delete_files.size()); // We may ignore the dbname when generating the file names. const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { @@ -784,10 +781,10 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { void DBImpl::DeleteObsoleteFiles() { mutex_.AssertHeld(); - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + JobContext job_context; + FindObsoleteFiles(&job_context, true); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } } @@ -1480,159 +1477,23 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, return s; } -Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, - const autovector& mems, - VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - *filenumber = meta.fd.GetNumber(); - pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
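Across this diff the same calling convention keeps reappearing: fill a JobContext while the DB mutex is held, release the mutex, and only then do the slow file deletion. The sketch below is a simplified, stand-alone restatement of that pattern; MiniDB, the std::mutex and the vector of names are illustrative stand-ins for the real DBImpl members, not the actual API.

#include <mutex>
#include <string>
#include <vector>

// Simplified stand-in for the JobContext introduced in db/job_context.h.
struct MiniJobContext {
  std::vector<std::string> candidate_files;  // files considered for deletion
  bool HaveSomethingToDelete() const { return !candidate_files.empty(); }
};

struct MiniDB {
  std::mutex mutex_;

  // Cheap bookkeeping, called with mutex_ held; does no file I/O.
  void FindObsoleteFiles(MiniJobContext* job_context, bool /*force*/) {
    job_context->candidate_files.push_back("/000007.sst");
  }

  // Slow part: actually deletes files; safe to run without the lock.
  void PurgeObsoleteFiles(const MiniJobContext& job_context) {
    for (const auto& f : job_context.candidate_files) {
      (void)f;  // env_->DeleteFile(...) in the real code
    }
  }

  void DeleteObsoleteFilesExample() {
    MiniJobContext job_context;
    {
      std::lock_guard<std::mutex> l(mutex_);
      FindObsoleteFiles(&job_context, /*force=*/true);
    }  // the mutex is dropped before any deletion happens
    if (job_context.HaveSomethingToDelete()) {
      PurgeObsoleteFiles(job_context);
    }
  }
};

ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl, EnableFileDeletions, DeleteObsoleteFiles and the background callbacks in this patch all follow this same shape.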
- - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mems[0]->GetFirstSequenceNumber(); - Version* base = cfd->current(); - base->Ref(); // it is likely that we do not need this reference - Status s; - { - mutex_.Unlock(); - log_buffer->FlushBufferToLog(); - std::vector memtables; - ReadOptions ro; - ro.total_order_seek = true; - Arena arena; - for (MemTable* m : mems) { - Log(db_options_.info_log, - "[%s] Flushing memtable with next log file: %" PRIu64 "\n", - cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro, &arena)); - } - { - ScopedArenaIterator iter(NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], - memtables.size(), &arena)); - Log(db_options_.info_log, - "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - s = BuildTable( - dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), - iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, - earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), - cfd->ioptions()->compression_opts, Env::IO_HIGH); - LogFlush(db_options_.info_log); - } - Log(db_options_.info_log, - "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - - if (!db_options_.disableDataSync) { - db_directory_->Fsync(); - } - mutex_.Lock(); - } - base->Unref(); - - // re-acquire the most current version - base = cfd->current(); - - // There could be multiple threads writing to its own level-0 file. - // The pending_outputs cannot be cleared here, otherwise this newly - // created file might not be considered as a live-file by another - // compaction thread that is concurrently deleting obselete files. - // The pending_outputs can be cleared only after the new version is - // committed so that other threads can recognize this file as a - // valid one. - // pending_outputs_.erase(meta.number); - - // Note that if file_size is zero, the file has been deleted and - // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.fd.GetFileSize() > 0) { - const Slice min_user_key = meta.smallest.user_key(); - const Slice max_user_key = meta.largest.user_key(); - // if we have more than 1 background thread, then we cannot - // insert files directly into higher levels because some other - // threads could be concurrently producing compacted files for - // that key range. 
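The comment just above states the only case in which a freshly flushed memtable may be placed below level 0. Restated as a small helper so the condition is easier to see than in diff form; PickFlushOutputLevel and picked_level_from_version are invented names for illustration, the condition itself mirrors this hunk.

// Decide the target level for a flushed memtable. Level 0 is the default;
// a deeper level is only safe when no other background job can be writing
// into the same key range concurrently.
int PickFlushOutputLevel(bool have_base_version,
                         int max_background_compactions,
                         int max_background_flushes,
                         bool level_style_compaction,
                         int picked_level_from_version) {
  int level = 0;
  if (have_base_version && max_background_compactions <= 1 &&
      max_background_flushes == 0 && level_style_compaction) {
    level = picked_level_from_version;  // Version::PickLevelForMemTableOutput
  }
  return level;
}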
- if (base != nullptr && db_options_.max_background_compactions <= 1 && - db_options_.max_background_flushes == 0 && - cfd->ioptions()->compaction_style == kCompactionStyleLevel) { - level = base->PickLevelForMemTableOutput( - mutable_cf_options, min_user_key, max_user_key); - } - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno); - } - - InternalStats::CompactionStats stats(1); - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.fd.GetFileSize(); - cfd->internal_stats()->AddCompactionStats(level, stats); - cfd->internal_stats()->AddCFStats( - InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); - RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); - return s; -} - Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer) { + bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); assert(cfd->imm()->size() != 0); assert(cfd->imm()->IsFlushPending()); - // Save the contents of the earliest memtable as a new Table - uint64_t file_number; - autovector mems; - cfd->imm()->PickMemtablesToFlush(&mems); - if (mems.empty()) { - LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush", - cfd->GetName().c_str()); - return Status::OK(); - } + FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options, + env_options_, versions_.get(), &mutex_, &shutting_down_, + &pending_outputs_, snapshots_.GetNewest(), job_context, + log_buffer, db_directory_.get(), + GetCompressionFlush(*cfd->ioptions()), stats_); - // record the logfile_number_ before we release the mutex - // entries mems are (implicitly) sorted in ascending order by their created - // time. We will use the first memtable's `edit` to keep the meta info for - // this flush. - MemTable* m = mems[0]; - VersionEdit* edit = m->GetEdits(); - edit->SetPrevLogNumber(0); - // SetLogNumber(log_num) indicates logs with number smaller than log_num - // will no longer be picked up for recovery. - edit->SetLogNumber(mems.back()->GetNextLogNumber()); - edit->SetColumnFamily(cfd->GetID()); - - - // This will release and re-acquire the mutex. 
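To make the new control flow easier to follow than the interleaved removals in this hunk: after the patch, FlushMemTableToOutputFile mostly just constructs a FlushJob and runs it with the DB mutex held. The sketch below condenses that caller; the function name FlushOneColumnFamilyExample is invented, but every call it makes appears verbatim in this diff, and it assumes it lives inside DBImpl where these members exist.

// Condensed FlushJob caller, as a sketch of the new flow in this patch.
Status DBImpl::FlushOneColumnFamilyExample(
    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
    JobContext* job_context, LogBuffer* log_buffer) {
  mutex_.AssertHeld();  // FlushJob::Run() releases and re-acquires mutex_
  FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options,
                     env_options_, versions_.get(), &mutex_, &shutting_down_,
                     &pending_outputs_, snapshots_.GetNewest(), job_context,
                     log_buffer, db_directory_.get(),
                     GetCompressionFlush(*cfd->ioptions()), stats_);
  Status s = flush_job.Run();
  if (s.ok()) {
    InstallSuperVersionBackground(cfd, job_context, mutable_cf_options);
  }
  return s;
}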
- Status s = WriteLevel0Table(cfd, mutable_cf_options, mems, edit, - &file_number, log_buffer); - - if (s.ok() && - (shutting_down_.load(std::memory_order_acquire) || cfd->IsDropped())) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); - } - - if (!s.ok()) { - cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_); - } else { - // Replace immutable memtable with the generated Table - s = cfd->imm()->InstallMemtableFlushResults( - cfd, mutable_cf_options, mems, versions_.get(), &mutex_, - db_options_.info_log.get(), file_number, &pending_outputs_, - &deletion_state.memtables_to_free, db_directory_.get(), log_buffer); - } + Status s = flush_job.Run(); if (s.ok()) { - InstallSuperVersion(cfd, deletion_state, mutable_cf_options); + InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); if (madeProgress) { *madeProgress = 1; } @@ -1645,7 +1506,7 @@ Status DBImpl::FlushMemTableToOutputFile( while (alive_log_files_.size() && alive_log_files_.begin()->number < versions_->MinLogNumber()) { const auto& earliest = *alive_log_files_.begin(); - deletion_state.log_delete_files.push_back(earliest.number); + job_context->log_delete_files.push_back(earliest.number); total_log_size_ -= earliest.size; alive_log_files_.pop_front(); } @@ -2082,8 +1943,7 @@ void DBImpl::BGWorkCompaction(void* db) { reinterpret_cast(db)->BackgroundCallCompaction(); } -Status DBImpl::BackgroundFlush(bool* madeProgress, - DeletionState& deletion_state, +Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); @@ -2109,7 +1969,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, cfd->GetName().c_str(), db_options_.max_background_flushes - bg_flush_scheduled_); flush_status = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); + cfd, mutable_cf_options, madeProgress, job_context, log_buffer); } if (call_status.ok() && !flush_status.ok()) { call_status = flush_status; @@ -2122,7 +1982,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(true); + JobContext job_context(true); assert(bg_flush_scheduled_); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); @@ -2131,7 +1991,7 @@ void DBImpl::BackgroundCallFlush() { Status s; if (!shutting_down_.load(std::memory_order_acquire)) { - s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer); + s = BackgroundFlush(&madeProgress, &job_context, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to @@ -2154,9 +2014,9 @@ void DBImpl::BackgroundCallFlush() { // If !s.ok(), this means that Flush failed. In that case, we want // to delete all obsolete files and we force FindObsoleteFiles() - FindObsoleteFiles(deletion_state, !s.ok()); + FindObsoleteFiles(&job_context, !s.ok()); // delete unnecessary files if any, this is done outside the mutex - if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); // Have to flush the info logs before bg_flush_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is @@ -2164,8 +2024,8 @@ void DBImpl::BackgroundCallFlush() { // states of DB so info_log might not be available after that point. 
// It also applies to access other states that DB owns. log_buffer.FlushBufferToLog(); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } mutex_.Lock(); } @@ -2189,7 +2049,7 @@ void DBImpl::BackgroundCallFlush() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(true); + JobContext job_context(true); MaybeDumpStats(); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); @@ -2198,7 +2058,7 @@ void DBImpl::BackgroundCallCompaction() { assert(bg_compaction_scheduled_); Status s; if (!shutting_down_.load(std::memory_order_acquire)) { - s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer); + s = BackgroundCompaction(&madeProgress, &job_context, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to @@ -2221,12 +2081,12 @@ void DBImpl::BackgroundCallCompaction() { // If !s.ok(), this means that Compaction failed. In that case, we want // to delete all obsolete files we might have created and we force - // FindObsoleteFiles(). This is because deletion_state does not catch - // all created files if compaction failed. - FindObsoleteFiles(deletion_state, !s.ok()); + // FindObsoleteFiles(). This is because job_context does not + // catch all created files if compaction failed. + FindObsoleteFiles(&job_context, !s.ok()); // delete unnecessary files if any, this is done outside the mutex - if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); // Have to flush the info logs before bg_compaction_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is @@ -2234,8 +2094,8 @@ void DBImpl::BackgroundCallCompaction() { // states of DB so info_log might not be available after that point. // It also applies to access other states that DB owns. 
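Both background callbacks finish with the same sequence spelled out in the comments above: drop the DB mutex, flush the buffered info-log lines while the DB is still guaranteed to be alive, purge obsolete files, then re-take the mutex. A condensed, hedged model of that ordering; MiniLogBuffer, BackgroundEpilogue and the std::mutex are stand-ins, not the real types.

#include <functional>
#include <mutex>
#include <string>
#include <vector>

struct MiniLogBuffer {
  std::vector<std::string> lines;
  bool IsEmpty() const { return lines.empty(); }
  void FlushBufferToLog() { lines.clear(); }  // writes to info_log in the real code
};

// Epilogue shared by BackgroundCallFlush and BackgroundCallCompaction.
// db_mutex is held on entry and on exit.
void BackgroundEpilogue(std::mutex* db_mutex, MiniLogBuffer* log_buffer,
                        bool have_something_to_delete,
                        const std::function<void()>& purge_obsolete_files) {
  if (have_something_to_delete || !log_buffer->IsEmpty()) {
    db_mutex->unlock();
    log_buffer->FlushBufferToLog();  // must run before the bg counter drops
    if (have_something_to_delete) {
      purge_obsolete_files();        // file deletion runs without the lock
    }
    db_mutex->lock();
  }
}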
log_buffer.FlushBufferToLog(); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } mutex_.Lock(); } @@ -2271,8 +2131,7 @@ void DBImpl::BackgroundCallCompaction() { } } -Status DBImpl::BackgroundCompaction(bool* madeProgress, - DeletionState& deletion_state, +Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { *madeProgress = false; mutex_.AssertHeld(); @@ -2312,7 +2171,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, db_options_.max_background_compactions - bg_compaction_scheduled_); cfd->Ref(); flush_stat = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, deletion_state, log_buffer); + cfd, mutable_cf_options, madeProgress, job_context, log_buffer); cfd->Unref(); if (!flush_stat.ok()) { if (is_manual) { @@ -2388,8 +2247,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, status = versions_->LogAndApply( c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, db_directory_.get()); - InstallSuperVersion(c->column_family_data(), deletion_state, - *c->mutable_cf_options()); + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); @@ -2407,8 +2266,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, *c->mutable_cf_options(), c->edit(), &mutex_, db_directory_.get()); // Use latest MutableCFOptions - InstallSuperVersion(c->column_family_data(), deletion_state, - *c->mutable_cf_options()); + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); Version::LevelSummaryStorage tmp; LogToBuffer( @@ -2423,8 +2282,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, } else { MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. 
CompactionState* compact = new CompactionState(c.get()); - status = DoCompactionWork(compact, *c->mutable_cf_options(), - deletion_state, log_buffer); + status = DoCompactionWork(compact, *c->mutable_cf_options(), job_context, + log_buffer); CleanupCompaction(compact, status); c->ReleaseCompactionFiles(status); c->ReleaseInputs(); @@ -2694,9 +2553,9 @@ inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( return 0; } -uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, - LogBuffer* log_buffer) { +uint64_t DBImpl::CallFlushDuringCompaction( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + JobContext* job_context, LogBuffer* log_buffer) { if (db_options_.max_background_flushes > 0) { // flush thread will take care of this return 0; @@ -2706,8 +2565,8 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, mutex_.Lock(); if (cfd->imm()->IsFlushPending()) { cfd->Ref(); - FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, - deletion_state, log_buffer); + FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, job_context, + log_buffer); cfd->Unref(); bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary } @@ -2719,18 +2578,11 @@ uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, } Status DBImpl::ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, - bool is_snapshot_supported, - SequenceNumber visible_at_tip, - SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, - DeletionState& deletion_state, - bool bottommost_level, - int64_t& imm_micros, - Iterator* input, - CompactionState* compact, - bool is_compaction_v2, - int* num_output_records, + const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, + SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, + SequenceNumber latest_snapshot, JobContext* job_context, + bool bottommost_level, int64_t* imm_micros, Iterator* input, + CompactionState* compact, bool is_compaction_v2, int* num_output_records, LogBuffer* log_buffer) { assert(num_output_records != nullptr); @@ -2786,8 +2638,8 @@ Status DBImpl::ProcessKeyValueCompaction( // TODO(icanadi) this currently only checks if flush is necessary on // compacting column family. 
we should also check if flush is necessary on // other column families, too - imm_micros += CallFlushDuringCompaction( - cfd, mutable_cf_options, deletion_state, log_buffer); + (*imm_micros) += CallFlushDuringCompaction(cfd, mutable_cf_options, + job_context, log_buffer); Slice key; Slice value; @@ -3127,7 +2979,7 @@ void DBImpl::CallCompactionFilterV2(CompactionState* compact, Status DBImpl::DoCompactionWork(CompactionState* compact, const MutableCFOptions& mutable_cf_options, - DeletionState& deletion_state, + JobContext* job_context, LogBuffer* log_buffer) { assert(compact); compact->CleanupBatchBuffer(); @@ -3198,19 +3050,10 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (!compaction_filter_v2) { status = ProcessKeyValueCompaction( - mutable_cf_options, - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - false, - &num_output_records, - log_buffer); + mutable_cf_options, is_snapshot_supported, visible_at_tip, + earliest_snapshot, latest_snapshot, job_context, bottommost_level, + &imm_micros, input.get(), compact, false, &num_output_records, + log_buffer); } else { // temp_backup_input always point to the start of the current buffer // temp_backup_input = backup_input; @@ -3231,7 +3074,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // compacting column family. we should also check if flush is necessary on // other column families, too imm_micros += CallFlushDuringCompaction(cfd, mutable_cf_options, - deletion_state, log_buffer); + job_context, log_buffer); Slice key = backup_input->key(); Slice value = backup_input->value(); @@ -3281,18 +3124,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, // Done buffering for the current prefix. 
Spit it out to disk // Now just iterate through all the kv-pairs status = ProcessKeyValueCompaction( - mutable_cf_options, - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - &num_output_records, + mutable_cf_options, is_snapshot_supported, visible_at_tip, + earliest_snapshot, latest_snapshot, job_context, bottommost_level, + &imm_micros, input.get(), compact, true, &num_output_records, log_buffer); if (!status.ok()) { @@ -3319,18 +3153,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( - mutable_cf_options, - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - &num_output_records, + mutable_cf_options, is_snapshot_supported, visible_at_tip, + earliest_snapshot, latest_snapshot, job_context, bottommost_level, + &imm_micros, input.get(), compact, true, &num_output_records, log_buffer); compact->CleanupBatchBuffer(); @@ -3343,18 +3168,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, } compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction( - mutable_cf_options, - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - &num_output_records, + mutable_cf_options, is_snapshot_supported, visible_at_tip, + earliest_snapshot, latest_snapshot, job_context, bottommost_level, + &imm_micros, input.get(), compact, true, &num_output_records, log_buffer); } // checking for compaction filter v2 @@ -3421,7 +3237,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { status = InstallCompactionResults(compact, mutable_cf_options, log_buffer); - InstallSuperVersion(cfd, deletion_state, mutable_cf_options); + InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); } Version::LevelSummaryStorage tmp; LogToBuffer( @@ -3461,16 +3277,16 @@ static void CleanupIteratorState(void* arg1, void* arg2) { IterState* state = reinterpret_cast(arg1); if (state->super_version->Unref()) { - DBImpl::DeletionState deletion_state; + JobContext job_context; state->mu->Lock(); state->super_version->Cleanup(); - state->db->FindObsoleteFiles(deletion_state, false, true); + state->db->FindObsoleteFiles(&job_context, false, true); state->mu->Unlock(); delete state->super_version; - if (deletion_state.HaveSomethingToDelete()) { - state->db->PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + state->db->PurgeObsoleteFiles(job_context); } } @@ -3511,25 +3327,27 @@ Status DBImpl::Get(const ReadOptions& read_options, return GetImpl(read_options, column_family, key, value); } -// DeletionState gets created and destructed outside of the lock -- we +// JobContext gets created and destructed outside of the lock -- +// we // use this convinently to: // * malloc one SuperVersion() outside of the lock -- new_superversion // * delete SuperVersion()s outside of the lock -- superversions_to_free // -// However, if InstallSuperVersion() gets called twice with the same, -// deletion_state, we can't reuse the SuperVersion() that got malloced because +// However, if InstallSuperVersion() gets called twice with the same +// job_context, we can't reuse the SuperVersion() that 
got +// malloced +// because // first call already used it. In that rare case, we take a hit and create a // new SuperVersion() inside of the mutex. We do similar thing // for superversion_to_free -void DBImpl::InstallSuperVersion( - ColumnFamilyData* cfd, DeletionState& deletion_state, +void DBImpl::InstallSuperVersionBackground( + ColumnFamilyData* cfd, JobContext* job_context, const MutableCFOptions& mutable_cf_options) { mutex_.AssertHeld(); - SuperVersion* old_superversion = - InstallSuperVersion(cfd, deletion_state.new_superversion, - mutable_cf_options); - deletion_state.new_superversion = nullptr; - deletion_state.superversions_to_free.push_back(old_superversion); + SuperVersion* old_superversion = InstallSuperVersion( + cfd, job_context->new_superversion, mutable_cf_options); + job_context->new_superversion = nullptr; + job_context->superversions_to_free.push_back(old_superversion); } SuperVersion* DBImpl::InstallSuperVersion( @@ -4529,7 +4347,7 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData* metadata; ColumnFamilyData* cfd; VersionEdit edit; - DeletionState deletion_state(true); + JobContext job_context(true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); @@ -4567,15 +4385,15 @@ Status DBImpl::DeleteFile(std::string name) { status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, db_directory_.get()); if (status.ok()) { - InstallSuperVersion(cfd, deletion_state, - *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionBackground(cfd, &job_context, + *cfd->GetLatestMutableCFOptions()); } - FindObsoleteFiles(deletion_state, false); - } // lock released here + FindObsoleteFiles(&job_context, false); + } // lock released here LogFlush(db_options_.info_log); // remove files outside the db-lock - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } { MutexLock l(&mutex_); diff --git a/db/db_impl.h b/db/db_impl.h index f730d6ba4..15205d90b 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -35,6 +35,7 @@ #include "db/write_controller.h" #include "db/flush_scheduler.h" #include "db/write_thread.h" +#include "db/job_context.h" namespace rocksdb { @@ -223,88 +224,19 @@ class DBImpl : public DB { void TEST_EndWrite(void* w); #endif // NDEBUG - // Structure to store information for candidate files to delete. 
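The paragraph above, together with the JobContext constructor added later in this diff, describes a small allocation protocol: a SuperVersion is allocated before the lock is taken, consumed inside the lock, and the displaced one is freed only after the lock is dropped. A simplified stand-alone model of that handoff; the stub SuperVersion and the names MiniJobContext/InstallSuperVersionBackgroundModel are illustrative.

#include <vector>

struct SuperVersion {};  // stub; the real one snapshots mem/imm/current

struct MiniJobContext {
  SuperVersion* new_superversion = nullptr;          // allocated outside the lock
  std::vector<SuperVersion*> superversions_to_free;  // freed outside the lock

  explicit MiniJobContext(bool create_superversion = false) {
    if (create_superversion) new_superversion = new SuperVersion();
  }
  ~MiniJobContext() {
    for (auto* s : superversions_to_free) delete s;
    delete new_superversion;  // still non-null only if it was never consumed
  }
};

// Called with the DB mutex held: swap in the pre-allocated SuperVersion and
// defer deletion of the old one until after the mutex is released. On the
// rare second call with the same context, allocate under the lock instead.
void InstallSuperVersionBackgroundModel(MiniJobContext* job_context,
                                        SuperVersion** current) {
  SuperVersion* sv = job_context->new_superversion != nullptr
                         ? job_context->new_superversion
                         : new SuperVersion();
  job_context->new_superversion = nullptr;
  job_context->superversions_to_free.push_back(*current);
  *current = sv;
}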
- struct CandidateFileInfo { - std::string file_name; - uint32_t path_id; - CandidateFileInfo(std::string name, uint32_t path) - : file_name(name), path_id(path) {} - bool operator==(const CandidateFileInfo& other) const { - return file_name == other.file_name && path_id == other.path_id; - } - }; - - // needed for CleanupIteratorState - struct DeletionState { - inline bool HaveSomethingToDelete() const { - return candidate_files.size() || - sst_delete_files.size() || - log_delete_files.size(); - } - - // a list of all files that we'll consider deleting - // (every once in a while this is filled up with all files - // in the DB directory) - std::vector candidate_files; - - // the list of all live sst files that cannot be deleted - std::vector sst_live; - - // a list of sst files that we need to delete - std::vector sst_delete_files; - - // a list of log files that we need to delete - std::vector log_delete_files; - - // a list of memtables to be free - autovector memtables_to_free; - - autovector superversions_to_free; - - SuperVersion* new_superversion; // if nullptr no new superversion - - // the current manifest_file_number, log_number and prev_log_number - // that corresponds to the set of files in 'live'. - uint64_t manifest_file_number, pending_manifest_file_number, log_number, - prev_log_number; - - explicit DeletionState(bool create_superversion = false) { - manifest_file_number = 0; - pending_manifest_file_number = 0; - log_number = 0; - prev_log_number = 0; - new_superversion = create_superversion ? new SuperVersion() : nullptr; - } - - ~DeletionState() { - // free pending memtables - for (auto m : memtables_to_free) { - delete m; - } - // free superversions - for (auto s : superversions_to_free) { - delete s; - } - // if new_superversion was not used, it will be non-nullptr and needs - // to be freed here - delete new_superversion; - } - }; - // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than // db_options_.delete_obsolete_files_period_micros microseconds ago, - // it will not fill up the deletion_state - void FindObsoleteFiles(DeletionState& deletion_state, - bool force, + // it will not fill up the job_context + void FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan = false); // Diffs the files listed in filenames and those that do not // belong to live files are posibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. - void PurgeObsoleteFiles(DeletionState& deletion_state); + void PurgeObsoleteFiles(const JobContext& background_contet); ColumnFamilyHandle* DefaultColumnFamily() const; @@ -347,9 +279,10 @@ class DBImpl : public DB { // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status FlushMemTableToOutputFile( - ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, DeletionState& deletion_state, LogBuffer* log_buffer); + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, + LogBuffer* log_buffer); // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, @@ -362,11 +295,6 @@ class DBImpl : public DB { // concurrent flush memtables to storage. 
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); - Status WriteLevel0Table(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, - const autovector& mems, - VersionEdit* edit, uint64_t* filenumber, LogBuffer* log_buffer); - Status DelayWrite(uint64_t expiration_time); Status ScheduleFlushes(WriteContext* context); @@ -388,39 +316,32 @@ class DBImpl : public DB { static void BGWorkFlush(void* db); void BackgroundCallCompaction(); void BackgroundCallFlush(); - Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state, + Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); - Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, + Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); void CleanupCompaction(CompactionState* compact, Status status); Status DoCompactionWork(CompactionState* compact, const MutableCFOptions& mutable_cf_options, - DeletionState& deletion_state, - LogBuffer* log_buffer); + JobContext* job_context, LogBuffer* log_buffer); // This function is called as part of compaction. It enables Flush process to // preempt compaction, since it's higher prioirty // Returns: micros spent executing uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, DeletionState& deletion_state, - LogBuffer* log_buffer); + const MutableCFOptions& mutable_cf_options, + JobContext* job_context, + LogBuffer* log_buffer); // Call compaction filter if is_compaction_v2 is not true. Then iterate // through input and compact the kv-pairs Status ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, - bool is_snapshot_supported, - SequenceNumber visible_at_tip, - SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, - DeletionState& deletion_state, - bool bottommost_level, - int64_t& imm_micros, - Iterator* input, - CompactionState* compact, - bool is_compaction_v2, - int* num_output_records, - LogBuffer* log_buffer); + const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, + SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, + SequenceNumber latest_snapshot, JobContext* job_context, + bool bottommost_level, int64_t* imm_micros, Iterator* input, + CompactionState* compact, bool is_compaction_v2, int* num_output_records, + LogBuffer* log_buffer); // Call compaction_filter_v2->Filter() on kv-pairs in compact void CallCompactionFilterV2(CompactionState* compact, @@ -624,11 +545,12 @@ class DBImpl : public DB { SequenceNumber* prev_snapshot); // Background threads call this function, which is just a wrapper around - // the cfd->InstallSuperVersion() function. Background threads carry - // deletion_state which can have new_superversion already allocated. - void InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state, - const MutableCFOptions& mutable_cf_options); + // the InstallSuperVersion() function. Background threads carry + // job_context which can have new_superversion already + // allocated. + void InstallSuperVersionBackground( + ColumnFamilyData* cfd, JobContext* job_context, + const MutableCFOptions& mutable_cf_options); SuperVersion* InstallSuperVersion( ColumnFamilyData* cfd, SuperVersion* new_sv, diff --git a/db/flush_job.cc b/db/flush_job.cc new file mode 100644 index 000000000..ff35e9a9a --- /dev/null +++ b/db/flush_job.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/flush_job.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/version_set.h" +#include "port/port.h" +#include "port/likely.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/iostats_context_imp.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" + +namespace rocksdb { + +FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + port::Mutex* db_mutex, std::atomic* shutting_down, + FileNumToPathIdMap* pending_outputs, + SequenceNumber newest_snapshot, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + CompressionType output_compression, Statistics* stats) + : dbname_(dbname), + cfd_(cfd), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + env_options_(env_options), + versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + pending_outputs_(pending_outputs), + newest_snapshot_(newest_snapshot), + job_context_(job_context), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_compression_(output_compression), + stats_(stats) {} + +Status FlushJob::Run() { + // Save the contents of the earliest memtable as a new Table + uint64_t file_number; + autovector mems; + cfd_->imm()->PickMemtablesToFlush(&mems); + if (mems.empty()) { + LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // entries mems are (implicitly) sorted in ascending order by their created + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems[0]; + VersionEdit* edit = m->GetEdits(); + edit->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit->SetLogNumber(mems.back()->GetNextLogNumber()); + edit->SetColumnFamily(cfd_->GetID()); + + // This will release and re-acquire the mutex. 
+ Status s = WriteLevel0Table(mems, edit, &file_number); + + if (s.ok() && + (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { + s = Status::ShutdownInProgress( + "Database shutdown or Column family drop during flush"); + } + + if (!s.ok()) { + cfd_->imm()->RollbackMemtableFlush(mems, file_number, pending_outputs_); + } else { + // Replace immutable memtable with the generated Table + s = cfd_->imm()->InstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems, versions_, db_mutex_, file_number, + pending_outputs_, &job_context_->memtables_to_free, db_directory_, + log_buffer_); + } + + return s; +} + +Status FlushJob::WriteLevel0Table(const autovector& mems, + VersionEdit* edit, uint64_t* filenumber) { + db_mutex_->AssertHeld(); + const uint64_t start_micros = db_options_.env->NowMicros(); + FileMetaData meta; + + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + *filenumber = meta.fd.GetNumber(); + // path 0 for level 0 file. + pending_outputs_->insert({meta.fd.GetNumber(), 0}); + + const SequenceNumber earliest_seqno_in_memtable = + mems[0]->GetFirstSequenceNumber(); + Version* base = cfd_->current(); + base->Ref(); // it is likely that we do not need this reference + Status s; + { + db_mutex_->Unlock(); + if (log_buffer_) { + log_buffer_->FlushBufferToLog(); + } + std::vector memtables; + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + for (MemTable* m : mems) { + Log(db_options_.info_log, + "[%s] Flushing memtable with next log file: %" PRIu64 "\n", + cfd_->GetName().c_str(), m->GetNextLogNumber()); + memtables.push_back(m->NewIterator(ro, &arena)); + } + { + ScopedArenaIterator iter(NewMergingIterator(&cfd_->internal_comparator(), + &memtables[0], + memtables.size(), &arena)); + Log(db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd_->GetName().c_str(), meta.fd.GetNumber()); + + s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_, + cfd_->table_cache(), iter.get(), &meta, + cfd_->internal_comparator(), newest_snapshot_, + earliest_seqno_in_memtable, output_compression_, + cfd_->ioptions()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + } + Log(db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + + if (!db_options_.disableDataSync && db_directory_ != nullptr) { + db_directory_->Fsync(); + } + db_mutex_->Lock(); + } + base->Unref(); + + // re-acquire the most current version + base = cfd_->current(); + + // There could be multiple threads writing to its own level-0 file. + // The pending_outputs cannot be cleared here, otherwise this newly + // created file might not be considered as a live-file by another + // compaction thread that is concurrently deleting obselete files. + // The pending_outputs can be cleared only after the new version is + // committed so that other threads can recognize this file as a + // valid one. + // pending_outputs_.erase(meta.number); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. 
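The pending_outputs_->insert() near the top of WriteLevel0Table, plus the long comment above about not clearing the entry until the new version is committed, is what keeps a half-built level-0 file alive against a concurrent FindObsoleteFiles. A small stand-alone model of that protocol; the unordered_map stands in for FileNumToPathIdMap and the function names are invented for illustration.

#include <cstdint>
#include <unordered_map>
#include <unordered_set>

// file number -> path id of outputs currently being written
using PendingOutputs = std::unordered_map<uint64_t, uint32_t>;

// Flush registers its output before dropping the DB mutex ...
void RegisterFlushOutput(PendingOutputs* pending, uint64_t file_number) {
  pending->insert({file_number, 0});  // path 0 for level-0 files
}

// ... so obsolete-file collection must treat pending files as live.
bool IsDeletable(uint64_t file_number,
                 const std::unordered_set<uint64_t>& live_in_some_version,
                 const PendingOutputs& pending) {
  return live_in_some_version.count(file_number) == 0 &&
         pending.count(file_number) == 0;
}

// The entry is erased only after the VersionEdit referencing the file has
// been committed, at which point the Version itself keeps the file live.
void UnregisterFlushOutput(PendingOutputs* pending, uint64_t file_number) {
  pending->erase(file_number);
}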
+ int level = 0; + if (s.ok() && meta.fd.GetFileSize() > 0) { + const Slice min_user_key = meta.smallest.user_key(); + const Slice max_user_key = meta.largest.user_key(); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. + if (base != nullptr && db_options_.max_background_compactions <= 1 && + db_options_.max_background_flushes == 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + level = base->PickLevelForMemTableOutput(mutable_cf_options_, + min_user_key, max_user_key); + } + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), + meta.fd.GetFileSize(), meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + InternalStats::CompactionStats stats(1); + stats.micros = db_options_.env->NowMicros() - start_micros; + stats.bytes_written = meta.fd.GetFileSize(); + cfd_->internal_stats()->AddCompactionStats(level, stats); + cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); + RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + return s; +} + +} // namespace rocksdb diff --git a/db/flush_job.h b/db/flush_job.h new file mode 100644 index 000000000..a5a40ce41 --- /dev/null +++ b/db/flush_job.h @@ -0,0 +1,86 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/column_family.h" +#include "db/version_edit.h" +#include "db/memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" +#include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" +#include "db/job_context.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class FlushJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive + FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + port::Mutex* db_mutex, std::atomic* shutting_down, + FileNumToPathIdMap* pending_outputs, SequenceNumber newest_snapshot, + JobContext* job_context, LogBuffer* log_buffer, + Directory* db_directory, CompressionType output_compression, + Statistics* stats); + ~FlushJob() {} + + Status Run(); + + private: + Status WriteLevel0Table(const autovector& mems, VersionEdit* edit, + uint64_t* filenumber); + const std::string& dbname_; + ColumnFamilyData* cfd_; + const DBOptions& db_options_; + const MutableCFOptions& mutable_cf_options_; + const EnvOptions& env_options_; + VersionSet* versions_; + port::Mutex* db_mutex_; + std::atomic* shutting_down_; + FileNumToPathIdMap* pending_outputs_; + SequenceNumber newest_snapshot_; + JobContext* job_context_; + LogBuffer* log_buffer_; + Directory* db_directory_; + CompressionType output_compression_; + Statistics* stats_; +}; + +} // namespace rocksdb diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc new file mode 100644 index 000000000..06852eedf --- /dev/null +++ b/db/flush_job_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/flush_job.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "rocksdb/cache.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// TODO(icanadi) Mock out everything else: +// 1. VersionSet +// 2. TableBuilder +// 3. 
Memtable +class FlushJobTest { + public: + FlushJobTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/flush_job_test"), + table_cache_(NewLRUCache(50000, 16, 8)), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_controller_)), + shutting_down_(false) { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + std::vector column_families; + column_families.emplace_back(); + + ASSERT_OK(versions_->Recover(column_families, false)); + } + + void NewDB() { + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, nullptr); + } + + Env* env_; + std::string dbname_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + DBOptions db_options_; + ColumnFamilyOptions cf_options_; + std::unique_ptr versions_; + port::Mutex mutex_; + std::atomic shutting_down_; + FileNumToPathIdMap pending_outputs_; +}; + +TEST(FlushJobTest, Empty) { + JobContext job_context; + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + env_options_, versions_.get(), &mutex_, &shutting_down_, + &pending_outputs_, SequenceNumber(), &job_context, nullptr, + nullptr, kNoCompression, nullptr); + ASSERT_OK(flush_job.Run()); +} + +TEST(FlushJobTest, NonEmpty) { + JobContext job_context; + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + auto new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); + new_mem->Ref(); + for (int i = 1; i < 10000; ++i) { + std::string key(std::to_string(i)); + std::string value("value" + std::to_string(i)); + new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + } + cfd->imm()->Add(new_mem); + + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + env_options_, versions_.get(), &mutex_, &shutting_down_, + &pending_outputs_, SequenceNumber(), &job_context, nullptr, + nullptr, kNoCompression, nullptr); + mutex_.Lock(); + ASSERT_OK(flush_job.Run()); + mutex_.Unlock(); + // TODO(icanadi) once you have TableMock, verify that key-values are as + // expected +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index cd9299aa4..04b5b3b34 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -10,6 +10,7 @@ #include #include +#include "db/job_context.h" #include "db/db_impl.h" #include "db/db_iter.h" #include "db/column_family.h" @@ -155,14 +156,14 @@ void ForwardIterator::Cleanup(bool release_sv) { if (release_sv) { if (sv_ != nullptr && sv_->Unref()) { - DBImpl::DeletionState deletion_state; + JobContext job_context; db_->mutex_.Lock(); sv_->Cleanup(); - db_->FindObsoleteFiles(deletion_state, false, 
true); + db_->FindObsoleteFiles(&job_context, false, true); db_->mutex_.Unlock(); delete sv_; - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(job_context); } } } diff --git a/db/job_context.h b/db/job_context.h new file mode 100644 index 000000000..caf28f7d9 --- /dev/null +++ b/db/job_context.h @@ -0,0 +1,87 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include "db/column_family.h" + +namespace rocksdb { + +class MemTable; + +struct JobContext { + inline bool HaveSomethingToDelete() const { + return candidate_files.size() || sst_delete_files.size() || + log_delete_files.size(); + } + + // Structure to store information for candidate files to delete. + struct CandidateFileInfo { + std::string file_name; + uint32_t path_id; + CandidateFileInfo(std::string name, uint32_t path) + : file_name(std::move(name)), path_id(path) {} + bool operator==(const CandidateFileInfo& other) const { + return file_name == other.file_name && path_id == other.path_id; + } + }; + + // a list of all files that we'll consider deleting + // (every once in a while this is filled up with all files + // in the DB directory) + std::vector candidate_files; + + // the list of all live sst files that cannot be deleted + std::vector sst_live; + + // a list of sst files that we need to delete + std::vector sst_delete_files; + + // a list of log files that we need to delete + std::vector log_delete_files; + + // a list of memtables to be free + autovector memtables_to_free; + + autovector superversions_to_free; + + SuperVersion* new_superversion; // if nullptr no new superversion + + // the current manifest_file_number, log_number and prev_log_number + // that corresponds to the set of files in 'live'. + uint64_t manifest_file_number, pending_manifest_file_number, log_number, + prev_log_number; + + explicit JobContext(bool create_superversion = false) { + manifest_file_number = 0; + pending_manifest_file_number = 0; + log_number = 0; + prev_log_number = 0; + new_superversion = create_superversion ? 
new SuperVersion() : nullptr; + } + + ~JobContext() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversions + for (auto s : superversions_to_free) { + delete s; + } + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; + } +}; + +} // namespace rocksdb diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 69325c748..3c74e073c 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,6 +5,11 @@ // #include "db/memtable_list.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include "rocksdb/db.h" #include "db/memtable.h" @@ -161,10 +166,10 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& mems, VersionSet* vset, - port::Mutex* mu, Logger* info_log, uint64_t file_number, - FileNumToPathIdMap* pending_outputs, autovector* to_delete, - Directory* db_directory, LogBuffer* log_buffer) { + const autovector& mems, VersionSet* vset, port::Mutex* mu, + uint64_t file_number, FileNumToPathIdMap* pending_outputs, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer) { mu->AssertHeld(); // flush was sucessful @@ -194,8 +199,8 @@ Status MemTableList::InstallMemtableFlushResults( break; } - LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started", - cfd->GetName().c_str(), (unsigned long)m->file_number_); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); // this can release and reacquire the mutex. s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory); @@ -209,10 +214,9 @@ Status MemTableList::InstallMemtableFlushResults( uint64_t mem_id = 1; // how many memtables has been flushed. do { if (s.ok()) { // commit new state - LogToBuffer(log_buffer, - "[%s] Level-0 commit table #%lu: memtable #%lu done", - cfd->GetName().c_str(), (unsigned long)m->file_number_, - (unsigned long)mem_id); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); current_->Remove(m); assert(m->file_number_ > 0); @@ -226,10 +230,9 @@ Status MemTableList::InstallMemtableFlushResults( } } else { //commit failed. setup state so that we can flush again. 
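The memtable_list.cc hunk above replaces the %lu-plus-cast logging with PRIu64, which is why __STDC_FORMAT_MACROS is now defined before the inttypes header is pulled in. A tiny self-contained reminder of that idiom, with made-up values standing in for the real file and memtable numbers:

// PRIu64 expands to the correct printf length modifier for uint64_t on every
// platform, so no cast to unsigned long is needed. The macro guard must be
// defined before <inttypes.h> is (transitively) included, as in this hunk.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  uint64_t file_number = 7;
  uint64_t mem_id = 1;
  printf("[default] Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64
         " done\n", file_number, mem_id);
  return 0;
}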
- Log(info_log, - "Level-0 commit table #%lu: memtable #%lu failed", - (unsigned long)m->file_number_, - (unsigned long)mem_id); + LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + m->file_number_, mem_id); m->flush_completed_ = false; m->flush_in_progress_ = false; m->edit_.Clear(); diff --git a/db/memtable_list.h b/db/memtable_list.h index 5e16be5cb..9f499b834 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -114,10 +114,10 @@ class MemTableList { // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& m, VersionSet* vset, - port::Mutex* mu, Logger* info_log, uint64_t file_number, - FileNumToPathIdMap* pending_outputs, autovector* to_delete, - Directory* db_directory, LogBuffer* log_buffer); + const autovector& m, VersionSet* vset, port::Mutex* mu, + uint64_t file_number, FileNumToPathIdMap* pending_outputs, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 296b1f620..1aeb5f7b5 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -74,7 +74,7 @@ void PrintStackTraceLine(const char* symbol, void* frame) { // out source to atos, for the address translation const int kLineMax = 256; char cmd[kLineMax]; - snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid); + snprintf(cmd, kLineMax, "xcrun atos -d %p -p %d 2>&1", frame, pid); auto f = popen(cmd, "r"); if (f) { char line[kLineMax]; From 3772a3d09dc2835f5ee3db34fa2c31474bfe3186 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 28 Oct 2014 14:27:26 -0700 Subject: [PATCH 324/829] Fix the bug where compaction does not fail when RocksDB can't create a new file. Summary: This diff has two fixes. 1. Fix the bug where compaction does not fail when RocksDB can't create a new file. 2. When NewWritableFiles() fails in OpenCompactionOutputFiles(), previously such fail-to-created file will be still be included as a compaction output. This patch also fixes this bug. 3. Allow VersionEdit::EncodeTo() to return Status and add basic check. 
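Fix 2 in the list above boils down to an ordering change in OpenCompactionOutputFile, shown in the hunk that follows: the writable file is created first, and an output record is appended only once creation succeeded, so a failed NewWritableFile can no longer leave a phantom entry in the compaction's output list. A minimal stand-alone illustration of the pattern; Output, OpenCompactionOutputModel and file_factory (which plays the role of Env::NewWritableFile) are invented stand-ins.

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct Output { uint64_t number; };

bool OpenCompactionOutputModel(
    uint64_t file_number, const std::string& fname,
    const std::function<bool(const std::string&)>& file_factory,
    std::vector<Output>* outputs) {
  // Create the file first; on failure nothing has been recorded yet, so the
  // caller has nothing to unwind.
  if (!file_factory(fname)) {
    return false;
  }
  // Only a successfully created file becomes an output of this compaction.
  outputs->push_back(Output{file_number});
  return true;
}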
Test Plan: ./version_edit_test export ROCKSDB_TESTS=FileCreationRandomFailure ./db_test Reviewers: ljin, sdong, nkg-, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D25581 --- db/db_impl.cc | 47 ++++++----- db/db_test.cc | 168 ++++++++++++++++++++++++++++++++++++++-- db/version_edit.cc | 7 +- db/version_edit.h | 3 +- db/version_edit_test.cc | 10 +++ db/version_set.cc | 29 +++++-- 6 files changed, 229 insertions(+), 35 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c53a4bd92..a47668763 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2347,7 +2347,7 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { compact->builder->Abandon(); compact->builder.reset(); } else { - assert(compact->outfile == nullptr); + assert(!status.ok() || compact->outfile == nullptr); } for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; @@ -2402,30 +2402,37 @@ Status DBImpl::OpenCompactionOutputFile( pending_outputs_[file_number] = compact->compaction->GetOutputPathId(); mutex_.Unlock(); } + // Make the output file + std::string fname = TableFileName(db_options_.db_paths, file_number, + compact->compaction->GetOutputPathId()); + Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[%s] OpenCompactionOutputFiles for table #%" PRIu64 " " + "fails at NewWritableFile with status %s", + compact->compaction->column_family_data()->GetName().c_str(), + file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + return s; + } CompactionState::Output out; out.number = file_number; out.path_id = compact->compaction->GetOutputPathId(); out.smallest.Clear(); out.largest.Clear(); out.smallest_seqno = out.largest_seqno = 0; - compact->outputs.push_back(out); - // Make the output file - std::string fname = TableFileName(db_options_.db_paths, file_number, - compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); - - if (s.ok()) { - compact->outfile->SetIOPriority(Env::IO_LOW); - compact->outfile->SetPreallocationBlockSize( - compact->compaction->OutputFilePreallocationSize(mutable_cf_options)); + compact->outputs.push_back(out); + compact->outfile->SetIOPriority(Env::IO_LOW); + compact->outfile->SetPreallocationBlockSize( + compact->compaction->OutputFilePreallocationSize(mutable_cf_options)); - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - compact->builder.reset(NewTableBuilder( - *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType(), - cfd->ioptions()->compression_opts)); - } + ColumnFamilyData* cfd = compact->compaction->column_family_data(); + compact->builder.reset(NewTableBuilder( + *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), + compact->compaction->OutputCompressionType(), + cfd->ioptions()->compression_opts)); LogFlush(db_options_.info_log); return s; } @@ -2616,7 +2623,7 @@ Status DBImpl::ProcessKeyValueCompaction( int64_t key_drop_obsolete = 0; int64_t loop_cnt = 0; while (input->Valid() && !shutting_down_.load(std::memory_order_acquire) && - !cfd->IsDropped()) { + !cfd->IsDropped() && status.ok()) { if (++loop_cnt > 1000) { if (key_drop_user > 0) { RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); @@ -2891,8 +2898,8 @@ Status DBImpl::ProcessKeyValueCompaction( // Only had one item 
to begin with (Put/Delete) break; } - } - } + } // while (true) + } // if (!drop) // MergeUntil has moved input to the next entry if (!current_entry_is_merging) { diff --git a/db/db_test.cc b/db/db_test.cc index 3ded0ec97..927d97ed4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -120,6 +120,8 @@ static std::string Key(int i) { // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: + Random rnd_; + // sstable Sync() calls are blocked while this pointer is non-nullptr. std::atomic delay_sstable_sync_; @@ -153,7 +155,13 @@ class SpecialEnv : public EnvWrapper { std::atomic sync_counter_; - explicit SpecialEnv(Env* base) : EnvWrapper(base) { + std::atomic non_writeable_rate_; + + std::atomic new_writable_count_; + + std::atomic periodic_non_writable_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); no_space_.store(false, std::memory_order_release); @@ -165,6 +173,9 @@ class SpecialEnv : public EnvWrapper { log_write_error_.store(false, std::memory_order_release); bytes_written_ = 0; sync_counter_ = 0; + non_writeable_rate_ = 0; + new_writable_count_ = 0; + periodic_non_writable_ = 0; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -250,8 +261,19 @@ class SpecialEnv : public EnvWrapper { } }; - if (non_writable_.load(std::memory_order_acquire)) { - return Status::IOError("simulated write error"); + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { + auto random_number = rnd_.Uniform(100); + if (random_number < non_writeable_rate_.load()) { + return Status::IOError("simulated random write error"); + } + } + + new_writable_count_++; + + auto periodic_fail = periodic_non_writable_.load(); + if (periodic_fail > 0 && + new_writable_count_.load() % periodic_fail == 0) { + return Status::IOError("simulated periodic write error"); } Status s = target()->NewWritableFile(f, r, soptions); @@ -5871,8 +5893,7 @@ TEST(DBTest, NonWritableFileSystem) { options.env = env_; Reopen(&options); ASSERT_OK(Put("foo", "v1")); - // Force errors for new files - env_->non_writable_.store(true, std::memory_order_release); + env_->non_writeable_rate_.store(100); std::string big(100000, 'x'); int errors = 0; for (int i = 0; i < 20; i++) { @@ -5882,7 +5903,7 @@ TEST(DBTest, NonWritableFileSystem) { } } ASSERT_GT(errors, 0); - env_->non_writable_.store(false, std::memory_order_release); + env_->non_writeable_rate_.store(0); } while (ChangeCompactOptions()); } @@ -8962,6 +8983,141 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_EQ(NumTableFilesAtLevel(2), 1); } +TEST(DBTest, FileCreationRandomFailure) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = 100000; // Small write buffer + options.target_file_size_base = 200000; + options.max_bytes_for_level_base = 1000000; + options.max_bytes_for_level_multiplier = 2; + + DestroyAndReopen(&options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 4096; + const int kTotalIteration = 100; + // the second half of the test involves in random failure + // of file creation. 
+ const int kRandomFailureTest = kTotalIteration / 2; + std::vector values; + for (int i = 0; i < kTestSize; ++i) { + values.push_back("NOT_FOUND"); + } + for (int j = 0; j < kTotalIteration; ++j) { + if (j == kRandomFailureTest) { + env_->non_writeable_rate_.store(90); + } + for (int k = 0; k < kTestSize; ++k) { + // here we expect some of the Put fails. + std::string value = RandomString(&rnd, 100); + Status s = Put(Key(k), Slice(value)); + if (s.ok()) { + // update the latest successful put + values[k] = value; + } + // But everything before we simulate the failure-test should succeed. + if (j < kRandomFailureTest) { + ASSERT_OK(s); + } + } + } + + // If rocksdb does not do the correct job, internal assert will fail here. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + + // verify we have the latest successful update + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } + + // reopen and reverify we have the latest successful update + env_->non_writeable_rate_.store(0); + Reopen(&options); + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } +} + +TEST(DBTest, PartialCompactionFailure) { + Options options; + const int kKeySize = 16; + const int kKvSize = 1000; + const int kKeysPerBuffer = 100; + const int kNumL1Files = 5; + options.create_if_missing = true; + options.write_buffer_size = kKeysPerBuffer * kKvSize; + options.max_write_buffer_number = 2; + options.target_file_size_base = + options.write_buffer_size * + (options.max_write_buffer_number - 1); + options.level0_file_num_compaction_trigger = kNumL1Files; + options.max_bytes_for_level_base = + options.level0_file_num_compaction_trigger * + options.target_file_size_base; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + + // The number of NewWritableFiles calls required by each operation. + const int kNumInitialNewWritableFiles = 4; + const int kNumLevel0FlushNewWritableFiles = + options.level0_file_num_compaction_trigger * 2; + const int kNumLevel1NewWritableFiles = + options.level0_file_num_compaction_trigger + 1; + // This setting will make one of the file-creation fail + // in the first L0 -> L1 compaction while making sure + // all flushes succeeed. + env_->periodic_non_writable_ = + kNumInitialNewWritableFiles + kNumLevel0FlushNewWritableFiles + + kNumLevel1NewWritableFiles - 3; + options.env = env_; + + DestroyAndReopen(&options); + + const int kNumKeys = + options.level0_file_num_compaction_trigger * + (options.max_write_buffer_number - 1) * + kKeysPerBuffer * 1.0; + + Random rnd(301); + std::vector keys; + std::vector values; + for (int k = 0; k < kNumKeys; ++k) { + keys.emplace_back(RandomString(&rnd, kKeySize)); + values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); + ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); + } + + dbfull()->TEST_WaitForFlushMemTable(); + // Make sure the number of L0 files can trigger compaction. + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + auto previous_num_level0_files = NumTableFilesAtLevel(0); + // Expect compaction to fail here as one file will fail its + // creation. + dbfull()->TEST_WaitForCompact(); + // Verify L0 -> L1 compaction does fail. + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + // Verify all L0 files are still there. + ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); + + // All key-values must exist after compaction fails. 
+ for (int k = 0; k < kNumKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } + + // Make sure RocksDB will not get into corrupted state. + Reopen(&options); + + // Verify again after reopen. + for (int k = 0; k < kNumKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } +} + TEST(DBTest, DynamicMiscOptions) { // Test max_sequential_skip_in_iterations Options options; diff --git a/db/version_edit.cc b/db/version_edit.cc index 271016aaf..1252759aa 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -9,6 +9,7 @@ #include "db/version_edit.h" +#include "db/filename.h" #include "db/version_set.h" #include "util/coding.h" #include "rocksdb/slice.h" @@ -64,7 +65,7 @@ void VersionEdit::Clear() { column_family_name_.clear(); } -void VersionEdit::EncodeTo(std::string* dst) const { +bool VersionEdit::EncodeTo(std::string* dst) const { if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -111,6 +112,9 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, f.fd.GetPathId()); } PutVarint64(dst, f.fd.GetFileSize()); + if (!f.smallest.Valid() || !f.largest.Valid()) { + return false; + } PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64(dst, f.smallest_seqno); @@ -131,6 +135,7 @@ void VersionEdit::EncodeTo(std::string* dst) const { if (is_column_family_drop_) { PutVarint32(dst, kColumnFamilyDrop); } + return true; } static bool GetInternalKey(Slice* input, InternalKey* dst) { diff --git a/db/version_edit.h b/db/version_edit.h index fbe7e02d1..3317b11c4 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -213,7 +213,8 @@ class VersionEdit { is_column_family_drop_ = true; } - void EncodeTo(std::string* dst) const; + // return true on success. 
+ bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); std::string DebugString(bool hex_key = false) const; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 850f242c1..fe663c766 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -44,6 +44,16 @@ TEST(VersionEditTest, EncodeDecode) { TestEncodeDecode(edit); } +TEST(VersionEditTest, EncodeEmptyFile) { + VersionEdit edit; + edit.AddFile(0, 0, 0, 0, + InternalKey(), + InternalKey(), + 0, 0); + std::string buffer; + ASSERT_TRUE(!edit.EncodeTo(&buffer)); +} + TEST(VersionEditTest, ColumnFamilyTest) { VersionEdit edit; edit.SetColumnFamily(2); diff --git a/db/version_set.cc b/db/version_set.cc index b47578a4a..6a68c373e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1854,7 +1854,11 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (s.ok()) { for (auto& e : batch_edits) { std::string record; - e->EncodeTo(&record); + if (!e->EncodeTo(&record)) { + s = Status::Corruption( + "Unable to Encode VersionEdit:" + e->DebugString(true)); + break; + } s = descriptor_log_->AddRecord(record); if (!s.ok()) { break; @@ -1872,19 +1876,24 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } } if (!s.ok()) { - Log(db_options_->info_log, "MANIFEST write: %s\n", - s.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, + "MANIFEST write: %s\n", s.ToString().c_str()); bool all_records_in = true; for (auto& e : batch_edits) { std::string record; - e->EncodeTo(&record); + if (!e->EncodeTo(&record)) { + s = Status::Corruption( + "Unable to Encode VersionEdit:" + e->DebugString(true)); + all_records_in = false; + break; + } if (!ManifestContains(pending_manifest_file_number_, record)) { all_records_in = false; break; } } if (all_records_in) { - Log(db_options_->info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, "MANIFEST contains log record despite error; advancing to new " "version to prevent mismatch between in-memory and logged state" " If paranoid is set, then the db is now in readonly mode."); @@ -2661,7 +2670,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.SetComparatorName( cfd->internal_comparator().user_comparator()->Name()); std::string record; - edit.EncodeTo(&record); + if (!edit.EncodeTo(&record)) { + return Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } Status s = log->AddRecord(record); if (!s.ok()) { return s; @@ -2682,7 +2694,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } edit.SetLogNumber(cfd->GetLogNumber()); std::string record; - edit.EncodeTo(&record); + if (!edit.EncodeTo(&record)) { + return Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } Status s = log->AddRecord(record); if (!s.ok()) { return s; From 60fa7d1365323f32b29b3441b3678dd3c319dafc Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 28 Oct 2014 15:17:02 -0700 Subject: [PATCH 325/829] Improve the robustnesss of PartialCompactionFailure test. Summary: Improve the robustness of PartialCompactionFailure test. Test Plan: ./db_test --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 927d97ed4..3ef636e7c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9072,7 +9072,7 @@ TEST(DBTest, PartialCompactionFailure) { // all flushes succeeed. 
env_->periodic_non_writable_ = kNumInitialNewWritableFiles + kNumLevel0FlushNewWritableFiles + - kNumLevel1NewWritableFiles - 3; + kNumLevel1NewWritableFiles - 2; options.env = env_; DestroyAndReopen(&options); @@ -9080,7 +9080,7 @@ TEST(DBTest, PartialCompactionFailure) { const int kNumKeys = options.level0_file_num_compaction_trigger * (options.max_write_buffer_number - 1) * - kKeysPerBuffer * 1.0; + kKeysPerBuffer * 0.95; Random rnd(301); std::vector keys; From fb3f8ffe5e7c0bdecc40f89cf95cb1b86746b729 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 28 Oct 2014 15:35:10 -0700 Subject: [PATCH 326/829] Improve the robustness of PartialCompactionFailure test again. Summary: Improve the robustness of PartialCompactionFailure test again. Test Plan: ./db_test --- db/db_test.cc | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 3ef636e7c..b5e32d791 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9061,18 +9061,6 @@ TEST(DBTest, PartialCompactionFailure) { options.max_bytes_for_level_multiplier = 2; options.compression = kNoCompression; - // The number of NewWritableFiles calls required by each operation. - const int kNumInitialNewWritableFiles = 4; - const int kNumLevel0FlushNewWritableFiles = - options.level0_file_num_compaction_trigger * 2; - const int kNumLevel1NewWritableFiles = - options.level0_file_num_compaction_trigger + 1; - // This setting will make one of the file-creation fail - // in the first L0 -> L1 compaction while making sure - // all flushes succeeed. - env_->periodic_non_writable_ = - kNumInitialNewWritableFiles + kNumLevel0FlushNewWritableFiles + - kNumLevel1NewWritableFiles - 2; options.env = env_; DestroyAndReopen(&options); @@ -9080,7 +9068,7 @@ TEST(DBTest, PartialCompactionFailure) { const int kNumKeys = options.level0_file_num_compaction_trigger * (options.max_write_buffer_number - 1) * - kKeysPerBuffer * 0.95; + kKeysPerBuffer; Random rnd(301); std::vector keys; @@ -9096,6 +9084,15 @@ TEST(DBTest, PartialCompactionFailure) { ASSERT_GE(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); auto previous_num_level0_files = NumTableFilesAtLevel(0); + + // The number of NewWritableFiles calls required by each operation. + const int kNumLevel1NewWritableFiles = + options.level0_file_num_compaction_trigger + 1; + // This setting will make one of the file-creation fail + // in the first L0 -> L1 compaction while making sure + // all flushes succeeed. + env_->periodic_non_writable_ = kNumLevel1NewWritableFiles - 2; + // Expect compaction to fail here as one file will fail its // creation. dbfull()->TEST_WaitForCompact(); @@ -9109,6 +9106,8 @@ TEST(DBTest, PartialCompactionFailure) { ASSERT_EQ(values[k], Get(keys[k])); } + env_->periodic_non_writable_ = 0; + // Make sure RocksDB will not get into corrupted state. Reopen(&options); From abac3d64760e333365784b1f4a9ab00c2cdd8084 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 28 Oct 2014 17:52:32 -0700 Subject: [PATCH 327/829] TableMock + framework for mock classes Summary: This diff replaces BlockBasedTable in flush_job_test with TableMock, making it depend on less things and making it closer to an unit test than integration test. It also introduces a framework to compile mock classes -- Any file named *mock.cc will not be compiled into the build. It will only get compiled into the tests. What way we can mock out most other classes, Version, VersionSet, DBImpl, etc. 
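For illustration, the pattern a unit test follows with the mock framework is roughly the following (a condensed sketch drawn from the flush_job_test changes below; setup, locking, and error handling omitted):

    // Route table writes/reads through the in-memory mock instead of
    // BlockBasedTable by installing MockTableFactory on the column family.
    std::shared_ptr<MockTableFactory> mock_table_factory(new MockTableFactory());
    cf_options.table_factory = mock_table_factory;

    // While filling the memtable, remember the expected internal-key -> value
    // pairs so the flushed "file" can be checked exactly.
    std::map<std::string, std::string> inserted_keys;
    new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
    InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
    inserted_keys.insert({internal_key.Encode().ToString(), value});

    // Run the flush and verify that exactly one mock table was produced and
    // that its contents match what was inserted.
    ASSERT_OK(flush_job.Run());
    mock_table_factory->AssertSingleFile(inserted_keys);
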
Test Plan: flush_job_test Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27681 --- Makefile | 4 +- build_tools/build_detect_platform | 7 +- db/flush_job_test.cc | 20 ++-- db/table_cache.cc | 1 - table/mock_table.cc | 95 +++++++++++++++++ table/mock_table.h | 171 ++++++++++++++++++++++++++++++ 6 files changed, 286 insertions(+), 12 deletions(-) create mode 100644 table/mock_table.cc create mode 100644 table/mock_table.h diff --git a/Makefile b/Makefile index 12dbba153..62b31b87a 100644 --- a/Makefile +++ b/Makefile @@ -76,11 +76,11 @@ CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverl LDFLAGS += $(PLATFORM_LDFLAGS) LIBOBJECTS = $(SOURCES:.cc=.o) -LIBOBJECTS += $(SOURCESCPP:.cpp=.o) MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) +MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o) TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) +TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) BENCHHARNESS = ./util/benchharness.o VALGRIND_ERROR = 2 VALGRIND_DIR = build_tools/VALGRIND_LOGS diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 92839ad4f..4dc19fd03 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -173,14 +173,15 @@ DIRS="util db table utilities" set -f # temporarily disable globbing so that our patterns arent expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *bench*.cc -prune" -PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` -PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "` +PRUNE_MOCK="-name *mock*.cc -prune" +PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "` +MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock.cc' -print | sort | tr "\n" " "` set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT" -echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT" +echo "MOCK_SOURCES=$MOCK_SOURCES" >> "$OUTPUT" echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 06852eedf..9cfe015e1 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -3,19 +3,22 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include +#include + #include "db/flush_job.h" #include "db/column_family.h" #include "db/version_set.h" #include "rocksdb/cache.h" #include "util/testharness.h" #include "util/testutil.h" +#include "table/mock_table.h" namespace rocksdb { // TODO(icanadi) Mock out everything else: // 1. VersionSet -// 2. TableBuilder -// 3. Memtable +// 2. 
Memtable class FlushJobTest { public: FlushJobTest() @@ -24,14 +27,16 @@ class FlushJobTest { table_cache_(NewLRUCache(50000, 16, 8)), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_controller_)), - shutting_down_(false) { + shutting_down_(false), + mock_table_factory_(new MockTableFactory()) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); // TODO(icanadi) Remove this once we mock out VersionSet NewDB(); std::vector column_families; - column_families.emplace_back(); + cf_options_.table_factory = mock_table_factory_; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); ASSERT_OK(versions_->Recover(column_families, false)); } @@ -69,6 +74,7 @@ class FlushJobTest { port::Mutex mutex_; std::atomic shutting_down_; FileNumToPathIdMap pending_outputs_; + std::shared_ptr mock_table_factory_; }; TEST(FlushJobTest, Empty) { @@ -89,10 +95,13 @@ TEST(FlushJobTest, NonEmpty) { auto new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions()); new_mem->Ref(); + std::map inserted_keys; for (int i = 1; i < 10000; ++i) { std::string key(std::to_string(i)); std::string value("value" + std::to_string(i)); new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + InternalKey internal_key(key, SequenceNumber(i), kTypeValue); + inserted_keys.insert({internal_key.Encode().ToString(), value}); } cfd->imm()->Add(new_mem); @@ -104,8 +113,7 @@ TEST(FlushJobTest, NonEmpty) { mutex_.Lock(); ASSERT_OK(flush_job.Run()); mutex_.Unlock(); - // TODO(icanadi) once you have TableMock, verify that key-values are as - // expected + mock_table_factory_->AssertSingleFile(inserted_keys); } } // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc index 580e8049d..e1b0ca8b9 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -88,7 +88,6 @@ Status TableCache::FindTable(const EnvOptions& env_options, // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { - assert(file.get() == nullptr); *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry); } } diff --git a/table/mock_table.cc b/table/mock_table.cc new file mode 100644 index 000000000..64a00951c --- /dev/null +++ b/table/mock_table.cc @@ -0,0 +1,95 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/table_properties.h" +#include "table/mock_table.h" +#include "table/get_context.h" +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace rocksdb { + +Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) { + return new MockTableIterator(table_); +} + +Status MockTableReader::Get(const ReadOptions&, const Slice& key, + GetContext* get_context) { + std::unique_ptr iter(new MockTableIterator(table_)); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!get_context->SaveValue(parsed_key, iter->value())) { + break; + } + } + return Status::OK(); +} + +std::shared_ptr MockTableReader::GetTableProperties() + const { + return std::shared_ptr(new TableProperties()); +} + +MockTableFactory::MockTableFactory() : next_id_(1) {} + +Status MockTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + uint32_t id = GetIDFromFile(file.get()); + + MutexLock lock_guard(&file_system_.mutex); + + auto it = file_system_.files.find(id); + if (it == file_system_.files.end()) { + return Status::IOError("Mock file not found"); + } + + table_reader->reset(new MockTableReader(it->second)); + + return Status::OK(); +} + +TableBuilder* MockTableFactory::NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + uint32_t id = GetAndWriteNextID(file); + + return new MockTableBuilder(id, &file_system_); +} + +uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { + uint32_t next_id = next_id_.fetch_add(1); + char buf[4]; + EncodeFixed32(buf, next_id); + file->Append(Slice(buf, 4)); + return next_id; +} + +uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const { + char buf[4]; + Slice result; + file->Read(0, 4, &result, buf); + assert(result.size() == 4); + return DecodeFixed32(buf); +} + +void MockTableFactory::AssertSingleFile( + const std::map& file_contents) { + ASSERT_EQ(file_system_.files.size(), 1U); + ASSERT_TRUE(file_contents == file_system_.files.begin()->second); +} + +} // namespace rocksdb diff --git a/table/mock_table.h b/table/mock_table.h new file mode 100644 index 000000000..0ad92cb7c --- /dev/null +++ b/table/mock_table.h @@ -0,0 +1,171 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#pragma once +#include +#include +#include +#include +#include + +#include "rocksdb/table.h" +#include "table/table_reader.h" +#include "table/table_builder.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// NOTE this currently only supports bitwise comparator + +struct MockTableFileSystem { + port::Mutex mutex; + std::map> files; +}; + +class MockTableReader : public TableReader { + public: + MockTableReader(const std::map& table) + : table_(table) {} + + Iterator* NewIterator(const ReadOptions&, Arena* arena) override; + + Status Get(const ReadOptions&, const Slice& key, + GetContext* get_context) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + + virtual size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const std::map& table_; +}; + +class MockTableIterator : public Iterator { + public: + explicit MockTableIterator(const std::map& table) + : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const { return itr_ == table_.end(); } + + void SeekToFirst() { itr_ = table_.begin(); } + + void SeekToLast() { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) { + std::string str_target(target.data(), target.size()); + itr_ = table_.lower_bound(str_target); + } + + void Next() { ++itr_; } + + void Prev() { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const { return Slice(itr_->first); } + + Slice value() const { return Slice(itr_->second); } + + Status status() const { return Status::OK(); } + + private: + const std::map& table_; + std::map::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) + : id_(id), file_system_(file_system) {} + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + table_.insert({key.ToString(), value.ToString()}); + } + + // Return non-ok iff some error has been detected. 
+ Status status() const override { return Status::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + private: + uint32_t id_; + MockTableFileSystem* file_system_; + std::map table_; +}; + +class MockTableFactory : public TableFactory { + public: + MockTableFactory(); + const char* Name() const override { return "MockTable"; } + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const; + + virtual Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + return Status::OK(); + } + + virtual std::string GetPrintableTableOptions() const override { + return std::string(); + } + + // This function will assert that only a single file exists and that the + // contents are equal to file_contents + void AssertSingleFile( + const std::map& file_contents); + + private: + uint32_t GetAndWriteNextID(WritableFile* file) const; + uint32_t GetIDFromFile(RandomAccessFile* file) const; + + mutable MockTableFileSystem file_system_; + mutable std::atomic next_id_; +}; + +} // namespace rocksdb From c082853340b054ad1d9a1abbc06cbc1e4d258525 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 28 Oct 2014 17:55:08 -0700 Subject: [PATCH 328/829] Include all the mocks --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 4dc19fd03..8e92b9b6b 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -175,7 +175,7 @@ PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *bench*.cc -prune" PRUNE_MOCK="-name *mock*.cc -prune" PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "` -MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock.cc' -print | sort | tr "\n" " "` +MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | sort | tr "\n" " "` set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port From 412b7f85bbbfd19ac55f09963fd47a3421dd0b7f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 28 Oct 2014 18:10:55 -0700 Subject: [PATCH 329/829] Include atomic in mock_table.h --- table/mock_table.h | 1 + 1 file changed, 1 insertion(+) diff --git a/table/mock_table.h b/table/mock_table.h index 0ad92cb7c..806ab93d4 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include From 2110e43a5ab0f7ea440570bb6bb8f43bbe04e15c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 11:19:44 -0700 Subject: [PATCH 330/829] Remove an unnecessary include file in version_edit.cc Summary: Remove an unnecessary include file in version_edit.cc --- db/version_edit.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index 
1252759aa..32e49fe95 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -9,7 +9,6 @@ #include "db/version_edit.h" -#include "db/filename.h" #include "db/version_set.h" #include "util/coding.h" #include "rocksdb/slice.h" From c9c935923e7de16ac3c7341d385c743083ee4001 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 11:21:51 -0700 Subject: [PATCH 331/829] Move the check to the beginning of the loop in VersionEdit::EncodeTo() --- db/version_edit.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index 32e49fe95..4a6506c7d 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -98,6 +98,9 @@ bool VersionEdit::EncodeTo(std::string* dst) const { for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; + if (!f.smallest.Valid() || !f.largest.Valid()) { + return false; + } if (f.fd.GetPathId() == 0) { // Use older format to make sure user can roll back the build if they // don't config multiple DB paths. @@ -111,9 +114,6 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, f.fd.GetPathId()); } PutVarint64(dst, f.fd.GetFileSize()); - if (!f.smallest.Valid() || !f.largest.Valid()) { - return false; - } PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64(dst, f.smallest_seqno); From 5a921b895054529670ff5aa874548c6e7b9c4b71 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 11:58:09 -0700 Subject: [PATCH 332/829] DBTest: options clean up - part 1 Summary: DBTest has several functions (Reopen(), TryReopen(), ChangeOptins(), etc that takes a pointer to options), depending on if it is nullptr, it uses different options underneath. This makes it really hard to track what options is used in different test case. We should just kill the default value and make it being passed into explicitly. It is going to be very hairy. I will start with simple ones. Test Plan: make db_test stacked diffs, will run test with full stack Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27687 --- db/db_test.cc | 126 ++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 61 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index b5e32d791..010b04ee4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -406,9 +406,11 @@ class DBTest { DBTest() : option_config_(kDefault), env_(new SpecialEnv(Env::Default())) { dbname_ = test::TmpDir() + "/db_test"; - ASSERT_OK(DestroyDB(dbname_, Options())); + Options options; + options.create_if_missing = true; + ASSERT_OK(DestroyDB(dbname_, options)); db_ = nullptr; - Reopen(); + Reopen(options); } ~DBTest() { @@ -697,8 +699,8 @@ class DBTest { return TryReopenWithColumnFamilies(cfs, v_opts); } - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); + void Reopen(const Options& options) { + ASSERT_OK(TryReopen(&options)); } void Close() { @@ -725,7 +727,7 @@ class DBTest { return DB::OpenForReadOnly(*options, dbname_, &db_); } - Status TryReopen(Options* options = nullptr) { + Status TryReopen(const Options* options = nullptr) { Close(); Options opts; if (options != nullptr) { @@ -1297,7 +1299,7 @@ TEST(DBTest, ReadOnlyDB) { Close(); // Reopen and flush memtable. - Reopen(); + Reopen(options); Flush(); Close(); // Now check keys in read only mode. 
@@ -1315,7 +1317,7 @@ TEST(DBTest, CompactedDB) { options.target_file_size_base = kFileSize; options.max_bytes_for_level_base = 1 << 30; options.compression = kNoCompression; - Reopen(&options); + Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); Flush(); @@ -1333,7 +1335,7 @@ TEST(DBTest, CompactedDB) { "Not implemented: Not supported in compacted db mode."); ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); Close(); - Reopen(&options); + Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); Flush(); @@ -1351,7 +1353,7 @@ TEST(DBTest, CompactedDB) { Close(); // Full compaction - Reopen(&options); + Reopen(options); // Add more keys ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); @@ -1454,7 +1456,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TEST(DBTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); options.max_background_flushes = 0; - Reopen(&options); + Reopen(options); // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { @@ -1464,11 +1466,11 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { } // 1. Read table properties directly from file - Reopen(&options); + Reopen(options); VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 2. Put two tables to table cache and - Reopen(&options); + Reopen(options); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. for (int i = 0; i < 2; ++i) { @@ -1478,7 +1480,7 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 3. Put all tables to table cache - Reopen(&options); + Reopen(options); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. 
for (int i = 0; i < 4; ++i) { @@ -2456,7 +2458,7 @@ TEST(DBTest, IgnoreRecoveredLog) { } // recover the DB - Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); Close(); @@ -2470,12 +2472,12 @@ TEST(DBTest, IgnoreRecoveredLog) { // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, // leading to incorrect results - Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); Close(); Destroy(&options); - Reopen(&options); + Reopen(options); Close(); // copy the logs from backup back to wal dir @@ -2487,7 +2489,7 @@ TEST(DBTest, IgnoreRecoveredLog) { } // assert that we successfully recovered only from logs, even though we // destroyed the DB - Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); @@ -2767,7 +2769,7 @@ TEST(DBTest, GetProperty) { options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; options.write_buffer_size = 1000000; - Reopen(&options); + Reopen(options); std::string big_value(1000000 * 2, 'x'); std::string num; @@ -2841,7 +2843,7 @@ TEST(DBTest, GetProperty) { dbfull()->TEST_WaitForFlushMemTable(); options.max_open_files = 10; - Reopen(&options); + Reopen(options); // After reopening, no table reader is loaded, so no memory for table readers ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); @@ -3034,7 +3036,7 @@ TEST(DBTest, IdentityAcrossRestarts) { ASSERT_OK(db_->GetDbIdentity(id1)); Options options = CurrentOptions(); - Reopen(&options); + Reopen(options); std::string id2; ASSERT_OK(db_->GetDbIdentity(id2)); // id1 should match id2 because identity was not regenerated @@ -3042,7 +3044,7 @@ TEST(DBTest, IdentityAcrossRestarts) { std::string idfilename = IdentityFileName(dbname_); ASSERT_OK(env_->DeleteFile(idfilename)); - Reopen(&options); + Reopen(options); std::string id3; ASSERT_OK(db_->GetDbIdentity(id3)); // id1 should NOT match id3 because identity was regenerated @@ -3221,7 +3223,7 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { // round 2 --- disable auto-compactions and issue deletions. options.create_if_missing = false; options.disable_auto_compactions = true; - Reopen(&options); + Reopen(options); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); @@ -3235,7 +3237,7 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; - Reopen(&options); + Reopen(options); // insert relatively small amount of data to trigger auto compaction. 
for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); @@ -3566,7 +3568,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) { options.compaction_options_universal.size_ratio = 10; options.compaction_options_universal.stop_style = kCompactionStopStyleSimilarSize; options.num_levels=1; - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3768,7 +3770,7 @@ TEST(DBTest, UniversalCompactionCompressRatio1) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = 70; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3833,7 +3835,7 @@ TEST(DBTest, UniversalCompactionCompressRatio2) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = 95; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3880,7 +3882,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); } env_->DeleteDir(options.db_paths[1].path); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3946,7 +3948,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Reopen(&options); + Reopen(options); for (int i = 0; i < key_idx; i++) { auto v = Get(Key(i)); @@ -3976,7 +3978,7 @@ TEST(DBTest, UniversalCompactionFourPaths) { env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); } env_->DeleteDir(options.db_paths[1].path); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -4045,7 +4047,7 @@ TEST(DBTest, UniversalCompactionFourPaths) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Reopen(&options); + Reopen(options); for (int i = 0; i < key_idx; i++) { auto v = Get(Key(i)); @@ -4237,7 +4239,7 @@ TEST(DBTest, MinLevelToCompress1) { if (!MinLevelToCompress(type, options, -14, -1, 0)) { return; } - Reopen(&options); + Reopen(options); MinLevelHelper(this, options); // do not compress L0 and L1 @@ -4257,7 +4259,7 @@ TEST(DBTest, MinLevelToCompress2) { if (!MinLevelToCompress(type, options, 15, -1, 0)) { return; } - Reopen(&options); + Reopen(options); MinLevelHelper(this, options); // do not compress L0 and L1 @@ -4615,7 +4617,7 @@ TEST(DBTest, CompactionFilterDeletesAll) { ASSERT_OK(db_->CompactRange(nullptr, nullptr)); ASSERT_EQ(0, CountLiveFiles()); - Reopen(&options); + Reopen(options); Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); @@ -4684,7 +4686,7 @@ TEST(DBTest, CompactionFilterContextManual) { options.compaction_filter_factory.reset(filter); options.compression = kNoCompression; options.level0_file_num_compaction_trigger = 8; - Reopen(&options); + Reopen(options); int num_keys_per_file = 400; for (int j = 0; j < 3; j++) { // Write several keys. @@ -4866,7 +4868,7 @@ TEST(DBTest, CompactionFilterV2) { // compaction filter buffer using universal compaction option_config_ = kUniversalCompaction; options.compaction_style = (rocksdb::CompactionStyle)1; - Reopen(&options); + Reopen(options); // Write 100K keys, these are written to a few files in L0. const std::string value(10, 'x'); @@ -4955,7 +4957,7 @@ TEST(DBTest, CompactionFilterV2WithValueChange) { option_config_ = kUniversalCompaction; options.compaction_style = (rocksdb::CompactionStyle)1; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); // Write 100K+1 keys, these are written to a few files // in L0. 
We do this so that the current snapshot points @@ -4996,7 +4998,7 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) { // compaction filter buffer using universal compaction option_config_ = kUniversalCompaction; options.compaction_style = (rocksdb::CompactionStyle)1; - Reopen(&options); + Reopen(options); // Write 100K+1 keys, these are written to a few files // in L0. We do this so that the current snapshot points @@ -5796,7 +5798,7 @@ TEST(DBTest, DropWrites) { Options options = CurrentOptions(); options.env = env_; options.paranoid_checks = false; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); @@ -5829,7 +5831,7 @@ TEST(DBTest, DropWritesFlush) { Options options = CurrentOptions(); options.env = env_; options.max_background_flushes = 1; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); // Force out-of-space errors @@ -5868,7 +5870,7 @@ TEST(DBTest, NoSpaceCompactRange) { Options options = CurrentOptions(); options.env = env_; options.disable_auto_compactions = true; - Reopen(&options); + Reopen(options); // generate 5 tables for (int i = 0; i < 5; ++i) { @@ -5891,7 +5893,7 @@ TEST(DBTest, NonWritableFileSystem) { Options options = CurrentOptions(); options.write_buffer_size = 1000; options.env = env_; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); env_->non_writeable_rate_.store(100); std::string big(100000, 'x'); @@ -5944,7 +5946,7 @@ TEST(DBTest, ManifestWriteError) { // Recovery: should not lose data error_type->store(false, std::memory_order_release); - Reopen(&options); + Reopen(options); ASSERT_EQ("bar", Get("foo")); } } @@ -6632,7 +6634,7 @@ TEST(DBTest, WALArchivalTtl) { std::vector log_files = ListLogFiles(env_, dbname_); options.create_if_missing = false; - Reopen(&options); + Reopen(options); std::vector logs = ListLogFiles(env_, archiveDir); std::set archivedFiles(logs.begin(), logs.end()); @@ -6647,7 +6649,7 @@ TEST(DBTest, WALArchivalTtl) { options.WAL_ttl_seconds = 1; env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(&options); + Reopen(options); log_files = ListLogFiles(env_, archiveDir); ASSERT_TRUE(log_files.empty()); @@ -6692,14 +6694,14 @@ TEST(DBTest, WALArchivalSizeLimit) { for (int i = 0; i < 128 * 128; ++i) { ASSERT_OK(Put(Key(i), DummyString(1024))); } - Reopen(&options); + Reopen(options); std::string archive_dir = ArchivalDirectory(dbname_); std::vector log_files = ListLogFiles(env_, archive_dir); ASSERT_TRUE(log_files.size() > 2); options.WAL_size_limit_MB = 8; - Reopen(&options); + Reopen(options); dbfull()->TEST_PurgeObsoleteteWAL(); uint64_t archive_size = GetLogDirSize(archive_dir, env_); @@ -6708,7 +6710,7 @@ TEST(DBTest, WALArchivalSizeLimit) { options.WAL_ttl_seconds = 1; dbfull()->TEST_SetDefaultTimeToCheck(1); env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(&options); + Reopen(options); dbfull()->TEST_PurgeObsoleteteWAL(); log_files = ListLogFiles(env_, archive_dir); @@ -6728,7 +6730,7 @@ TEST(DBTest, PurgeInfoLogs) { options.db_log_dir = ""; } for (int i = 0; i < 8; i++) { - Reopen(&options); + Reopen(options); } std::vector files; @@ -6925,7 +6927,7 @@ TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { Put("key1", DummyString(1024)); Put("key2", DummyString(1023)); dbfull()->Flush(FlushOptions()); - Reopen(&options); + Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -7244,7 +7246,7 @@ TEST(DBTest, GroupCommitTest) { do { Options options = CurrentOptions(); options.statistics = 
rocksdb::CreateDBStatistics(); - Reopen(&options); + Reopen(options); // Start threads GCThread thread[kGCNumThreads]; @@ -7660,7 +7662,9 @@ TEST(DBTest, Randomized) { if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - Reopen(); + + auto options = CurrentOptions(); + Reopen(options); ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); model_snap = model.GetSnapshot(); @@ -8062,7 +8066,7 @@ TEST(DBTest, BlockBasedTablePrefixIndexTest) { options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("k1", "v1")); Flush(); ASSERT_OK(Put("k2", "v2")); @@ -8073,7 +8077,7 @@ TEST(DBTest, BlockBasedTablePrefixIndexTest) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(); - Reopen(&options); + Reopen(options); ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } @@ -8084,21 +8088,21 @@ TEST(DBTest, ChecksumTest) { table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Flush()); // table with crc checksum table_options.checksum = kxxHash; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("e", "f")); ASSERT_OK(Put("g", "h")); ASSERT_OK(Flush()); // table with xxhash checksum table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_EQ("b", Get("a")); ASSERT_EQ("d", Get("c")); ASSERT_EQ("f", Get("e")); @@ -8106,7 +8110,7 @@ TEST(DBTest, ChecksumTest) { table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_EQ("b", Get("a")); ASSERT_EQ("d", Get("c")); ASSERT_EQ("f", Get("e")); @@ -8284,7 +8288,7 @@ TEST(DBTest, Level0StopWritesTest) { options.level0_stop_writes_trigger = 4; options.disable_auto_compactions = 4; options.max_mem_compaction_level = 0; - Reopen(&options); + Reopen(options); // create 4 level0 tables for (int i = 0; i < 4; ++i) { @@ -8562,7 +8566,7 @@ TEST(DBTest, DisableDataSyncTest) { options.disableDataSync = iter == 0; options.create_if_missing = true; options.env = env_; - Reopen(&options); + Reopen(options); CreateAndReopenWithCF({"pikachu"}, &options); MakeTables(10, "a", "z"); @@ -9035,7 +9039,7 @@ TEST(DBTest, FileCreationRandomFailure) { // reopen and reverify we have the latest successful update env_->non_writeable_rate_.store(0); - Reopen(&options); + Reopen(options); for (int k = 0; k < kTestSize; ++k) { auto v = Get(Key(k)); ASSERT_EQ(v, values[k]); @@ -9109,7 +9113,7 @@ TEST(DBTest, PartialCompactionFailure) { env_->periodic_non_writable_ = 0; // Make sure RocksDB will not get into corrupted state. - Reopen(&options); + Reopen(options); // Verify again after reopen. 
for (int k = 0; k < kNumKeys; ++k) { From cdc7230e4ccf2d820e3bf0304c31402591d247d1 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 11:59:18 -0700 Subject: [PATCH 333/829] DBTest: options clean up - part 2 Summary: as title Test Plan: same as part 1 Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27693 --- db/db_test.cc | 165 ++++++++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 010b04ee4..7f46d0993 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -469,10 +469,12 @@ class DBTest { } if (option_config_ >= kEnd) { - Destroy(&last_options_); + Destroy(last_options_); return false; } else { - DestroyAndReopen(); + auto options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); return true; } } @@ -484,8 +486,11 @@ class DBTest { if (prev_options == nullptr) { prev_options = &last_options_; } - Destroy(prev_options); - TryReopen(); + Destroy(*prev_options); + + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(&options); return true; } else { return false; @@ -497,23 +502,20 @@ class DBTest { bool ChangeFilterOptions(Options* prev_options = nullptr) { if (option_config_ == kDefault) { option_config_ = kFilter; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(prev_options); - TryReopen(); - return true; } else if (option_config_ == kFilter) { option_config_ = kFullFilter; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(prev_options); - TryReopen(); - return true; } else { return false; } + if (prev_options == nullptr) { + prev_options = &last_options_; + } + Destroy(*prev_options); + + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(&options); + return true; } // Return the current option configuration. 
@@ -712,15 +714,15 @@ class DBTest { db_ = nullptr; } - void DestroyAndReopen(Options* options = nullptr) { + void DestroyAndReopen(const Options& options) { //Destroy using last options - Destroy(&last_options_); - ASSERT_OK(TryReopen(options)); + Destroy(last_options_); + ASSERT_OK(TryReopen(&options)); } - void Destroy(Options* options) { + void Destroy(const Options& options) { Close(); - ASSERT_OK(DestroyDB(dbname_, *options)); + ASSERT_OK(DestroyDB(dbname_, options)); } Status ReadOnlyReopen(Options* options) { @@ -2141,7 +2143,7 @@ TEST(DBTest, IterReseek) { options.max_sequential_skip_in_iterations = 3; options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); // insert two keys with same userkey and verify that @@ -2389,7 +2391,7 @@ TEST(DBTest, RecoverWithTableHandle) { options.write_buffer_size = 100; options.disable_auto_compactions = true; options = CurrentOptions(options); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); ASSERT_OK(Put(1, "foo", "v1")); @@ -2438,7 +2440,7 @@ TEST(DBTest, IgnoreRecoveredLog) { options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(&options); + DestroyAndReopen(options); // fill up the DB std::string one, two; @@ -2476,7 +2478,7 @@ TEST(DBTest, IgnoreRecoveredLog) { ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); Close(); - Destroy(&options); + Destroy(options); Reopen(options); Close(); @@ -2494,7 +2496,7 @@ TEST(DBTest, IgnoreRecoveredLog) { ASSERT_EQ(one, Get("bar")); // Recovery will fail if DB directory doesn't exist. - Destroy(&options); + Destroy(options); // copy the logs from backup back to wal dir env_->CreateDirIfMissing(options.wal_dir); for (auto& log : logs) { @@ -3170,7 +3172,7 @@ TEST(DBTest, CompactionDeletionTrigger) { for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[2]; - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); const int kTestSize = kCDTKeysPerBuffer * 512; @@ -3205,7 +3207,7 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { Options options = DeletionTriggerOptions(); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); // round 1 --- insert key/value pairs. 
@@ -3752,7 +3754,7 @@ TEST(DBTest, CompressedCache) { } options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); } } @@ -3956,7 +3958,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Destroy(&options); + Destroy(options); } TEST(DBTest, UniversalCompactionFourPaths) { @@ -4055,7 +4057,7 @@ TEST(DBTest, UniversalCompactionFourPaths) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Destroy(&options); + Destroy(options); } #endif @@ -4249,7 +4251,7 @@ TEST(DBTest, MinLevelToCompress1) { for (int i = 2; i < options.num_levels; i++) { options.compression_per_level[i] = type; } - DestroyAndReopen(&options); + DestroyAndReopen(options); MinLevelHelper(this, options); } @@ -4269,7 +4271,7 @@ TEST(DBTest, MinLevelToCompress2) { for (int i = 2; i < options.num_levels; i++) { options.compression_per_level[i] = type; } - DestroyAndReopen(&options); + DestroyAndReopen(options); MinLevelHelper(this, options); } @@ -4535,7 +4537,7 @@ TEST(DBTest, CompactionFilter) { // filter in such a way that it deletes all keys options.compaction_filter_factory = std::make_shared(); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); // write all the keys once again. @@ -4603,7 +4605,7 @@ TEST(DBTest, CompactionFilterDeletesAll) { options.compaction_filter_factory = std::make_shared(); options.disable_auto_compactions = true; options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); // put some data for (int table = 0; table < 4; ++table) { @@ -4915,7 +4917,7 @@ TEST(DBTest, CompactionFilterV2) { options.compaction_filter_factory_v2 = std::make_shared(prefix_extractor.get()); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); // write all the keys once again. 
for (int i = 0; i < 100000; i++) { @@ -5098,8 +5100,9 @@ TEST(DBTest, ApproximateSizes) { Options options; options.write_buffer_size = 100000000; // Large write buffer options.compression = kNoCompression; + options.create_if_missing = true; options = CurrentOptions(options); - DestroyAndReopen(); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); @@ -5575,7 +5578,7 @@ TEST(DBTest, CustomComparator) { new_options.comparator = &cmp; new_options.write_buffer_size = 1000; // Compact more often new_options = CurrentOptions(new_options); - DestroyAndReopen(&new_options); + DestroyAndReopen(new_options); CreateAndReopenWithCF({"pikachu"}, &new_options); ASSERT_OK(Put(1, "[10]", "ten")); ASSERT_OK(Put(1, "[0x14]", "twenty")); @@ -5644,7 +5647,7 @@ TEST(DBTest, ManualCompaction) { options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); } } @@ -5658,8 +5661,8 @@ TEST(DBTest, ManualCompactionOutputPathId) { options.db_paths.emplace_back(dbname_ + "_2", 1000000000); options.compaction_style = kCompactionStyleUniversal; options.level0_file_num_compaction_trigger = 10; - Destroy(&options); - DestroyAndReopen(&options); + Destroy(options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); MakeTables(3, "p", "q", 1); dbfull()->TEST_WaitForCompact(); @@ -5742,7 +5745,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { Options opts; opts.create_if_missing = true; opts.max_background_flushes = 0; - DestroyAndReopen(&opts); + DestroyAndReopen(opts); ASSERT_TRUE(db_ != nullptr); CreateAndReopenWithCF({"pikachu"}, &opts); @@ -5929,7 +5932,7 @@ TEST(DBTest, ManifestWriteError) { options.create_if_missing = true; options.error_if_exists = false; options.max_background_flushes = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); @@ -5962,7 +5965,7 @@ TEST(DBTest, PutFailsParanoid) { options.create_if_missing = true; options.error_if_exists = false; options.paranoid_checks = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); Status s; @@ -5981,7 +5984,7 @@ TEST(DBTest, PutFailsParanoid) { // do the same thing with paranoid checks off options.paranoid_checks = false; - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); ASSERT_OK(Put(1, "foo", "bar")); @@ -6615,7 +6618,7 @@ TEST(DBTest, WALArchivalTtl) { Options options = CurrentOptions(); options.create_if_missing = true; options.WAL_ttl_seconds = 1000; - DestroyAndReopen(&options); + DestroyAndReopen(options); // TEST : Create DB with a ttl and no size limit. // Put some keys. Count the log files present in the DB just after insert. @@ -6690,7 +6693,7 @@ TEST(DBTest, WALArchivalSizeLimit) { // Set ttl and time_to_check_ to small values. Re-open db. // Assert that there are no archived logs left. - DestroyAndReopen(&options); + DestroyAndReopen(options); for (int i = 0; i < 128 * 128; ++i) { ASSERT_OK(Put(Key(i), DummyString(1024))); } @@ -6744,7 +6747,7 @@ TEST(DBTest, PurgeInfoLogs) { } ASSERT_EQ(5, info_log_count); - Destroy(&options); + Destroy(options); // For mode (1), test DestroyDB() to delete all the logs under DB dir. // For mode (2), no info log file should have been put under DB dir. 
std::vector db_files; @@ -6794,7 +6797,7 @@ void ExpectRecords( TEST(DBTest, TransactionLogIterator) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); Put(0, "key1", DummyString(1024)); Put(1, "key2", DummyString(1024)); @@ -6838,7 +6841,7 @@ TEST(DBTest, TransactionLogIteratorRace) { rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); dbfull()->Flush(FlushOptions()); Put("key2", DummyString(1024)); @@ -6876,7 +6879,7 @@ TEST(DBTest, TransactionLogIteratorRace) { TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); // Do a plain Reopen. Put(1, "key1", DummyString(1024)); @@ -6894,7 +6897,7 @@ TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); @@ -6912,7 +6915,7 @@ TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { TEST(DBTest, TransactionLogIteratorJustEmptyFile) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); unique_ptr iter; Status status = dbfull()->GetUpdatesSince(0, &iter); // Check that an empty iterator is returned @@ -6923,7 +6926,7 @@ TEST(DBTest, TransactionLogIteratorJustEmptyFile) { TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); Put("key2", DummyString(1023)); dbfull()->Flush(FlushOptions()); @@ -6936,7 +6939,7 @@ TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { TEST(DBTest, TransactionLogIteratorCorruptedLog) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); for (int i = 0; i < 1024; i++) { Put("key"+std::to_string(i), DummyString(10)); } @@ -6965,7 +6968,7 @@ TEST(DBTest, TransactionLogIteratorCorruptedLog) { TEST(DBTest, TransactionLogIteratorBatchOperations) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); WriteBatch batch; batch.Put(handles_[1], "key1", DummyString(1024)); @@ -6984,7 +6987,7 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { TEST(DBTest, TransactionLogIteratorBlobs) { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); { WriteBatch batch; @@ -7034,7 +7037,7 @@ TEST(DBTest, ReadFirstRecordCache) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); std::string path = dbname_ + "/000001.log"; unique_ptr file; @@ -7727,7 +7730,9 @@ TEST(DBTest, MultiGetEmpty) { ASSERT_EQ(s.size(), 0U); // Empty Database, Empty Key Set - DestroyAndReopen(); + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}); s = 
db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(s.size(), 0U); @@ -7823,7 +7828,7 @@ TEST(DBTest, PrefixScan) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); // 11 RAND I/Os - DestroyAndReopen(&options); + DestroyAndReopen(options); PrefixScanInit(this); count = 0; env_->random_read_counter_.Reset(); @@ -7978,7 +7983,7 @@ TEST(DBTest, TailingIteratorPrefixSeek) { options.disable_auto_compactions = true; options.prefix_extractor.reset(NewFixedPrefixTransform(2)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - DestroyAndReopen(&options); + DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, &options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); @@ -8130,7 +8135,7 @@ TEST(DBTest, FIFOCompactionTest) { if (iter == 1) { options.disable_auto_compactions = true; } - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); for (int i = 0; i < 6; ++i) { @@ -8171,7 +8176,7 @@ TEST(DBTest, SimpleWriteTimeoutTest) { options.max_total_wal_size = std::numeric_limits::max(); WriteOptions write_opt; write_opt.timeout_hint_us = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); // fill the two write buffers ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); @@ -8256,7 +8261,7 @@ TEST(DBTest, MTRandomTimeoutTest) { options.level0_slowdown_writes_trigger = 10; options.level0_stop_writes_trigger = 20; options.write_buffer_size = kWriteBufferSize; - DestroyAndReopen(&options); + DestroyAndReopen(options); TimeoutWriterState thread_states[kNumThreads]; for (int tid = 0; tid < kNumThreads; ++tid) { @@ -8318,7 +8323,7 @@ TEST(DBTest, RateLimitingTest) { options.create_if_missing = true; options.env = env_; options.IncreaseParallelism(4); - DestroyAndReopen(&options); + DestroyAndReopen(options); WriteOptions wo; wo.disableWAL = true; @@ -8339,7 +8344,7 @@ TEST(DBTest, RateLimitingTest) { options.rate_limiter.reset( NewGenericRateLimiter(static_cast(0.7 * raw_rate))); env_->bytes_written_ = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); start = env_->NowMicros(); // Write ~96M data @@ -8359,7 +8364,7 @@ TEST(DBTest, RateLimitingTest) { options.rate_limiter.reset( NewGenericRateLimiter(static_cast(raw_rate / 2))); env_->bytes_written_ = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); start = env_->NowMicros(); // Write ~96M data @@ -8379,12 +8384,12 @@ TEST(DBTest, RateLimitingTest) { TEST(DBTest, TableOptionsSanitizeTest) { Options options = CurrentOptions(); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); - Destroy(&options); + Destroy(options); ASSERT_TRUE(TryReopen(&options).IsNotSupported()); // Test for check of prefix_extractor when hash index is used for @@ -8405,7 +8410,7 @@ TEST(DBTest, DBIteratorBoundTest) { options.create_if_missing = true; options.prefix_extractor = nullptr; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_OK(Put("a", "0")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foo1", "bar1")); @@ -8459,7 +8464,7 @@ TEST(DBTest, DBIteratorBoundTest) { // prefix is the first letter of the key options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_OK(Put("a", "0")); 
ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foo1", "bar1")); @@ -8485,7 +8490,7 @@ TEST(DBTest, DBIteratorBoundTest) { // if the bound has already reached { options.prefix_extractor = nullptr; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_OK(Put("a", "0")); ASSERT_OK(Put("b", "0")); ASSERT_OK(Put("b1", "0")); @@ -8577,7 +8582,7 @@ TEST(DBTest, DisableDataSyncTest) { } else { ASSERT_GT(env_->sync_counter_.load(), 0); } - Destroy(&options); + Destroy(options); } } @@ -8597,7 +8602,7 @@ TEST(DBTest, DynamicMemtableOptions) { options.level0_file_num_compaction_trigger = 1024; options.level0_slowdown_writes_trigger = 1024; options.level0_stop_writes_trigger = 1024; - DestroyAndReopen(&options); + DestroyAndReopen(options); auto gen_l0_kb = [this](int size) { Random rnd(301); @@ -8642,7 +8647,7 @@ TEST(DBTest, DynamicMemtableOptions) { // during compaction but trigger is pretty high options.max_background_flushes = 0; options.disable_auto_compactions = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); // Put until timeout, bounded by 256 puts. We should see timeout at ~128KB int count = 0; @@ -8724,7 +8729,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Block flush thread and disable compaction thread env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); - DestroyAndReopen(&options); + DestroyAndReopen(options); auto gen_l0_kb = [this](int start, int size, int stride) { Random rnd(301); @@ -8952,7 +8957,7 @@ TEST(DBTest, DynamicCompactionOptions) { options.max_background_compactions = 1; options.max_background_flushes = 0; options.max_mem_compaction_level = 2; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0); @@ -8996,7 +9001,7 @@ TEST(DBTest, FileCreationRandomFailure) { options.max_bytes_for_level_base = 1000000; options.max_bytes_for_level_multiplier = 2; - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); const int kTestSize = kCDTKeysPerBuffer * 4096; @@ -9067,7 +9072,7 @@ TEST(DBTest, PartialCompactionFailure) { options.env = env_; - DestroyAndReopen(&options); + DestroyAndReopen(options); const int kNumKeys = options.level0_file_num_compaction_trigger * @@ -9129,7 +9134,7 @@ TEST(DBTest, DynamicMiscOptions) { options.max_sequential_skip_in_iterations = 16; options.compression = kNoCompression; options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(&options); + DestroyAndReopen(options); auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { int key0 = key_start; From 34f3c5a20f5a0f325b91dbb515eddea02725fcc4 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 12:00:01 -0700 Subject: [PATCH 334/829] DBTest: options clean up - part 3 Summary: as title Test Plan: same as part 1 Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27705 --- db/db_test.cc | 239 +++++++++++++++++++++++++------------------------- 1 file changed, 118 insertions(+), 121 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 7f46d0993..9ee935d54 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -490,7 +490,7 @@ class DBTest { auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(&options); + TryReopen(options); return true; } else { return false; @@ -514,7 +514,7 @@ class DBTest { auto options = CurrentOptions(); options.create_if_missing = 
true; - TryReopen(&options); + TryReopen(options); return true; } @@ -648,13 +648,8 @@ class DBTest { } void CreateColumnFamilies(const std::vector& cfs, - const ColumnFamilyOptions* options = nullptr) { - ColumnFamilyOptions cf_opts; - if (options != nullptr) { - cf_opts = ColumnFamilyOptions(*options); - } else { - cf_opts = ColumnFamilyOptions(CurrentOptions()); - } + const Options& options) { + ColumnFamilyOptions cf_opts(options); int cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { @@ -663,11 +658,11 @@ class DBTest { } void CreateAndReopenWithCF(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { CreateColumnFamilies(cfs, options); std::vector cfs_plus_default = cfs; cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); - ReopenWithColumnFamilies(cfs_plus_default, options); + ReopenWithColumnFamilies(cfs_plus_default, &options); } void ReopenWithColumnFamilies(const std::vector& cfs, @@ -702,7 +697,7 @@ class DBTest { } void Reopen(const Options& options) { - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopen(options)); } void Close() { @@ -717,7 +712,7 @@ class DBTest { void DestroyAndReopen(const Options& options) { //Destroy using last options Destroy(last_options_); - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopen(options)); } void Destroy(const Options& options) { @@ -729,8 +724,9 @@ class DBTest { return DB::OpenForReadOnly(*options, dbname_, &db_); } - Status TryReopen(const Options* options = nullptr) { + Status TryReopen(const Options& options) { Close(); + /* Options opts; if (options != nullptr) { opts = *options; @@ -738,8 +734,9 @@ class DBTest { opts = CurrentOptions(); opts.create_if_missing = true; } - last_options_ = opts; - return DB::Open(opts, dbname_, &db_); + */ + last_options_ = options; + return DB::Open(options, dbname_, &db_); } Status Flush(int cf = 0) { @@ -1224,7 +1221,7 @@ TEST(DBTest, Empty) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::string num; ASSERT_TRUE(dbfull()->GetProperty( @@ -1412,7 +1409,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); // Create a new table. 
@@ -1493,7 +1490,7 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { TEST(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const std::string value(1024 * 1024, ' '); int i = 0; @@ -1545,7 +1542,7 @@ TEST(DBTest, Preallocation) { TEST(DBTest, PutDeleteGet) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_OK(Put(1, "foo", "v2")); @@ -1562,7 +1559,7 @@ TEST(DBTest, GetFromImmutableLayer) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); @@ -1580,7 +1577,7 @@ TEST(DBTest, GetFromImmutableLayer) { TEST(DBTest, GetFromVersions) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ("v1", Get(1, "foo")); @@ -1590,7 +1587,7 @@ TEST(DBTest, GetFromVersions) { TEST(DBTest, GetSnapshot) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Try with both a short key and a long key for (int i = 0; i < 2; i++) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); @@ -1610,7 +1607,7 @@ TEST(DBTest, GetSnapshot) { TEST(DBTest, GetLevel0Ordering) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Check that we process level-0 files in correct order. The code // below generates two level-0 files where the earlier one comes // before the later one in the level-0 file list since the earlier @@ -1626,7 +1623,7 @@ TEST(DBTest, GetLevel0Ordering) { TEST(DBTest, GetOrderedByLevels) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); Compact(1, "a", "z"); ASSERT_EQ("v1", Get(1, "foo")); @@ -1639,7 +1636,7 @@ TEST(DBTest, GetOrderedByLevels) { TEST(DBTest, GetPicksCorrectFile) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Arrange to have multiple files in a non-level-0 level. ASSERT_OK(Put(1, "a", "va")); Compact(1, "a", "b"); @@ -1658,7 +1655,7 @@ TEST(DBTest, GetEncountersEmptyLevel) { Options options = CurrentOptions(); options.max_background_flushes = 0; options.disableDataSync = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -1706,7 +1703,7 @@ TEST(DBTest, KeyMayExist) { options_override.filter_policy.reset(NewBloomFilterPolicy(20)); Options options = CurrentOptions(options_override); options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); @@ -1767,7 +1764,7 @@ TEST(DBTest, NonBlockingIteration) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); non_blocking_opts.read_tier = kBlockCacheTier; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // write one kv to the database. 
ASSERT_OK(Put(1, "a", "b")); @@ -1833,7 +1830,7 @@ TEST(DBTest, FilterDeletes) { options_override.filter_policy.reset(NewBloomFilterPolicy(20)); Options options = CurrentOptions(options_override); options.filter_deletes = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; batch.Delete(handles_[1], "a"); @@ -1992,7 +1989,7 @@ TEST(DBTest, IterPrevWithNewerSeq2) { TEST(DBTest, IterEmpty) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); @@ -2010,7 +2007,7 @@ TEST(DBTest, IterEmpty) { TEST(DBTest, IterSingle) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); @@ -2051,7 +2048,7 @@ TEST(DBTest, IterSingle) { TEST(DBTest, IterMulti) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); ASSERT_OK(Put(1, "b", "vb")); ASSERT_OK(Put(1, "c", "vc")); @@ -2144,7 +2141,7 @@ TEST(DBTest, IterReseek) { options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // insert two keys with same userkey and verify that // reseek is not invoked. For each of these test cases, @@ -2223,7 +2220,7 @@ TEST(DBTest, IterReseek) { TEST(DBTest, IterSmallAndLargeMix) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); ASSERT_OK(Put(1, "c", "vc")); @@ -2264,7 +2261,7 @@ TEST(DBTest, IterSmallAndLargeMix) { TEST(DBTest, IterMultiWithDelete) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "ka", "va")); ASSERT_OK(Put(1, "kb", "vb")); ASSERT_OK(Put(1, "kc", "vc")); @@ -2289,7 +2286,7 @@ TEST(DBTest, IterMultiWithDelete) { TEST(DBTest, IterPrevMaxSkip) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); for (int i = 0; i < 2; i++) { ASSERT_OK(Put(1, "key1", "v1")); ASSERT_OK(Put(1, "key2", "v2")); @@ -2319,7 +2316,7 @@ TEST(DBTest, IterPrevMaxSkip) { TEST(DBTest, IterWithSnapshot) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "key1", "val1")); ASSERT_OK(Put(1, "key2", "val2")); ASSERT_OK(Put(1, "key3", "val3")); @@ -2363,7 +2360,7 @@ TEST(DBTest, IterWithSnapshot) { TEST(DBTest, Recover) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); @@ -2392,7 +2389,7 @@ TEST(DBTest, RecoverWithTableHandle) { options.disable_auto_compactions = true; options = CurrentOptions(options); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "bar", "v2")); @@ -2506,14 +2503,14 @@ TEST(DBTest, IgnoreRecoveredLog) { env_->DeleteFile(backup_logs + "/" + log); } } - Status s = TryReopen(&options); + Status s = TryReopen(options); ASSERT_TRUE(!s.ok()); } while (ChangeOptions(kSkipHashCuckoo)); } TEST(DBTest, RollLog) { do { - 
CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); @@ -2530,7 +2527,7 @@ TEST(DBTest, RollLog) { TEST(DBTest, WAL) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); @@ -2566,7 +2563,7 @@ TEST(DBTest, CheckLock) { do { DB* localdb; Options options = CurrentOptions(); - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopen(options)); // second open should fail ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); @@ -2580,7 +2577,7 @@ TEST(DBTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); @@ -2599,7 +2596,7 @@ TEST(DBTest, NumImmutableMemTable) { options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; options.write_buffer_size = 1000000; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::string big_value(1000000 * 2, 'x'); std::string num; @@ -2724,7 +2721,7 @@ TEST(DBTest, FlushEmptyColumnFamily) { writeOpt.disableWAL = true; options.max_write_buffer_number = 2; options.min_write_buffer_number_to_merge = 1; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Compaction can still go through even if no thread can flush the // mem table. 
@@ -2862,7 +2859,7 @@ TEST(DBTest, GetProperty) { TEST(DBTest, FLUSH) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; SetPerfLevel(kEnableTime);; @@ -2907,7 +2904,7 @@ TEST(DBTest, FLUSH) { TEST(DBTest, RecoveryWithEmptyLog) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "foo", "v2")); ReopenWithColumnFamilies({"default", "pikachu"}); @@ -2926,7 +2923,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { options.env = env_; options.write_buffer_size = 1000000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Trigger a long memtable compaction and reopen the database during it ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file @@ -2950,7 +2947,7 @@ TEST(DBTest, FlushSchedule) { options.min_write_buffer_number_to_merge = 1; options.max_write_buffer_number = 2; options.write_buffer_size = 100 * 1000; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::vector threads; std::atomic thread_num(0); @@ -2985,7 +2982,7 @@ TEST(DBTest, MinorCompactionsHappen) { Options options; options.write_buffer_size = 10000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int N = 500; @@ -3013,7 +3010,7 @@ TEST(DBTest, ManifestRollOver) { Options options; options.max_manifest_file_size = 10 ; // 10 bytes options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); { ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); @@ -3058,7 +3055,7 @@ TEST(DBTest, RecoverWithLargeLog) { do { { Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); ASSERT_OK(Put(1, "small3", std::string(10, '3'))); @@ -3085,7 +3082,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { Options options; options.write_buffer_size = 100000000; // Large write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -3115,7 +3112,7 @@ TEST(DBTest, CompactionTrigger) { options.max_mem_compaction_level = 0; options.level0_file_num_compaction_trigger = 3; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -3362,7 +3359,7 @@ TEST(DBTest, UniversalCompactionTrigger) { options.compaction_filter_factory.reset(filter); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3494,7 +3491,7 @@ TEST(DBTest, UniversalCompactionSizeAmplification) { options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Trigger compaction if size amplification exceeds 110% 
options.compaction_options_universal.max_size_amplification_percent = 110; @@ -3536,7 +3533,7 @@ TEST(DBTest, UniversalCompactionOptions) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = -1; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3691,7 +3688,7 @@ TEST(DBTest, CompressedCache) { default: ASSERT_TRUE(false); } - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // default column family doesn't have block cache Options no_block_cache_opts; no_block_cache_opts.statistics = options.statistics; @@ -3864,7 +3861,7 @@ TEST(DBTest, FailMoreDbPaths) { options.db_paths.emplace_back(dbname_ + "_3", 1000000); options.db_paths.emplace_back(dbname_ + "_4", 1000000); options.db_paths.emplace_back(dbname_ + "_5", 1000000); - ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); } TEST(DBTest, UniversalCompactionSecondPathRatio) { @@ -4076,7 +4073,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.target_file_size_base = 200<<10; // 200KB options.target_file_size_multiplier = 1; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= max_key_level_insert; i++) { // each value is 10K @@ -4281,7 +4278,7 @@ TEST(DBTest, RepeatedWritesToSameKey) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // We must have at most one file per level except for level-0, // which may have up to kL0_StopWritesTrigger files. 
@@ -4305,7 +4302,7 @@ TEST(DBTest, InPlaceUpdate) { options.env = env_; options.write_buffer_size = 100000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size int numValues = 10; @@ -4329,7 +4326,7 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) { options.env = env_; options.write_buffer_size = 100000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of larger size int numValues = 10; @@ -4357,7 +4354,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size int numValues = 10; @@ -4386,7 +4383,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerVarintSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller varint size int numValues = 265; @@ -4415,7 +4412,7 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { options.inplace_callback = rocksdb::DBTest::updateInPlaceLargerSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of larger size int numValues = 10; @@ -4442,7 +4439,7 @@ TEST(DBTest, InPlaceUpdateCallbackNoAction) { options.inplace_callback = rocksdb::DBTest::updateInPlaceNoAction; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Callback function requests no actions from db ASSERT_OK(Put(1, "key", DummyString(1, 'a'))); @@ -4458,7 +4455,7 @@ TEST(DBTest, CompactionFilter) { options.max_mem_compaction_level = 0; options.compaction_filter_factory = std::make_shared(); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Write 100K keys, these are written to a few files in L0. const std::string value(10, 'x'); @@ -4538,7 +4535,7 @@ TEST(DBTest, CompactionFilter) { options.compaction_filter_factory = std::make_shared(); options.create_if_missing = true; DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // write all the keys once again. for (int i = 0; i < 100000; i++) { @@ -4637,7 +4634,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.compaction_filter_factory = std::make_shared(); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Write 100K+1 keys, these are written to a few files // in L0. 
We do this so that the current snapshot points @@ -5043,7 +5040,7 @@ TEST(DBTest, SparseMerge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); FillLevels("A", "Z", 1); @@ -5103,7 +5100,7 @@ TEST(DBTest, ApproximateSizes) { options.create_if_missing = true; options = CurrentOptions(options); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); ReopenWithColumnFamilies({"default", "pikachu"}, &options); @@ -5156,7 +5153,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); std::string big1 = RandomString(&rnd, 100000); @@ -5193,7 +5190,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { TEST(DBTest, IteratorPinsRef) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "foo", "hello"); // Get iterator that will yield the current contents of the DB. @@ -5219,7 +5216,7 @@ TEST(DBTest, IteratorPinsRef) { TEST(DBTest, Snapshot) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(0, "foo", "0v1"); Put(1, "foo", "1v1"); const Snapshot* s1 = db_->GetSnapshot(); @@ -5265,7 +5262,7 @@ TEST(DBTest, HiddenValuesAreRemoved) { do { Options options = CurrentOptions(); options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); @@ -5303,7 +5300,7 @@ TEST(DBTest, CompactBetweenSnapshots) { do { Options options = CurrentOptions(); options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); @@ -5358,7 +5355,7 @@ TEST(DBTest, CompactBetweenSnapshots) { TEST(DBTest, DeletionMarkers1) { Options options = CurrentOptions(); options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5395,7 +5392,7 @@ TEST(DBTest, DeletionMarkers1) { TEST(DBTest, DeletionMarkers2) { Options options = CurrentOptions(); options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5426,7 +5423,7 @@ TEST(DBTest, OverlapInLevel0) { do { Options options = CurrentOptions(); options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; @@ -5469,7 +5466,7 @@ TEST(DBTest, OverlapInLevel0) { TEST(DBTest, L0_CompactionBug_Issue44_a) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "b", "v")); ReopenWithColumnFamilies({"default", "pikachu"}); ASSERT_OK(Delete(1, "b")); @@ -5488,7 +5485,7 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) { TEST(DBTest, L0_CompactionBug_Issue44_b) { do { - 
CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "", ""); ReopenWithColumnFamilies({"default", "pikachu"}); Delete(1, "e"); @@ -5531,8 +5528,8 @@ TEST(DBTest, ComparatorCheck) { Options new_options, options; NewComparator cmp; do { - CreateAndReopenWithCF({"pikachu"}); options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); new_options = CurrentOptions(); new_options.comparator = &cmp; // only the non-default column family has non-matching comparator @@ -5579,7 +5576,7 @@ TEST(DBTest, CustomComparator) { new_options.write_buffer_size = 1000; // Compact more often new_options = CurrentOptions(new_options); DestroyAndReopen(new_options); - CreateAndReopenWithCF({"pikachu"}, &new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); ASSERT_OK(Put(1, "[10]", "ten")); ASSERT_OK(Put(1, "[0x14]", "twenty")); for (int i = 0; i < 2; i++) { @@ -5606,7 +5603,7 @@ TEST(DBTest, CustomComparator) { TEST(DBTest, ManualCompaction) { Options options = CurrentOptions(); options.max_background_flushes = 0; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; @@ -5648,7 +5645,7 @@ TEST(DBTest, ManualCompaction) { options.num_levels = 3; options.create_if_missing = true; DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); } } @@ -5663,7 +5660,7 @@ TEST(DBTest, ManualCompactionOutputPathId) { options.level0_file_num_compaction_trigger = 10; Destroy(options); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("3", FilesPerLevel(1)); @@ -5747,7 +5744,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.max_background_flushes = 0; DestroyAndReopen(opts); ASSERT_TRUE(db_ != nullptr); - CreateAndReopenWithCF({"pikachu"}, &opts); + CreateAndReopenWithCF({"pikachu"}, opts); ASSERT_OK(Put(1, "a", "123")); ASSERT_OK(Put(1, "b", "234")); @@ -5966,7 +5963,7 @@ TEST(DBTest, PutFailsParanoid) { options.error_if_exists = false; options.paranoid_checks = true; DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Status s; ASSERT_OK(Put(1, "foo", "bar")); @@ -5985,7 +5982,7 @@ TEST(DBTest, PutFailsParanoid) { // do the same thing with paranoid checks off options.paranoid_checks = false; DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); @@ -6001,7 +5998,7 @@ TEST(DBTest, PutFailsParanoid) { TEST(DBTest, FilesDeletedAfterCompaction) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v2")); Compact(1, "a", "z"); const int num_files = CountLiveFiles(); @@ -6025,7 +6022,7 @@ TEST(DBTest, BloomFilter) { table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Populate multiple layers const int N = 10000; @@ -6069,7 +6066,7 @@ TEST(DBTest, BloomFilterRate) { while (ChangeFilterOptions()) { Options options = CurrentOptions(); options.statistics = 
rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; for (int i = 0; i < maxKey; i++) { @@ -6101,7 +6098,7 @@ TEST(DBTest, BloomFilterCompatibility) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); // Create with block based filter - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; for (int i = 0; i < maxKey; i++) { @@ -6130,7 +6127,7 @@ TEST(DBTest, BloomFilterReverseCompatibility) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); // Create with full filter - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; for (int i = 0; i < maxKey; i++) { @@ -6199,7 +6196,7 @@ TEST(DBTest, BloomFilterWrapper) { table_options.filter_policy.reset(policy); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; for (int i = 0; i < maxKey; i++) { @@ -6229,7 +6226,7 @@ TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -6359,7 +6356,7 @@ TEST(DBTest, CompactOnFlush) { Options options = CurrentOptions(); options.purge_redundant_kvs_while_flush = true; options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); @@ -6471,7 +6468,7 @@ TEST(DBTest, FlushOneColumnFamily) { Options options; CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, - &options); + options); ASSERT_OK(Put(0, "Default", "Default")); ASSERT_OK(Put(1, "pikachu", "pikachu")); @@ -6497,7 +6494,7 @@ TEST(DBTest, FlushOneColumnFamily) { TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { Options options; options.write_buffer_size = 5000000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); // Since we will reopen DB with smaller write_buffer_size, // each key will go to new SST file @@ -6552,7 +6549,7 @@ TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { TEST(DBTest, RecoverCheckFileAmount) { Options options; options.write_buffer_size = 100000; - CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options); + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); ASSERT_OK(Put(0, Key(1), DummyString(1))); ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -6798,7 +6795,7 @@ TEST(DBTest, TransactionLogIterator) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Put(0, "key1", DummyString(1024)); Put(1, "key2", DummyString(1024)); Put(1, "key2", DummyString(1024)); @@ -6880,7 +6877,7 @@ TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Do a plain Reopen. Put(1, "key1", DummyString(1024)); // Two reopens should create a zero record WAL file. 
@@ -6969,7 +6966,7 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; batch.Put(handles_[1], "key1", DummyString(1024)); batch.Put(handles_[0], "key2", DummyString(1024)); @@ -6988,7 +6985,7 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { TEST(DBTest, TransactionLogIteratorBlobs) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; batch.Put(handles_[1], "key1", DummyString(1024)); @@ -7186,7 +7183,7 @@ TEST(DBTest, MultiThreaded) { for (int i = 1; i < kColumnFamilies; ++i) { cfs.push_back(std::to_string(i)); } - CreateAndReopenWithCF(cfs); + CreateAndReopenWithCF(cfs, CurrentOptions()); // Initialize state MTState mt; mt.test = this; @@ -7689,7 +7686,7 @@ TEST(DBTest, Randomized) { TEST(DBTest, MultiGetSimple) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); @@ -7721,7 +7718,7 @@ TEST(DBTest, MultiGetSimple) { TEST(DBTest, MultiGetEmpty) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Empty Key Set std::vector keys; std::vector values; @@ -7733,7 +7730,7 @@ TEST(DBTest, MultiGetEmpty) { Options options = CurrentOptions(); options.create_if_missing = true; DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, options); s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(s.size(), 0U); @@ -7866,7 +7863,7 @@ TEST(DBTest, TailingIteratorSingle) { } TEST(DBTest, TailingIteratorKeepAdding) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7888,7 +7885,7 @@ TEST(DBTest, TailingIteratorKeepAdding) { } TEST(DBTest, TailingIteratorSeekToNext) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7935,7 +7932,7 @@ TEST(DBTest, TailingIteratorSeekToNext) { } TEST(DBTest, TailingIteratorDeletes) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7984,7 +7981,7 @@ TEST(DBTest, TailingIteratorPrefixSeek) { options.prefix_extractor.reset(NewFixedPrefixTransform(2)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); ASSERT_OK(Put(1, "0101", "test")); @@ -8006,7 +8003,7 @@ TEST(DBTest, TailingIteratorPrefixSeek) { } TEST(DBTest, TailingIteratorIncomplete) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; read_options.read_tier = kBlockCacheTier; @@ -8031,7 +8028,7 @@ TEST(DBTest, TailingIteratorSeekToSame) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 1000; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, 
options); ReadOptions read_options; read_options.tailing = true; @@ -8390,7 +8387,7 @@ TEST(DBTest, TableOptionsSanitizeTest) { options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); Destroy(options); - ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); // Test for check of prefix_extractor when hash index is used for // block-based table @@ -8399,9 +8396,9 @@ TEST(DBTest, TableOptionsSanitizeTest) { options = Options(); options.create_if_missing = true; options.table_factory.reset(NewBlockBasedTableFactory(to)); - ASSERT_TRUE(TryReopen(&options).IsInvalidArgument()); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopen(options)); } TEST(DBTest, DBIteratorBoundTest) { @@ -8572,7 +8569,7 @@ TEST(DBTest, DisableDataSyncTest) { options.create_if_missing = true; options.env = env_; Reopen(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); MakeTables(10, "a", "z"); Compact("a", "z"); From e130e88bc63bcf9aa90257d0945ab7ba65b606c3 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 12:00:42 -0700 Subject: [PATCH 335/829] DBTest: options clean up - part 4 Summary: as title Test Plan: as part 1 Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27789 --- db/db_test.cc | 129 +++++++++++++++++++++++++------------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 9ee935d54..da6a8ffc9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -662,37 +662,36 @@ class DBTest { CreateColumnFamilies(cfs, options); std::vector cfs_plus_default = cfs; cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); - ReopenWithColumnFamilies(cfs_plus_default, &options); + ReopenWithColumnFamilies(cfs_plus_default, options); } void ReopenWithColumnFamilies(const std::vector& cfs, - const std::vector& options) { + const std::vector& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } void ReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } Status TryReopenWithColumnFamilies( const std::vector& cfs, - const std::vector& options) { + const std::vector& options) { Close(); ASSERT_EQ(cfs.size(), options.size()); std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); } - DBOptions db_opts = DBOptions(*options[0]); + DBOptions db_opts = DBOptions(options[0]); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); } Status TryReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { Close(); - Options opts = (options == nullptr) ? 
CurrentOptions() : *options; - std::vector v_opts(cfs.size(), &opts); + std::vector v_opts(cfs.size(), options); return TryReopenWithColumnFamilies(cfs, v_opts); } @@ -1500,14 +1499,14 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); - ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options)); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } TEST(DBTest, Preallocation) { @@ -2364,7 +2363,7 @@ TEST(DBTest, Recover) { ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "foo")); @@ -2372,7 +2371,7 @@ TEST(DBTest, Recover) { ASSERT_OK(Put(1, "bar", "v2")); ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v3", Get(1, "foo")); ASSERT_OK(Put(1, "foo", "v4")); ASSERT_EQ("v4", Get(1, "foo")); @@ -2398,7 +2397,7 @@ TEST(DBTest, RecoverWithTableHandle) { ASSERT_OK(Put(1, "bar", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "big", std::string(100, 'a'))); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); std::vector> files; dbfull()->TEST_GetFilesMetaData(handles_[1], &files); @@ -2514,13 +2513,13 @@ TEST(DBTest, RollLog) { ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); for (int i = 0; i < 10; i++) { - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); } ASSERT_OK(Put(1, "foo", "v4")); for (int i = 0; i < 10; i++) { - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); } } while (ChangeOptions()); } @@ -2533,7 +2532,7 @@ TEST(DBTest, WAL) { ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); @@ -2542,7 +2541,7 @@ TEST(DBTest, WAL) { writeOpt.disableWAL = true; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // Both value's should be present. ASSERT_EQ("v2", Get(1, "bar")); ASSERT_EQ("v2", Get(1, "foo")); @@ -2552,7 +2551,7 @@ TEST(DBTest, WAL) { writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // again both values should be present. 
ASSERT_EQ("v3", Get(1, "foo")); ASSERT_EQ("v3", Get(1, "bar")); @@ -2872,7 +2871,7 @@ TEST(DBTest, FLUSH) { Get(1, "foo"); ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); @@ -2881,7 +2880,7 @@ TEST(DBTest, FLUSH) { ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); ASSERT_OK(Flush(1)); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v2", Get(1, "bar")); perf_context.Reset(); ASSERT_EQ("v2", Get(1, "foo")); @@ -2892,7 +2891,7 @@ TEST(DBTest, FLUSH) { ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); ASSERT_OK(Flush(1)); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // 'foo' should be there because its put // has WAL enabled. ASSERT_EQ("v3", Get(1, "foo")); @@ -2907,10 +2906,10 @@ TEST(DBTest, RecoveryWithEmptyLog) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v3", Get(1, "foo")); } while (ChangeOptions()); } @@ -2931,7 +2930,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v2", Get(1, "bar")); ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1")); @@ -2997,7 +2996,7 @@ TEST(DBTest, MinorCompactionsHappen) { ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); } - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = 0; i < N; i++) { ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); @@ -3019,7 +3018,7 @@ TEST(DBTest, ManifestRollOver) { ASSERT_OK(Flush(1)); // This should trigger LogAndApply. uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_GT(manifest_after_flush, manifest_before_flush); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); // check if a new manifest file got inserted or not. 
ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); @@ -3068,7 +3067,7 @@ TEST(DBTest, RecoverWithLargeLog) { Options options; options.write_buffer_size = 100000; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); @@ -3095,7 +3094,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } // Reopening moves updates to level-0 - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -3496,7 +3495,7 @@ TEST(DBTest, UniversalCompactionSizeAmplification) { // Trigger compaction if size amplification exceeds 110% options.compaction_options_universal.max_size_amplification_percent = 110; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3697,7 +3696,7 @@ TEST(DBTest, CompressedCache) { no_block_cache_opts.table_factory.reset( NewBlockBasedTableFactory(table_options_no_bc)); ReopenWithColumnFamilies({"default", "pikachu"}, - {&no_block_cache_opts, &options}); + std::vector({no_block_cache_opts, options})); Random rnd(301); @@ -4093,7 +4092,7 @@ TEST(DBTest, ConvertCompactionStyle) { options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options = CurrentOptions(options); - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(s.IsInvalidArgument()); // Stage 3: compact into a single file and move the file to level 0 @@ -4104,7 +4103,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.max_bytes_for_level_base = INT_MAX; options.max_bytes_for_level_multiplier = 1; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */, 0 /* reduce to level 0 */); @@ -4124,7 +4123,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); @@ -5103,7 +5102,7 @@ TEST(DBTest, ApproximateSizes) { CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); // Write 8MB (80 values, each 100K) @@ -5121,7 +5120,7 @@ TEST(DBTest, ApproximateSizes) { // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { @@ -5168,7 +5167,7 @@ 
TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); @@ -5468,15 +5467,15 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "b", "v")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Delete(1, "b")); ASSERT_OK(Delete(1, "a")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Delete(1, "a")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "v")); - ReopenWithColumnFamilies({"default", "pikachu"}); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(a->v)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(a->v)", Contents(1)); @@ -5487,24 +5486,24 @@ TEST(DBTest, L0_CompactionBug_Issue44_b) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Delete(1, "e"); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "c", "cv"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "d", "dv"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Delete(1, "d"); Delete(1, "b"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(->)(c->cv)", Contents(1)); @@ -5534,7 +5533,7 @@ TEST(DBTest, ComparatorCheck) { new_options.comparator = &cmp; // only the non-default column family has non-matching comparator Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, - {&options, &new_options}); + std::vector({options, new_options})); ASSERT_TRUE(!s.ok()); ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) << s.ToString(); @@ -5673,7 +5672,7 @@ TEST(DBTest, ManualCompactionOutputPathId) { ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, &options); + 
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ("1", FilesPerLevel(1)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -5683,7 +5682,7 @@ TEST(DBTest, ManualCompactionOutputPathId) { ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, &options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ("2", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -5753,7 +5752,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, opts); ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db_ == nullptr); } @@ -6110,7 +6109,7 @@ TEST(DBTest, BloomFilterCompatibility) { // Check db with full filter table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); // Check if they can be found for (int i = 0; i < maxKey; i++) { @@ -6139,7 +6138,7 @@ TEST(DBTest, BloomFilterReverseCompatibility) { // Check db with block_based filter table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); // Check if they can be found for (int i = 0; i < maxKey; i++) { @@ -6524,7 +6523,7 @@ TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { options.write_buffer_size = 10; ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - &options); + options); { // No inserts => default is empty ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), @@ -6592,7 +6591,7 @@ TEST(DBTest, RecoverCheckFileAmount) { } ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, - &options); + options); { std::vector table_files = ListTableFiles(env_, dbname_); // Check, that records for 'default', 'dobrynia' and 'pikachu' from @@ -6804,7 +6803,7 @@ TEST(DBTest, TransactionLogIterator) { auto iter = OpenTransactionLogIter(0); ExpectRecords(3, iter); } - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { Put(0, "key4", DummyString(1024)); @@ -6881,8 +6880,8 @@ TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { // Do a plain Reopen. Put(1, "key1", DummyString(1024)); // Two reopens should create a zero record WAL file. 
- ReopenWithColumnFamilies({"default", "pikachu"}, &options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); Put(1, "key2", DummyString(1024)); @@ -6975,7 +6974,7 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { dbfull()->Write(WriteOptions(), &batch); Flush(1); Flush(0); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); Put(1, "key4", DummyString(1024)); auto iter = OpenTransactionLogIter(3); ExpectRecords(2, iter); @@ -6995,7 +6994,7 @@ TEST(DBTest, TransactionLogIteratorBlobs) { batch.PutLogData(Slice("blob2")); batch.Delete(handles_[0], "key2"); dbfull()->Write(WriteOptions(), &batch); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); } auto res = OpenTransactionLogIter(0)->GetBatch(); From 97451f837ef8f7af3a38dc9fce4b1fdd5e81c054 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 12:02:11 -0700 Subject: [PATCH 336/829] add an env var ROCKSDB_TESTS_FROM to control where to start from a list of tests Summary: Sometimes, I got a test failure. After fixing that, I want to resume db_test from that test. ROCKSDB_TESTS_FROM is for this purpose. Test Plan: as title Reviewers: yhchiang, rven, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27807 --- util/testharness.cc | 25 ++++++++++++++++++------- util/testharness.h | 8 +++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/util/testharness.cc b/util/testharness.cc index 4208d2c46..16773f69f 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -41,18 +41,29 @@ bool RegisterTest(const char* base, const char* name, void (*func)()) { int RunAllTests() { port::InstallStackTraceHandler(); - const char* matcher = getenv("ROCKSDB_TESTS"); + const char* one_matcher = getenv("ROCKSDB_TESTS"); + const char* from_matcher = getenv("ROCKSDB_TESTS_FROM"); int num = 0; + bool tests_on = (one_matcher == nullptr && from_matcher == nullptr); if (tests != nullptr) { for (unsigned int i = 0; i < tests->size(); i++) { const Test& t = (*tests)[i]; - if (matcher != nullptr) { - std::string name = t.base; - name.push_back('.'); - name.append(t.name); - if (strstr(name.c_str(), matcher) == nullptr) { - continue; + if (tests_on == false) { + if (one_matcher != nullptr || from_matcher != nullptr) { + std::string name = t.base; + name.push_back('.'); + name.append(t.name); + if (from_matcher != nullptr && + strstr(name.c_str(), from_matcher) != nullptr) { + tests_on = true; + } + if (!tests_on) { + if (one_matcher == nullptr || + strstr(name.c_str(), one_matcher) == nullptr) { + continue; + } + } } } fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); diff --git a/util/testharness.h b/util/testharness.h index 52c29848d..af4b2858c 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -21,9 +21,11 @@ namespace rocksdb { namespace test { // Run some of the tests registered by the TEST() macro. If the -// environment variable "ROCKSDB_TESTS" is not set, runs all tests. -// Otherwise, runs only the tests whose name contains the value of -// "ROCKSDB_TESTS" as a substring. E.g., suppose the tests are: +// environment variable "ROCKSDB_TESTS" and "ROCKSDB_TESTS_FROM" +// are not set, runs all tests. 
Otherwise, run all tests after +// ROCKSDB_TESTS_FROM and those specified by ROCKSDB_TESTS. +// Partial name match also works for ROCKSDB_TESTS and +// ROCKSDB_TESTS_FROM. E.g., suppose the tests are: // TEST(Foo, Hello) { ... } // TEST(Foo, World) { ... } // ROCKSDB_TESTS=Hello will run the first test From 44f0ff31c21164685a6cd25a2beb944767c39e46 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 12:24:49 -0700 Subject: [PATCH 337/829] use fallocate(FALLOC_FL_PUNCH_HOLE) to release unused blocks at the end of file Summary: ftruncate does not always free preallocated unused space at the end of file. In some cases, we pin too much disk space than it should Test Plan: env_test Reviewers: sdong, rven, yhchiang, igor Reviewed By: igor Subscribers: nkg-, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D25641 --- util/env_posix.cc | 24 +++++++++++++++++++----- util/env_test.cc | 4 ++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 76ba4a6bd..84c9e558e 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -737,14 +737,28 @@ class PosixWritableFile : public WritableFile { GetPreallocationStatus(&block_size, &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file - int dummy __attribute__((unused)); - dummy = ftruncate(fd_, filesize_); // ignore errors + // NOTE(ljin): we probably don't want to surface failure as an IOError, + // but it will be nice to log these errors. + ftruncate(fd_, filesize_); +#ifdef ROCKSDB_FALLOCATE_PRESENT + // in some file systems, ftruncate only trims trailing space if the + // new file size is smaller than the current size. Calling fallocate + // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused + // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following + // filesystems: + // XFS (since Linux 2.6.38) + // ext4 (since Linux 3.0) + // Btrfs (since Linux 3.7) + // tmpfs (since Linux 3.5) + // We ignore error since failure of this operation does not affect + // correctness. + fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + filesize_, block_size * last_allocated_block - filesize_); +#endif } if (close(fd_) < 0) { - if (s.ok()) { - s = IOError(filename_, errno); - } + s = IOError(filename_, errno); } fd_ = -1; return s; diff --git a/util/env_test.cc b/util/env_test.cc index 3d7a9a4db..48e7d353d 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -518,7 +518,7 @@ TEST(EnvPosixTest, AllocateTest) { // allocate 100 MB size_t kPreallocateSize = 100 * 1024 * 1024; size_t kBlockSize = 512; - std::string data = "test"; + std::string data(1024 * 1024, 'a'); wfile->SetPreallocationBlockSize(kPreallocateSize); ASSERT_OK(wfile->Append(Slice(data))); ASSERT_OK(wfile->Flush()); @@ -540,7 +540,7 @@ TEST(EnvPosixTest, AllocateTest) { stat(fname.c_str(), &f_stat); ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); // verify that preallocated blocks were deallocated on file close - ASSERT_GT(st_blocks, f_stat.st_blocks); + ASSERT_EQ((f_stat.st_size + kBlockSize - 1) / kBlockSize, f_stat.st_blocks); } #endif // ROCKSDB_FALLOCATE_PRESENT From 76d54530d3d2ab1764ea725358d1d50fb90021da Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 13:36:18 -0700 Subject: [PATCH 338/829] minor - remove default value for ChangeFilterOptions() and ChangeCompactionOptions() Summary: So now all open() in db_test should get options from callsite. 
And destroy() always uses the last used options saved on open() I will start to integrate env_mem in the next diff Test Plan: make all check -j32 Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27819 --- db/db_test.cc | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index da6a8ffc9..ef5067f64 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -480,14 +480,10 @@ class DBTest { } // Switch between different compaction styles (we have only 2 now). - bool ChangeCompactOptions(Options* prev_options = nullptr) { + bool ChangeCompactOptions() { if (option_config_ == kDefault) { option_config_ = kUniversalCompaction; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(*prev_options); - + Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; TryReopen(options); @@ -499,7 +495,7 @@ class DBTest { // Switch between different filter policy // Jump from kDefault to kFilter to kFullFilter - bool ChangeFilterOptions(Options* prev_options = nullptr) { + bool ChangeFilterOptions() { if (option_config_ == kDefault) { option_config_ = kFilter; } else if (option_config_ == kFilter) { @@ -507,10 +503,7 @@ class DBTest { } else { return false; } - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(*prev_options); + Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; @@ -5537,7 +5530,7 @@ TEST(DBTest, ComparatorCheck) { ASSERT_TRUE(!s.ok()); ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) << s.ToString(); - } while (ChangeCompactOptions(&new_options)); + } while (ChangeCompactOptions()); } TEST(DBTest, CustomComparator) { @@ -5596,7 +5589,7 @@ TEST(DBTest, CustomComparator) { } Compact(1, "[0]", "[1000000]"); } - } while (ChangeCompactOptions(&new_options)); + } while (ChangeCompactOptions()); } TEST(DBTest, ManualCompaction) { From 9ab0132360fbf68eb0561f7525e726d4d3a4c0f7 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 28 Oct 2014 15:31:38 -0700 Subject: [PATCH 339/829] tmp Summary: Test Plan: Reviewers: CC: Task ID: # Blame Rev: --- include/rocksdb/env.h | 1 + util/env_mem.cc | 367 ++++++++++++++++++++++++++++++++++++++++++ util/env_mem_test.cc | 231 ++++++++++++++++++++++++++ 3 files changed, 599 insertions(+) create mode 100644 util/env_mem.cc create mode 100644 util/env_mem_test.cc diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 70244bb31..b0cd40ddd 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -798,6 +798,7 @@ class EnvWrapper : public Env { // when it is no longer needed. // *base_env must remain live while the result is in use. Env* NewMemEnv(Env* base_env); +Env* NewTestMemEnv(Env* base_env); } // namespace rocksdb diff --git a/util/env_mem.cc b/util/env_mem.cc new file mode 100644 index 000000000..43337da7e --- /dev/null +++ b/util/env_mem.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include +#include +#include +#include + +namespace rocksdb { + +namespace { + +class MemFile { + public: + enum Mode { + READ = 0, + WRITE = 1, + }; + + MemFile(Mode mode) : mode_(mode), refs_(0) {} + + void Ref() { + MutexLock lock(&mutex_); + ++refs_; + } + + void Unref() { + bool do_delete = false; + { + MutexLock lock(&mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + void SetMode(Mode mode) { + mode_ = mode; + } + + uint64_t Size() const { return data_.size(); } + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + assert(mode_ == READ); + if (offset > Size()) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = Size() - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + if (scratch) { + memcpy(scratch, &(data_[offset]), n); + *result = Slice(scratch, n); + } else { + *result = Slice(&(data_[offset]), n); + } + return Status::OK(); + } + + Status Append(const Slice& data) { + assert(mode_ == WRITE); + data_.append(data.data(), data.size()); + return Status::OK(); + } + + Status Fsync() { + return Status::OK(); + } + + private: + // Private since only Unref() should be used to delete it. + ~MemFile() { + assert(refs_ == 0); + } + + // No copying allowed. + MemFile(const MemFile&); + void operator=(const MemFile&); + + Mode mode_; + port::Mutex mutex_; + int refs_; // Protected by mutex_; + + std::string data_; +}; + +class SequentialFileImpl : public SequentialFile { + public: + explicit SequentialFileImpl(MemFile* file) : file_(file), pos_(0) { + file_->Ref(); + } + + ~SequentialFileImpl() { + file_->Unref(); + } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(pos_, n, result, scratch); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (pos_ > file_->Size()) { + return Status::IOError("pos_ > file_->Size()"); + } + const size_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += n; + return Status::OK(); + } + + private: + MemFile* file_; + size_t pos_; +}; + +class RandomAccessFileImpl : public RandomAccessFile { + public: + explicit RandomAccessFileImpl(MemFile* file) : file_(file) { + file_->Ref(); + } + + ~RandomAccessFileImpl() { + file_->Unref(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return file_->Read(offset, n, result, scratch); + } + + private: + MemFile* file_; +}; + +class WritableFileImpl : public WritableFile { + public: + WritableFileImpl(MemFile* file) : file_(file) { + file_->Ref(); + } + + ~WritableFileImpl() { + file_->Unref(); + } + + virtual Status Append(const Slice& data) { + return file_->Append(data); + } + + virtual Status Close() { + return Status::OK(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + return file_->Fsync(); + } + + private: + MemFile* file_; +}; + +class TestMemDirectory : public Directory { + public: + virtual Status Fsync() { return Status::OK(); } +}; + +class TestMemEnv : public EnvWrapper { + public: + explicit TestMemEnv(Env* base_env) : EnvWrapper(base_env) { } + + virtual ~TestMemEnv() { + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ + 
i->second->Unref(); + } + } + + // Partial implementation of the Env interface. + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + *result = NULL; + return Status::IOError(fname, "File not found"); + } + auto* f = file_map_[fname]; + f->SetMode(MemFile::READ); + result->reset(new SequentialFileImpl(f)); + return Status::OK(); + } + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + *result = NULL; + return Status::IOError(fname, "File not found"); + } + auto* f = file_map_[fname]; + f->SetMode(MemFile::READ); + result->reset(new RandomAccessFileImpl(f)); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) != file_map_.end()) { + DeleteFileInternal(fname); + } + MemFile* file = new MemFile(MemFile::WRITE); + file->Ref(); + file_map_[fname] = file; + + result->reset(new WritableFileImpl(file)); + return Status::OK(); + } + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::OK(); + } + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result) { + result->reset(new TestMemDirectory()); + return Status::OK(); + } + + virtual bool FileExists(const std::string& fname) { + MutexLock lock(&mutex_); + return file_map_.find(fname) != file_map_.end(); + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) { + MutexLock lock(&mutex_); + result->clear(); + + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ + const std::string& filename = i->first; + + if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' && + Slice(filename).starts_with(Slice(dir))) { + result->push_back(filename.substr(dir.size() + 1)); + } + } + + return Status::OK(); + } + + void DeleteFileInternal(const std::string& fname) { + if (file_map_.find(fname) == file_map_.end()) { + return; + } + + file_map_[fname]->Unref(); + file_map_.erase(fname); + } + + virtual Status DeleteFile(const std::string& fname) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + DeleteFileInternal(fname); + return Status::OK(); + } + + virtual Status CreateDir(const std::string& dirname) { + return Status::OK(); + } + + virtual Status CreateDirIfMissing(const std::string& dirname) { + return Status::OK(); + } + + virtual Status DeleteDir(const std::string& dirname) { + return Status::OK(); + } + + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) { + MutexLock lock(&mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + *file_size = file_map_[fname]->Size(); + return Status::OK(); + } + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time) { + return Status::NotSupported("getFileMTime", "Not supported in MemEnv"); + } + + virtual Status RenameFile(const std::string& src, + const std::string& target) { + MutexLock lock(&mutex_); + if (file_map_.find(src) == file_map_.end()) { + return Status::IOError(src, "File not found"); + } + + DeleteFileInternal(target); + 
file_map_[target] = file_map_[src]; + file_map_.erase(src); + return Status::OK(); + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) { + *lock = new FileLock; + return Status::OK(); + } + + virtual Status UnlockFile(FileLock* lock) { + delete lock; + return Status::OK(); + } + + virtual Status GetTestDirectory(std::string* path) { + *path = "/test"; + return Status::OK(); + } + + private: + // Map from filenames to MemFile objects, representing a simple file system. + typedef std::map FileSystem; + port::Mutex mutex_; + FileSystem file_map_; // Protected by mutex_. +}; + +} // namespace + +Env* NewTestMemEnv(Env* base_env) { + return new TestMemEnv(base_env); +} + +} // namespace rocksdb diff --git a/util/env_mem_test.cc b/util/env_mem_test.cc new file mode 100644 index 000000000..ea3ed61a0 --- /dev/null +++ b/util/env_mem_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/testharness.h" +#include +#include +#include + +namespace rocksdb { + +class MemEnvTest { + public: + Env* env_; + const EnvOptions soptions_; + + MemEnvTest() + : env_(NewMemEnv(Env::Default())) { + } + ~MemEnvTest() { + delete env_; + } +}; + +TEST(MemEnvTest, Basics) { + uint64_t file_size; + unique_ptr writable_file; + std::vector children; + + ASSERT_OK(env_->CreateDir("/dir")); + + // Check that the directory is empty. + ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + writable_file.reset(); + + // Check that the file exists. + ASSERT_TRUE(env_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/f")); + ASSERT_TRUE(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that opening non-existent file fails. + unique_ptr seq_file; + unique_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file, + soptions_).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file, + soptions_).ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. 
+ ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); +} + +TEST(MemEnvTest, ReadWrite) { + unique_ptr writable_file; + unique_ptr seq_file; + unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST(MemEnvTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. + ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST(MemEnvTest, Misc) { + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. + ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST(MemEnvTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
+ ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete [] scratch; +} + +TEST(MemEnvTest, DBTest) { + Options options; + options.create_if_missing = true; + options.env = env_; + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} From 2d4fe048f4b917ed47410698272ff51e6ff7f5f6 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 13:38:06 -0700 Subject: [PATCH 340/829] remove dead code Summary: as title Test Plan: make db_test --- db/db_test.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index ef5067f64..b79758b0d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -718,15 +718,6 @@ class DBTest { Status TryReopen(const Options& options) { Close(); - /* - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - */ last_options_ = options; return DB::Open(options, dbname_, &db_); } From 7b3a618f94b55c6ddf03c7d21a888fb1743f6b39 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 14:06:14 -0700 Subject: [PATCH 341/829] Apply InfoLogLevel to the logs in db/db_filesnapshot.cc Summary: Apply InfoLogLevel to the logs in db/db_filesnapshot.cc Test Plan: make Reviewers: ljin, sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27813 --- db/db_filesnapshot.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 89fe9c983..eeee99c1b 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -32,9 +32,10 @@ Status DBImpl::DisableFileDeletions() { MutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(db_options_.info_log, "File Deletions Disabled"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File Deletions Disabled"); } else { - Log(db_options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "File Deletions Disabled, but already disabled. 
Counter: %d", disable_delete_obsolete_files_); } @@ -53,11 +54,12 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(db_options_.info_log, "File Deletions Enabled"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File Deletions Enabled"); should_purge_files = true; FindObsoleteFiles(&job_context, true); } else { - Log(db_options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", disable_delete_obsolete_files_); } @@ -98,8 +100,8 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(db_options_.info_log, "Cannot Flush data %s\n", - status.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Cannot Flush data %s\n", status.ToString().c_str()); return status; } } @@ -160,8 +162,8 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(db_options_.info_log, "Latest Archived log: %" PRIu64, - latest_archived_log_number); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Latest Archived log: %" PRIu64, latest_archived_log_number); } files.reserve(files.size() + logs.size()); @@ -173,8 +175,8 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. - Log(db_options_.info_log, "%s already moved to archive", - log->PathName().c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "%s already moved to archive", log->PathName().c_str()); } } From cda9943f9ff5e3d07c28cbc2576fffcc68078d03 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 15:11:12 -0700 Subject: [PATCH 342/829] Apply InfoLogLevel to the logs in db/compaction_picker.cc Summary: Apply InfoLogLevel to the logs in db/compaction_picker.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27837 --- db/compaction_picker.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 6377ebc64..974400fd9 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -160,7 +160,7 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. 
int parent_index = -1; if (c->inputs_[0].empty()) { - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] ExpandWhileOverlapping() failure because zero input files", c->column_family_data()->GetName().c_str()); } @@ -170,6 +170,12 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); + if (!c->inputs_[0].empty()) { + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] ExpandWhileOverlapping() failure because some of the necessary" + " compaction input files are currently being compacted.", + c->column_family_data()->GetName().c_str()); + } return false; } return true; @@ -252,7 +258,7 @@ void CompactionPicker::SetupOtherInputs( &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { - Log(ioptions_.info_log, + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64 " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n", c->column_family_data()->GetName().c_str(), level, @@ -329,8 +335,10 @@ Compaction* CompactionPicker::CompactRange( c->inputs_[0].files = inputs; if (ExpandWhileOverlapping(c) == false) { delete c; - Log(ioptions_.info_log, - "[%s] Could not compact due to expansion failure.\n", + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Unable to perform CompactRange compact due to expansion" + " failure. Possible causes include some of the necessary " + " compaction input files are currently being compacted.\n", version->cfd()->GetName().c_str()); return nullptr; } From 34d436b7dbac714c69a2ff2b9b3884544ebd6977 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 15:11:32 -0700 Subject: [PATCH 343/829] Apply InfoLogLevel to the logs in db/column_family.cc Summary: Apply InfoLogLevel to the logs in db/column_family.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27843 --- db/column_family.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 0127d10ad..b64c24ffe 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -260,7 +260,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, new FIFOCompactionPicker(ioptions_, &internal_comparator_)); } - Log(ioptions_.info_log, "Options for column family \"%s\":\n", + Log(InfoLogLevel::INFO_LEVEL, + ioptions_.info_log, "Options for column family \"%s\":\n", name.c_str()); const ColumnFamilyOptions* cf_options = &options_; cf_options->Dump(ioptions_.info_log); @@ -331,7 +332,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( if (imm()->size() >= mutable_cf_options.max_write_buffer_number) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->size(), @@ -340,7 +341,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stopping writes because we have %d 
level-0 files", name_.c_str(), current_->NumLevelFiles(0)); } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && @@ -352,7 +353,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.level0_stop_writes_trigger); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 "us)", name_.c_str(), current_->NumLevelFiles(0), slowdown); @@ -363,7 +364,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( write_controller->GetDelayToken(kHardLimitSlowdown); internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, false); - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stalling writes because we hit hard limit on level %d. " "(%" PRIu64 "us)", name_.c_str(), max_level, kHardLimitSlowdown); @@ -374,7 +375,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.hard_rate_limit); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); - Log(ioptions_.info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 "us)", name_.c_str(), max_level, slowdown); From c4b468000b14fc0285f778408c92395aa883be4b Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 15:12:10 -0700 Subject: [PATCH 344/829] Apply InfoLogLevel to the logs in db/flush_job.cc Summary: Apply InfoLogLevel to the logs in db/flush_job.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27849 --- db/flush_job.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/flush_job.cc b/db/flush_job.cc index ff35e9a9a..c4eb12d3c 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -145,7 +145,7 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, ro.total_order_seek = true; Arena arena; for (MemTable* m : mems) { - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), m->GetNextLogNumber()); memtables.push_back(m->NewIterator(ro, &arena)); @@ -154,7 +154,7 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, ScopedArenaIterator iter(NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], memtables.size(), &arena)); - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", cfd_->GetName().c_str(), meta.fd.GetNumber()); @@ -165,7 +165,7 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, cfd_->ioptions()->compression_opts, Env::IO_HIGH); LogFlush(db_options_.info_log); } - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); From 082e49ba82e5c84775a508225bf6f0a46a187f81 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 15:12:50 -0700 Subject: [PATCH 345/829] Apply InfoLogLevel to the logs in db/repair.cc Summary: Apply InfoLogLevel to the logs in db/repair.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: 
igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27855 --- db/repair.cc | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/db/repair.cc b/db/repair.cc index 10628c544..f23e757b0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -94,7 +94,7 @@ class Repairer { for (size_t i = 0; i < tables_.size(); i++) { bytes += tables_[i].meta.fd.GetFileSize(); } - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "**** Repaired rocksdb %s; " "recovered %zu files; %" PRIu64 "bytes. " @@ -175,7 +175,7 @@ class Repairer { std::string logname = LogFileName(dbname_, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i], status.ToString().c_str()); } @@ -190,7 +190,8 @@ class Repairer { uint64_t lognum; virtual void Corruption(size_t bytes, const Status& s) { // We print error messages for corruption, but continue repairing. - Log(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", lognum, + Log(InfoLogLevel::ERROR_LEVEL, info_log, + "Log #%" PRIu64 ": dropping %d bytes; %s", lognum, static_cast(bytes), s.ToString().c_str()); } }; @@ -235,7 +236,8 @@ class Repairer { if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { - Log(options_.info_log, "Log #%" PRIu64 ": ignoring %s", log, + Log(InfoLogLevel::WARN_LEVEL, + options_.info_log, "Log #%" PRIu64 ": ignoring %s", log, status.ToString().c_str()); status = Status::OK(); // Keep going with rest of file } @@ -262,9 +264,9 @@ class Repairer { table_fds_.push_back(meta.fd); } } - Log(options_.info_log, - "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, - meta.fd.GetNumber(), status.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, options_.info_log, + "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", + log, counter, meta.fd.GetNumber(), status.ToString().c_str()); return status; } @@ -279,7 +281,8 @@ class Repairer { char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); - Log(options_.info_log, "Table #%s: ignoring %s", file_num_buf, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, + "Table #%s: ignoring %s", file_num_buf, status.ToString().c_str()); ArchiveFile(fname); } else { @@ -306,7 +309,8 @@ class Repairer { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { - Log(options_.info_log, "Table #%" PRIu64 ": unparsable key %s", + Log(InfoLogLevel::ERROR_LEVEL, + options_.info_log, "Table #%" PRIu64 ": unparsable key %s", t->meta.fd.GetNumber(), EscapeString(key).c_str()); continue; } @@ -329,7 +333,8 @@ class Repairer { } delete iter; } - Log(options_.info_log, "Table #%" PRIu64 ": %d entries %s", + Log(InfoLogLevel::INFO_LEVEL, + options_.info_log, "Table #%" PRIu64 ": %d entries %s", t->meta.fd.GetNumber(), counter, status.ToString().c_str()); return status; } @@ -406,7 +411,8 @@ class Repairer { new_file.append("/"); new_file.append((slash == nullptr) ? 
fname.c_str() : slash + 1); Status s = env_->RenameFile(fname, new_file); - Log(options_.info_log, "Archiving %s: %s\n", + Log(InfoLogLevel::INFO_LEVEL, + options_.info_log, "Archiving %s: %s\n", fname.c_str(), s.ToString().c_str()); } }; From 01e6f850986f5d4a88586a84f761d8af36ecea62 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 15:13:31 -0700 Subject: [PATCH 346/829] Apply InfoLogLevel to the logs in db/transaction_log_impl.h Summary: Apply InfoLogLevel to the logs in db/transaction_log_impl.h Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27867 --- db/transaction_log_impl.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 319b01cb1..1c7ab78d9 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -21,10 +21,12 @@ struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, info_log, + "dropping %zu bytes; %s", bytes, s.ToString().c_str()); } virtual void Info(const char* s) { - Log(info_log, "%s", s); + Log(InfoLogLevel::INFO_LEVEL, + info_log, "%s", s); } }; From 76d1c28e82d5f1407a45e641692420c8e8217410 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 27 Oct 2014 15:49:46 -0700 Subject: [PATCH 347/829] Make CompactionPicker more easily tested Summary: Make the compaction picker easier to test. The basic idea is to split a minimal subcomponent of Version out into VersionStorageInfo, which is responsible only for the LSM tree structure. A stub VersionStorageInfo can then be easily created and passed into the compaction picker so that we can check its outputs. It now passes most tests. Two things still need to be done: (1) deal with the FIFO compaction's file size; (2) write an example test to make sure the interface can do the job. Add a compaction_picker_test to make sure the compaction picker code can be easily unit tested.
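To make the intent concrete, here is a minimal sketch of the testing pattern this change enables. The names below (StorageInfo, PickCompactionLevel) are made up for illustration and are not the real VersionStorageInfo or CompactionPicker interfaces; the point is only that once the picker reads LSM-tree state from a plain data object, a unit test can fabricate that state directly and assert on the picker's decision.

#include <cassert>
#include <cstdint>
#include <vector>

// A plain data snapshot of the LSM tree shape. Unlike a full Version, it has
// no references to column families, memtables, or the VersionSet, so a unit
// test can construct any state it wants. (Hypothetical stand-in for
// VersionStorageInfo.)
struct StorageInfo {
  std::vector<uint64_t> bytes_per_level;  // total file size on each level
};

// A toy "picker" that depends only on the snapshot: it returns the level whose
// size most exceeds its budget, or -1 if no level is over budget.
int PickCompactionLevel(const StorageInfo& info,
                        const std::vector<uint64_t>& level_budget) {
  int best_level = -1;
  double best_score = 1.0;  // only levels over budget are candidates
  for (size_t level = 0; level < info.bytes_per_level.size(); ++level) {
    double score = static_cast<double>(info.bytes_per_level[level]) /
                   static_cast<double>(level_budget[level]);
    if (score > best_score) {
      best_score = score;
      best_level = static_cast<int>(level);
    }
  }
  return best_level;
}

int main() {
  // Fabricate a state where level 1 holds twice its budget while level 2 is
  // within budget; the expected decision is easy to reason about.
  StorageInfo info;
  info.bytes_per_level = {0, 200, 900};
  std::vector<uint64_t> budget = {4, 100, 1000};
  assert(PickCompactionLevel(info, budget) == 1);
  return 0;
}

The actual unit test added by this patch lives in db/compaction_picker_test.cc; the Makefile hunk below shows how it is wired into the test targets.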
Test Plan: Pass all unit tests and compaction_picker_test Reviewers: yhchiang, rven, igor, ljin Reviewed By: ljin Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27639 --- Makefile | 4 + db/column_family.cc | 38 +- db/compaction.cc | 38 +- db/compaction.h | 8 +- db/compaction_picker.cc | 255 ++++++------- db/compaction_picker.h | 65 ++-- db/compaction_picker_test.cc | 149 ++++++++ db/db_impl.cc | 85 +++-- db/db_impl_debug.cc | 8 +- db/flush_job.cc | 4 +- db/forward_iterator.cc | 22 +- db/internal_stats.cc | 38 +- db/version_set.cc | 387 ++++++++++--------- db/version_set.h | 388 ++++++++++++-------- util/ldb_cmd.cc | 2 +- utilities/compacted_db/compacted_db_impl.cc | 15 +- 16 files changed, 896 insertions(+), 610 deletions(-) create mode 100644 db/compaction_picker_test.cc diff --git a/Makefile b/Makefile index 62b31b87a..6b11012c2 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,7 @@ TESTS = \ spatial_db_test \ version_edit_test \ version_set_test \ + compaction_picker_test \ file_indexer_test \ write_batch_test \ write_controller_test\ @@ -452,6 +453,9 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/column_family.cc b/db/column_family.cc index b64c24ffe..e6298692a 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -324,8 +324,9 @@ ColumnFamilyData::~ColumnFamilyData() { void ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { - const double score = current_->MaxCompactionScore(); - const int max_level = current_->MaxCompactionScoreLevel(); + auto* vstorage = current_->GetStorageInfo(); + const double score = vstorage->MaxCompactionScore(); + const int max_level = vstorage->MaxCompactionScoreLevel(); auto write_controller = column_family_set_->write_controller_; @@ -337,26 +338,26 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->size(), mutable_cf_options.max_write_buffer_number); - } else if (current_->NumLevelFiles(0) >= + } else if (vstorage->NumLevelFiles(0) >= mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stopping writes because we have %d level-0 files", - name_.c_str(), current_->NumLevelFiles(0)); + name_.c_str(), vstorage->NumLevelFiles(0)); } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && - current_->NumLevelFiles(0) >= + vstorage->NumLevelFiles(0) >= mutable_cf_options.level0_slowdown_writes_trigger) { - uint64_t slowdown = SlowdownAmount( - current_->NumLevelFiles(0), - mutable_cf_options.level0_slowdown_writes_trigger, - mutable_cf_options.level0_stop_writes_trigger); + uint64_t slowdown = + SlowdownAmount(vstorage->NumLevelFiles(0), + 
mutable_cf_options.level0_slowdown_writes_trigger, + mutable_cf_options.level0_stop_writes_trigger); write_controller_token_ = write_controller->GetDelayToken(slowdown); internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 "us)", - name_.c_str(), current_->NumLevelFiles(0), slowdown); + name_.c_str(), vstorage->NumLevelFiles(0), slowdown); } else if (mutable_cf_options.hard_rate_limit > 1.0 && score > mutable_cf_options.hard_rate_limit) { uint64_t kHardLimitSlowdown = 1000; @@ -403,8 +404,11 @@ void ColumnFamilyData::CreateNewMemtable( Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { - auto result = compaction_picker_->PickCompaction( - mutable_options, current_, log_buffer); + auto* result = compaction_picker_->PickCompaction( + GetName(), mutable_options, current_->GetStorageInfo(), log_buffer); + if (result != nullptr) { + result->SetInputVersion(current_); + } return result; } @@ -413,9 +417,13 @@ Compaction* ColumnFamilyData::CompactRange( int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { - return compaction_picker_->CompactRange( - mutable_cf_options, current_, input_level, output_level, - output_path_id, begin, end, compaction_end); + auto* result = compaction_picker_->CompactRange( + GetName(), mutable_cf_options, current_->GetStorageInfo(), input_level, + output_level, output_path_id, begin, end, compaction_end); + if (result != nullptr) { + result->SetInputVersion(current_); + } + return result; } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( diff --git a/db/compaction.cc b/db/compaction.cc index 533fe497e..a739da29e 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -29,7 +29,17 @@ uint64_t TotalFileSize(const std::vector& files) { return sum; } -Compaction::Compaction(Version* input_version, int start_level, int out_level, +void Compaction::SetInputVersion(Version* input_version) { + input_version_ = input_version; + cfd_ = input_version_->cfd(); + + cfd_->Ref(); + input_version_->Ref(); + edit_ = new VersionEdit(); + edit_->SetColumnFamily(cfd_->GetID()); +} + +Compaction::Compaction(int number_levels, int start_level, int out_level, uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, @@ -39,9 +49,10 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, output_level_(out_level), max_output_file_size_(target_file_size), max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(input_version_->NumberLevels()), - cfd_(input_version_->cfd()), + input_version_(nullptr), + edit_(nullptr), + number_levels_(number_levels), + cfd_(nullptr), output_path_id_(output_path_id), output_compression_(output_compression), seek_compaction_(seek_compaction), @@ -56,10 +67,6 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, is_full_compaction_(false), is_manual_compaction_(false), level_ptrs_(std::vector(number_levels_)) { - cfd_->Ref(); - input_version_->Ref(); - edit_ = new VersionEdit(); - edit_->SetColumnFamily(cfd_->GetID()); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; } @@ -113,6 +120,7 @@ void Compaction::AddInputDeletions(VersionEdit* edit) { } bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { + 
assert(input_version_ != nullptr); assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; @@ -120,7 +128,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { // Maybe use binary search to find right entry instead of linear search? const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->LevelFiles(lvl); + const std::vector& files = + input_version_->GetStorageInfo()->LevelFiles(lvl); for (; level_ptrs_[lvl] < files.size(); ) { FileMetaData* f = files[level_ptrs_[lvl]]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { @@ -176,9 +185,9 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { } // Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool is_manual) { - assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); - if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { +void Compaction::SetupBottomMostLevel(VersionStorageInfo* vstorage, + bool is_manual, bool level0_only) { + if (level0_only) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -193,7 +202,7 @@ void Compaction::SetupBottomMostLevel(bool is_manual) { bottommost_level_ = true; // checks whether there are files living beyond the output_level. for (int i = output_level_ + 1; i < number_levels_; i++) { - if (input_version_->NumLevelFiles(i) > 0) { + if (vstorage->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; } @@ -218,7 +227,8 @@ void Compaction::ReleaseCompactionFiles(Status status) { } void Compaction::ResetNextCompactionIndex() { - input_version_->SetNextCompactionIndex(start_level_, 0); + assert(input_version_ != nullptr); + input_version_->GetStorageInfo()->ResetNextCompactionIndex(start_level_); } namespace { diff --git a/db/compaction.h b/db/compaction.h index 5183822e3..d8014545b 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -28,6 +28,7 @@ struct CompactionInputFiles { class Version; class ColumnFamilyData; +class VersionStorageInfo; // A Compaction encapsulates information about a compaction. class Compaction { @@ -161,13 +162,15 @@ class Compaction { // is the sum of all input file sizes. uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options); + void SetInputVersion(Version* input_version); + private: friend class CompactionPicker; friend class UniversalCompactionPicker; friend class FIFOCompactionPicker; friend class LevelCompactionPicker; - Compaction(Version* input_version, int start_level, int out_level, + Compaction(int num_levels, int start_level, int out_level, uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, CompressionType output_compression, bool seek_compaction = false, bool deletion_compaction = false); @@ -230,7 +233,8 @@ class Compaction { // bottommost level. 
// // @see BottomMostLevel() - void SetupBottomMostLevel(bool is_manual); + void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual, + bool level0_only); // In case of compaction error, reset the nextIndex that is used // to pick up the next file to be compacted from files_by_size_ diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 974400fd9..676f39b7d 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -15,6 +15,7 @@ #include #include +#include #include "db/filename.h" #include "util/log_buffer.h" #include "util/statistics.h" @@ -121,7 +122,9 @@ void CompactionPicker::GetRange(const std::vector& inputs1, GetRange(all, smallest, largest); } -bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { +bool CompactionPicker::ExpandWhileOverlapping(const std::string& cf_name, + VersionStorageInfo* vstorage, + Compaction* c) { assert(c != nullptr); // If inputs are empty then there is nothing to expand. if (c->inputs_[0].empty()) { @@ -148,9 +151,9 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0].files, &smallest, &largest); c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs( - level, &smallest, &largest, &c->inputs_[0].files, - hint_index, &hint_index); + vstorage->GetOverlappingInputs(level, &smallest, &largest, + &c->inputs_[0].files, hint_index, + &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -162,11 +165,11 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { if (c->inputs_[0].empty()) { Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] ExpandWhileOverlapping() failure because zero input files", - c->column_family_data()->GetName().c_str()); + cf_name.c_str()); } if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0].files) || (c->level() != c->output_level() && - ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + ParentRangeInCompaction(vstorage, &smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); @@ -192,15 +195,15 @@ bool CompactionPicker::FilesInCompaction(std::vector& files) { } // Returns true if any one of the parent files are being compacted -bool CompactionPicker::ParentRangeInCompaction(Version* version, +bool CompactionPicker::ParentRangeInCompaction(VersionStorageInfo* vstorage, const InternalKey* smallest, const InternalKey* largest, int level, int* parent_index) { std::vector inputs; assert(level + 1 < NumberLevels()); - version->GetOverlappingInputs(level + 1, smallest, largest, &inputs, - *parent_index, parent_index); + vstorage->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -209,7 +212,8 @@ bool CompactionPicker::ParentRangeInCompaction(Version* version, // or cause "level" to include a file for compaction that has an overlapping // user-key with another file. void CompactionPicker::SetupOtherInputs( - const MutableCFOptions& mutable_cf_options, Compaction* c) { + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, Compaction* c) { // If inputs are empty, then there is nothing to expand. 
// If both input and output levels are the same, no need to consider // files at level "level+1" @@ -224,10 +228,9 @@ void CompactionPicker::SetupOtherInputs( GetRange(c->inputs_[0].files, &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - c->input_version_->GetOverlappingInputs( - level + 1, &smallest, &largest, - &c->inputs_[1].files, c->parent_index_, - &c->parent_index_); + vstorage->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1].files, c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -240,8 +243,8 @@ void CompactionPicker::SetupOtherInputs( // can happen when one user key spans multiple files. if (!c->inputs_[1].empty()) { std::vector expanded0; - c->input_version_->GetOverlappingInputs( - level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + vstorage->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, + c->base_index_, nullptr); const uint64_t inputs0_size = TotalCompensatedFileSize(c->inputs_[0].files); const uint64_t inputs1_size = TotalCompensatedFileSize(c->inputs_[1].files); const uint64_t expanded0_size = TotalCompensatedFileSize(expanded0); @@ -249,22 +252,21 @@ void CompactionPicker::SetupOtherInputs( if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + !vstorage->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + vstorage->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64 " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n", - c->column_family_data()->GetName().c_str(), level, - c->inputs_[0].size(), c->inputs_[1].size(), inputs0_size, - inputs1_size, expanded0.size(), expanded1.size(), expanded0_size, - inputs1_size); + cf_name.c_str(), level, c->inputs_[0].size(), c->inputs_[1].size(), + inputs0_size, inputs1_size, expanded0.size(), expanded1.size(), + expanded0_size, inputs1_size); smallest = new_start; largest = new_limit; c->inputs_[0].files = expanded0; @@ -278,15 +280,15 @@ void CompactionPicker::SetupOtherInputs( // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + vstorage->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } } Compaction* CompactionPicker::CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { // CompactionPickerFIFO has its own implementation of compact range 
assert(ioptions_.compaction_style != kCompactionStyleFIFO); @@ -300,7 +302,7 @@ Compaction* CompactionPicker::CompactRange( begin = nullptr; end = nullptr; } - version->GetOverlappingInputs(input_level, begin, end, &inputs); + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -326,24 +328,20 @@ Compaction* CompactionPicker::CompactRange( } assert(output_path_id < static_cast(ioptions_.db_paths.size())); Compaction* c = new Compaction( - version, input_level, output_level, + vstorage->NumberLevels(), input_level, output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), mutable_cf_options.MaxGrandParentOverlapBytes(input_level), - output_path_id, - GetCompressionType(ioptions_, output_level)); + output_path_id, GetCompressionType(ioptions_, output_level)); c->inputs_[0].files = inputs; - if (ExpandWhileOverlapping(c) == false) { + if (ExpandWhileOverlapping(cf_name, vstorage, c) == false) { delete c; Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Unable to perform CompactRange compact due to expansion" - " failure. Possible causes include some of the necessary " - " compaction input files are currently being compacted.\n", - version->cfd()->GetName().c_str()); + "[%s] Could not compact due to expansion failure.\n", cf_name.c_str()); return nullptr; } - SetupOtherInputs(mutable_cf_options, c); + SetupOtherInputs(cf_name, mutable_cf_options, vstorage, c); if (covering_the_whole_range) { *compaction_end = nullptr; @@ -355,7 +353,8 @@ Compaction* CompactionPicker::CompactRange( c->MarkFilesBeingCompacted(true); // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(true); + c->SetupBottomMostLevel( + vstorage, true, ioptions_.compaction_style == kCompactionStyleUniversal); c->is_manual_compaction_ = true; c->mutable_cf_options_ = mutable_cf_options; @@ -364,8 +363,8 @@ Compaction* CompactionPicker::CompactRange( } Compaction* LevelCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) { + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { Compaction* c = nullptr; int level = -1; @@ -373,19 +372,23 @@ Compaction* LevelCompactionPicker::PickCompaction( // and also in LogAndApply(), otherwise the values could be stale. std::vector size_being_compacted(NumberLevels() - 1); SizeBeingCompacted(size_being_compacted); - version->ComputeCompactionScore(mutable_cf_options, size_being_compacted); + + CompactionOptionsFIFO dummy_compaction_options_fifo; + vstorage->ComputeCompactionScore( + mutable_cf_options, dummy_compaction_options_fifo, size_being_compacted); // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. // // Find the compactions by size on all levels. 
for (int i = 0; i < NumberLevels() - 1; i++) { - double score = version->CompactionScore(i); - assert(i == 0 || score <= version->CompactionScore(i - 1)); - level = version->CompactionScoreLevel(i); - if (score >= 1) { - c = PickCompactionBySize(mutable_cf_options, version, level, score); - if (c == nullptr || ExpandWhileOverlapping(c) == false) { + double score = vstorage->CompactionScore(i); + level = vstorage->CompactionScoreLevel(i); + assert(i == 0 || score <= vstorage->CompactionScore(i - 1)); + if ((score >= 1)) { + c = PickCompactionBySize(mutable_cf_options, vstorage, level, score); + if (c == nullptr || + ExpandWhileOverlapping(cf_name, vstorage, c) == false) { delete c; c = nullptr; } else { @@ -408,14 +411,14 @@ Compaction* LevelCompactionPicker::PickCompaction( // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs(0, &smallest, &largest, - &c->inputs_[0].files); + vstorage->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0].files); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a // larger range. So, re-invoke GetRange to get the new key range GetRange(c->inputs_[0].files, &smallest, &largest); - if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + if (ParentRangeInCompaction(vstorage, &smallest, &largest, level, &c->parent_index_)) { delete c; return nullptr; @@ -424,13 +427,13 @@ Compaction* LevelCompactionPicker::PickCompaction( } // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(mutable_cf_options, c); + SetupOtherInputs(cf_name, mutable_cf_options, vstorage, c); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(false); + c->SetupBottomMostLevel(vstorage, false, false); // remember this currently undergoing compaction compactions_in_progress_[level].insert(c); @@ -440,8 +443,8 @@ Compaction* LevelCompactionPicker::PickCompaction( } Compaction* LevelCompactionPicker::PickCompactionBySize( - const MutableCFOptions& mutable_cf_options, - Version* version, int level, double score) { + const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, + int level, double score) { Compaction* c = nullptr; // level 0 files are overlapping. 
So we cannot pick more @@ -454,7 +457,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( assert(level >= 0); assert(level + 1 < NumberLevels()); - c = new Compaction(version, level, level + 1, + c = new Compaction(vstorage->NumberLevels(), level, level + 1, mutable_cf_options.MaxFileSizeForLevel(level + 1), mutable_cf_options.MaxGrandParentOverlapBytes(level), 0, GetCompressionType(ioptions_, level + 1)); @@ -462,20 +465,19 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Pick the largest file in this level that is not already // being compacted - const std::vector& file_size = version->FilesBySize(level); - const std::vector& level_files = version->LevelFiles(level); + const std::vector& file_size = vstorage->FilesBySize(level); + const std::vector& level_files = vstorage->LevelFiles(level); // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = version->NextCompactionIndex(level); + for (unsigned int i = vstorage->NextCompactionIndex(level); i < file_size.size(); i++) { int index = file_size[i]; FileMetaData* f = level_files[index]; - // Check to verify files are arranged in descending compensated size. assert((i == file_size.size() - 1) || - (i >= Version::kNumberFilesToSort - 1) || + (i >= VersionStorageInfo::kNumberFilesToSort - 1) || (f->compensated_file_size >= level_files[file_size[i + 1]]->compensated_file_size)); @@ -493,8 +495,8 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // Do not pick this file if its parents at level+1 are being compacted. // Maybe we can avoid redoing this work in SetupOtherInputs int parent_index = -1; - if (ParentRangeInCompaction(version, &f->smallest, &f->largest, - level, &parent_index)) { + if (ParentRangeInCompaction(vstorage, &f->smallest, &f->largest, level, + &parent_index)) { continue; } c->inputs_[0].files.push_back(f); @@ -509,7 +511,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( } // store where to start the iteration in the next call to PickCompaction - version->SetNextCompactionIndex(level, nextIndex); + vstorage->SetNextCompactionIndex(level, nextIndex); return c; } @@ -518,39 +520,38 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( // time-range to compact. // Compaction* UniversalCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) { + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { const int kLevel0 = 0; - double score = version->CompactionScore(kLevel0); - const std::vector& level_files = version->LevelFiles(kLevel0); + double score = vstorage->CompactionScore(kLevel0); + const std::vector& level_files = vstorage->LevelFiles(kLevel0); if ((level_files.size() < (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger)) { - LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", - version->cfd()->GetName().c_str()); + LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", cf_name.c_str()); return nullptr; } - Version::FileSummaryStorage tmp; + VersionStorageInfo::FileSummaryStorage tmp; LogToBuffer(log_buffer, 3072, "[%s] Universal: candidate files(%zu): %s\n", - version->cfd()->GetName().c_str(), level_files.size(), - version->LevelFileSummary(&tmp, kLevel0)); + cf_name.c_str(), level_files.size(), + vstorage->LevelFileSummary(&tmp, kLevel0)); // Check for size amplification first. 
Compaction* c; - if ((c = PickCompactionUniversalSizeAmp( - mutable_cf_options, version, score, log_buffer)) != nullptr) { + if ((c = PickCompactionUniversalSizeAmp(cf_name, mutable_cf_options, vstorage, + score, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", - version->cfd()->GetName().c_str()); + cf_name.c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. unsigned int ratio = ioptions_.compaction_options_universal.size_ratio; - if ((c = PickCompactionUniversalReadAmp( - mutable_cf_options, version, score, ratio, - UINT_MAX, log_buffer)) != nullptr) { + if ((c = PickCompactionUniversalReadAmp(cf_name, mutable_cf_options, + vstorage, score, ratio, UINT_MAX, + log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", - version->cfd()->GetName().c_str()); + cf_name.c_str()); } else { // Size amplification and file size ratios are within configured limits. // If max read amplification is exceeding configured limits, then force @@ -559,10 +560,11 @@ Compaction* UniversalCompactionPicker::PickCompaction( unsigned int num_files = level_files.size() - mutable_cf_options.level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( - mutable_cf_options, version, score, UINT_MAX, + cf_name, mutable_cf_options, vstorage, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "[%s] Universal: compacting for file num -- %u\n", - version->cfd()->GetName().c_str(), num_files); + LogToBuffer(log_buffer, + "[%s] Universal: compacting for file num -- %u\n", + cf_name.c_str(), num_files); } } } @@ -639,8 +641,8 @@ uint32_t UniversalCompactionPicker::GetPathId( // the next file in time order. // Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( - const MutableCFOptions& mutable_cf_options, Version* version, - double score, unsigned int ratio, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, unsigned int ratio, unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { const int kLevel0 = 0; @@ -650,7 +652,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( ioptions_.compaction_options_universal.max_merge_width; // The files are sorted from newest first to oldest last. - const auto& files = version->LevelFiles(kLevel0); + const auto& files = vstorage->LevelFiles(kLevel0); FileMetaData* f = nullptr; bool done = false; @@ -677,7 +679,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } LogToBuffer(log_buffer, "[%s] Universal: file %" PRIu64 "[%d] being compacted, skipping", - version->cfd()->GetName().c_str(), f->fd.GetNumber(), loop); + cf_name.c_str(), f->fd.GetNumber(), loop); f = nullptr; } @@ -689,7 +691,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: Possible candidate file %s[%d].", - version->cfd()->GetName().c_str(), file_num_buf, loop); + cf_name.c_str(), file_num_buf, loop); } // Check if the suceeding files need compaction. 
@@ -740,9 +742,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( LogToBuffer(log_buffer, "[%s] Universal: Skipping file %" PRIu64 "[%d] with size %" PRIu64 " (compensated size %" PRIu64 ") %d\n", - version->cfd()->GetName().c_str(), f->fd.GetNumber(), i, - f->fd.GetFileSize(), f->compensated_file_size, - f->being_compacted); + cf_name.c_str(), f->fd.GetNumber(), i, f->fd.GetFileSize(), + f->compensated_file_size, f->being_compacted); } } } @@ -756,7 +757,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( int ratio_to_compress = ioptions_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { - uint64_t total_size = version->NumLevelBytes(kLevel0); + uint64_t total_size = vstorage->NumLevelBytes(kLevel0); uint64_t older_file_size = 0; for (unsigned int i = files.size() - 1; i >= first_index_after; i--) { @@ -774,10 +775,10 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } uint32_t path_id = GetPathId(ioptions_, estimated_total_size); - Compaction* c = new Compaction(version, kLevel0, kLevel0, - mutable_cf_options.MaxFileSizeForLevel(kLevel0), - LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel0, - enable_compression)); + Compaction* c = new Compaction( + vstorage->NumberLevels(), kLevel0, kLevel0, + mutable_cf_options.MaxFileSizeForLevel(kLevel0), LLONG_MAX, path_id, + GetCompressionType(ioptions_, kLevel0, enable_compression)); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { @@ -789,8 +790,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( LogToBuffer(log_buffer, "[%s] Universal: Picking file %s[%d] " "with size %" PRIu64 " (compensated size %" PRIu64 ")\n", - version->cfd()->GetName().c_str(), file_num_buf, i, - f->fd.GetFileSize(), f->compensated_file_size); + cf_name.c_str(), file_num_buf, i, f->fd.GetFileSize(), + f->compensated_file_size); } return c; } @@ -802,8 +803,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // min_merge_width and max_merge_width). // Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( - const MutableCFOptions& mutable_cf_options, Version* version, - double score, LogBuffer* log_buffer) { + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer) { const int kLevel = 0; // percentage flexibilty while reducing size amplification @@ -811,7 +812,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( max_size_amplification_percent; // The files are sorted from newest first to oldest last. 
- const auto& files = version->LevelFiles(kLevel); + const auto& files = vstorage->LevelFiles(kLevel); unsigned int candidate_count = 0; uint64_t candidate_size = 0; @@ -829,7 +830,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: skipping file %s[%d] compacted %s", - version->cfd()->GetName().c_str(), file_num_buf, loop, + cf_name.c_str(), file_num_buf, loop, " cannot be a candidate to reduce size amp.\n"); f = nullptr; } @@ -842,7 +843,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: First candidate file %s[%d] %s", - version->cfd()->GetName().c_str(), file_num_buf, start_index, + cf_name.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); // keep adding up all the remaining files @@ -854,7 +855,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( sizeof(file_num_buf)); LogToBuffer( log_buffer, "[%s] Universal: Possible candidate file %s[%d] %s.", - version->cfd()->GetName().c_str(), file_num_buf, loop, + cf_name.c_str(), file_num_buf, loop, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -874,14 +875,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( log_buffer, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd()->GetName().c_str(), candidate_size, earliest_file_size); + cf_name.c_str(), candidate_size, earliest_file_size); return nullptr; } else { LogToBuffer( log_buffer, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd()->GetName().c_str(), candidate_size, earliest_file_size); + cf_name.c_str(), candidate_size, earliest_file_size); } assert(start_index < files.size() - 1); @@ -895,29 +896,29 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. 
Compaction* c = - new Compaction(version, kLevel, kLevel, - mutable_cf_options.MaxFileSizeForLevel(kLevel), - LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel)); + new Compaction(vstorage->NumberLevels(), kLevel, kLevel, + mutable_cf_options.MaxFileSizeForLevel(kLevel), LLONG_MAX, + path_id, GetCompressionType(ioptions_, kLevel)); c->score_ = score; for (unsigned int loop = start_index; loop < files.size(); loop++) { f = files[loop]; c->inputs_[0].files.push_back(f); LogToBuffer(log_buffer, - "[%s] Universal: size amp picking file %" PRIu64 "[%d] " - "with size %" PRIu64 " (compensated size %" PRIu64 ")", - version->cfd()->GetName().c_str(), - f->fd.GetNumber(), loop, - f->fd.GetFileSize(), f->compensated_file_size); + "[%s] Universal: size amp picking file %" PRIu64 + "[%d] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + cf_name.c_str(), f->fd.GetNumber(), loop, f->fd.GetFileSize(), + f->compensated_file_size); } return c; } Compaction* FIFOCompactionPicker::PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) { - assert(version->NumberLevels() == 1); + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + assert(vstorage->NumberLevels() == 1); const int kLevel0 = 0; - const std::vector& level_files = version->LevelFiles(kLevel0); + const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = 0; for (const auto& file : level_files) { total_size += file->compensated_file_size; @@ -929,7 +930,7 @@ Compaction* FIFOCompactionPicker::PickCompaction( LogToBuffer(log_buffer, "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 ", max size %" PRIu64 "\n", - version->cfd()->GetName().c_str(), total_size, + cf_name.c_str(), total_size, ioptions_.compaction_options_fifo.max_table_files_size); return nullptr; } @@ -938,11 +939,11 @@ Compaction* FIFOCompactionPicker::PickCompaction( LogToBuffer(log_buffer, "[%s] FIFO compaction: Already executing compaction. 
No need " "to run parallel compactions since compactions are very fast", - version->cfd()->GetName().c_str()); + cf_name.c_str()); return nullptr; } - Compaction* c = new Compaction(version, 0, 0, 0, 0, 0, kNoCompression, false, + Compaction* c = new Compaction(1, 0, 0, 0, 0, 0, kNoCompression, false, true /* is deletion compaction */); // delete old files (FIFO) for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { @@ -953,8 +954,7 @@ Compaction* FIFOCompactionPicker::PickCompaction( AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", - version->cfd()->GetName().c_str(), f->fd.GetNumber(), - tmp_fsize); + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { break; } @@ -967,15 +967,16 @@ Compaction* FIFOCompactionPicker::PickCompaction( } Compaction* FIFOCompactionPicker::CompactRange( - const MutableCFOptions& mutable_cf_options, - Version* version, int input_level, int output_level, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); - Compaction* c = PickCompaction(mutable_cf_options, version, &log_buffer); + Compaction* c = + PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); if (c != nullptr) { assert(output_path_id < static_cast(ioptions_.db_paths.size())); c->output_path_id_ = output_path_id; diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 138b97eb4..d691a765a 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -18,12 +18,13 @@ #include #include #include +#include namespace rocksdb { class LogBuffer; class Compaction; -class Version; +class VersionStorageInfo; class CompactionPicker { public: @@ -35,9 +36,10 @@ class CompactionPicker { // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) = 0; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -51,9 +53,9 @@ class CompactionPicker { // Client is responsible for compaction_end storage -- when called, // *compaction_end should point to valid InternalKey! 
virtual Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end); // Given the current number of levels, returns the lowest allowed level @@ -93,18 +95,21 @@ class CompactionPicker { // populated. // // Will return false if it is impossible to apply this compaction. - bool ExpandWhileOverlapping(Compaction* c); + bool ExpandWhileOverlapping(const std::string& cf_name, + VersionStorageInfo* vstorage, Compaction* c); // Returns true if any one of the specified files are being compacted bool FilesInCompaction(std::vector& files); // Returns true if any one of the parent files are being compacted - bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + bool ParentRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, const InternalKey* largest, int level, int* index); - void SetupOtherInputs(const MutableCFOptions& mutable_cf_options, - Compaction* c); + void SetupOtherInputs(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, Compaction* c); const ImmutableCFOptions& ioptions_; @@ -121,9 +126,10 @@ class UniversalCompactionPicker : public CompactionPicker { UniversalCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; // The maxinum allowed input level. Always return 0. virtual int MaxInputLevel(int current_num_levels) const override { @@ -133,14 +139,14 @@ class UniversalCompactionPicker : public CompactionPicker { private: // Pick Universal compaction to limit read amplification Compaction* PickCompactionUniversalReadAmp( - const MutableCFOptions& mutable_cf_options, - Version* version, double score, unsigned int ratio, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, unsigned int ratio, unsigned int num_files, LogBuffer* log_buffer); // Pick Universal compaction to limit space amplification. Compaction* PickCompactionUniversalSizeAmp( - const MutableCFOptions& mutable_cf_options, - Version* version, double score, LogBuffer* log_buffer); + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer); // Pick a path ID to place a newly generated file, with its estimated file // size. 
@@ -153,9 +159,10 @@ class LevelCompactionPicker : public CompactionPicker { LevelCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; // Returns current_num_levels - 2, meaning the last level cannot be // compaction input level. @@ -169,7 +176,8 @@ class LevelCompactionPicker : public CompactionPicker { // If level is 0 and there is already a compaction on that level, this // function will return nullptr. Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, - Version* version, int level, double score); + VersionStorageInfo* vstorage, int level, + double score); }; class FIFOCompactionPicker : public CompactionPicker { @@ -178,14 +186,15 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const MutableCFOptions& mutable_cf_options, - Version* version, LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; virtual Compaction* CompactRange( - const MutableCFOptions& mutable_cf_options, Version* version, - int input_level, int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) override; // The maxinum allowed input level. Always return 0. diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc new file mode 100644 index 000000000..81bffe0af --- /dev/null +++ b/db/compaction_picker_test.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/compaction_picker.h" +#include +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class CountingLogger : public Logger { + public: + virtual void Logv(const char* format, va_list ap) override { log_count++; } + size_t log_count; +}; + +class CompactionPickerTest { + public: + const Comparator* ucmp; + InternalKeyComparator icmp; + Options options; + ImmutableCFOptions ioptions; + MutableCFOptions mutable_cf_options; + LevelCompactionPicker level_compaction_picker; + std::string cf_name; + CountingLogger logger; + LogBuffer log_buffer; + VersionStorageInfo vstorage; + uint32_t file_num; + CompactionOptionsFIFO fifo_options; + std::vector size_being_compacted; + + CompactionPickerTest() + : ucmp(BytewiseComparator()), + icmp(ucmp), + ioptions(options), + mutable_cf_options(options, ioptions), + level_compaction_picker(ioptions, &icmp), + cf_name("dummy"), + log_buffer(InfoLogLevel::INFO_LEVEL, &logger), + vstorage(&icmp, ucmp, options.num_levels, kCompactionStyleLevel, + nullptr), + file_num(1) { + fifo_options.max_table_files_size = 1; + mutable_cf_options.RefreshDerivedOptions(ioptions); + size_being_compacted.resize(options.num_levels); + } + + ~CompactionPickerTest() { + auto* files = vstorage.GetFiles(); + for (int i = 0; i < vstorage.NumberLevels(); i++) { + for (auto* f : files[i]) { + delete f; + } + } + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + assert(level < vstorage.NumberLevels()); + auto& files = vstorage.GetFiles()[level]; + FileMetaData* f = new FileMetaData; + f->fd = FileDescriptor(file_number, path_id, file_size); + f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); + f->compensated_file_size = file_size; + files.push_back(f); + } + + void UpdateVersionStorageInfo() { + vstorage.ComputeCompactionScore(mutable_cf_options, fifo_options, + size_being_compacted); + vstorage.UpdateFilesBySize(); + vstorage.UpdateNumNonEmptyLevels(); + vstorage.GenerateFileIndexer(); + vstorage.GenerateLevelFilesBrief(); + vstorage.SetFinalized(); + } +}; + +TEST(CompactionPickerTest, Empty) { + UpdateVersionStorageInfo(); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST(CompactionPickerTest, Single) { + mutable_cf_options.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "p", "q"); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST(CompactionPickerTest, Level0Trigger) { + mutable_cf_options.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, Level1Trigger) { + Add(1, 66U, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr 
compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, Level1Trigger2) { + Add(1, 66U, "150", "200", 1000000000U); + Add(1, 88U, "201", "300", 1000000000U); + Add(2, 6U, "150", "180", 1000000000U); + Add(2, 7U, "180", "220", 1000000000U); + Add(2, 8U, "220", "300", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->num_input_files(1)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index a47668763..345188703 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1497,9 +1497,9 @@ Status DBImpl::FlushMemTableToOutputFile( if (madeProgress) { *madeProgress = 1; } - Version::LevelSummaryStorage tmp; + VersionStorageInfo::LevelSummaryStorage tmp; LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(), - cfd->current()->LevelSummary(&tmp)); + cfd->current()->GetStorageInfo()->LevelSummary(&tmp)); if (disable_delete_obsolete_files_ == 0) { // add to deletion state @@ -1545,7 +1545,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, MutexLock l(&mutex_); Version* base = cfd->current(); for (int level = 1; level < cfd->NumberLevels(); level++) { - if (base->OverlapInLevel(level, begin, end)) { + if (base->GetStorageInfo()->OverlapInLevel(level, begin, end)) { max_level_with_files = level; } } @@ -1623,14 +1623,14 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, int level) { mutex_.AssertHeld(); - Version* current = cfd->current(); + auto* vstorage = cfd->current()->GetStorageInfo(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (current->NumLevelFiles(i) > 0) break; + if (vstorage->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) if (mutable_cf_options.MaxBytesForLevel(i) < - current->NumLevelBytes(level)) { + vstorage->NumLevelBytes(level)) { break; } @@ -1682,7 +1682,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); - for (const auto& f : cfd->current()->files_[level]) { + for (const auto& f : cfd->current()->GetStorageInfo()->files_[level]) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, @@ -1898,7 +1898,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { bool is_compaction_needed = false; // no need to refcount since we're under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->current()->NeedsCompaction()) { + if (cfd->current()->GetStorageInfo()->NeedsCompaction()) { is_compaction_needed = true; break; } @@ -2269,14 +2269,12 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* 
job_context, InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); - Version::LevelSummaryStorage tmp; - LogToBuffer( - log_buffer, - "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", - c->column_family_data()->GetName().c_str(), - f->fd.GetNumber(), c->level() + 1, - f->fd.GetFileSize(), - status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); + VersionStorageInfo::LevelSummaryStorage tmp; + LogToBuffer(log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 + " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), + c->input_version()->GetStorageInfo()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; } else { @@ -3008,7 +3006,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch); - assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0); + assert(cfd->current()->GetStorageInfo()->NumLevelFiles( + compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -3246,26 +3245,26 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact, mutable_cf_options, log_buffer); InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); } - Version::LevelSummaryStorage tmp; - LogToBuffer( - log_buffer, - "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", - cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1) / - static_cast(stats.micros), - stats.bytes_written / static_cast(stats.micros), - compact->compaction->output_level(), stats.files_in_leveln, - stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, - stats.bytes_written / 1048576.0, - (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double)stats.bytes_readn, - stats.bytes_written / (double)stats.bytes_readn, - status.ToString().c_str(), stats.num_input_records, - stats.num_dropped_records); + VersionStorageInfo::LevelSummaryStorage tmp; + LogToBuffer(log_buffer, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", + cfd->GetName().c_str(), + cfd->current()->GetStorageInfo()->LevelSummary(&tmp), + (stats.bytes_readn + stats.bytes_readnp1) / + static_cast(stats.micros), + stats.bytes_written / static_cast(stats.micros), + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, + stats.bytes_written / 1048576.0, + (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, + status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records); return status; } @@ -4375,16 +4374,16 @@ Status DBImpl::DeleteFile(std::string name) { // Only the files in the last level can be deleted externally. // This is to make sure that any deletion tombstones are not // lost. 
Check that the level passed is the last level. + auto* vstoreage = cfd->current()->GetStorageInfo(); for (int i = level + 1; i < cfd->NumberLevels(); i++) { - if (cfd->current()->NumLevelFiles(i) != 0) { + if (vstoreage->NumLevelFiles(i) != 0) { Log(db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } } // if level == 0, it has to be the oldest file - if (level == 0 && - cfd->current()->files_[0].back()->fd.GetNumber() != number) { + if (level == 0 && vstoreage->files_[0].back()->fd.GetNumber() != number) { return Status::InvalidArgument("File in level 0, but not oldest"); } edit.SetColumnFamily(cfd->GetID()); @@ -4637,9 +4636,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, for (auto cfd : *impl->versions_->GetColumnFamilySet()) { if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - Version* current = cfd->current(); - for (int i = 1; i < current->NumberLevels(); ++i) { - int num_files = current->NumLevelFiles(i); + auto* vstorage = cfd->current()->GetStorageInfo(); + for (int i = 1; i < vstorage->NumberLevels(); ++i) { + int num_files = vstorage->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument( "Not all files are at level 0. Cannot " diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 6c073d4d5..a7be59313 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -17,7 +17,8 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } uint64_t DBImpl::TEST_GetLevel0TotalSize() { MutexLock l(&mutex_); - return default_cf_handle_->cfd()->current()->NumLevelBytes(0); + return default_cf_handle_->cfd()->current()->GetStorageInfo()->NumLevelBytes( + 0); } Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, @@ -47,7 +48,7 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( cfd = cfh->cfd(); } MutexLock l(&mutex_); - return cfd->current()->MaxNextLevelOverlappingBytes(); + return cfd->current()->GetStorageInfo()->MaxNextLevelOverlappingBytes(); } void DBImpl::TEST_GetFilesMetaData( @@ -58,7 +59,8 @@ void DBImpl::TEST_GetFilesMetaData( MutexLock l(&mutex_); metadata->resize(NumberLevels()); for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = cfd->current()->files_[level]; + const std::vector& files = + cfd->current()->GetStorageInfo()->LevelFiles(level); (*metadata)[level].clear(); for (const auto& f : files) { diff --git a/db/flush_job.cc b/db/flush_job.cc index c4eb12d3c..fda80cea8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -202,8 +202,8 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, if (base != nullptr && db_options_.max_background_compactions <= 1 && db_options_.max_background_flushes == 0 && cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - level = base->PickLevelForMemTableOutput(mutable_cf_options_, - min_user_key, max_user_key); + level = base->GetStorageInfo()->PickLevelForMemTableOutput( + mutable_cf_options_, min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 04b5b3b34..88415e5b8 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -220,7 +220,8 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!seek_to_first) { user_key = ExtractUserKey(internal_key); } - const std::vector& 
l0 = sv_->current->LevelFiles(0); + VersionStorageInfo* vstorage = sv_->current->GetStorageInfo(); + const std::vector& l0 = vstorage->LevelFiles(0); for (uint32_t i = 0; i < l0.size(); ++i) { if (seek_to_first) { l0_iters_[i]->SeekToFirst(); @@ -248,9 +249,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, int32_t search_left_bound = 0; int32_t search_right_bound = FileIndexer::kLevelMaxIndex; - for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { + for (int32_t level = 1; level < vstorage->NumberLevels(); ++level) { const std::vector& level_files = - sv_->current->LevelFiles(level); + vstorage->LevelFiles(level); if (level_files.empty()) { search_left_bound = 0; search_right_bound = FileIndexer::kLevelMaxIndex; @@ -258,7 +259,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } assert(level_iters_[level - 1] != nullptr); uint32_t f_idx = 0; - const auto& indexer = sv_->current->GetIndexer(); + const auto& indexer = vstorage->GetIndexer(); if (!seek_to_first) { if (search_left_bound == search_right_bound) { f_idx = search_left_bound; @@ -428,15 +429,18 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); - const auto& l0_files = sv_->current->LevelFiles(0); + + auto* vstorage = sv_->current->GetStorageInfo(); + const auto& l0_files = vstorage->LevelFiles(0); l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd)); } - level_iters_.reserve(sv_->current->NumberLevels() - 1); - for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { - const auto& level_files = sv_->current->LevelFiles(level); + level_iters_.reserve(vstorage->NumberLevels() - 1); + for (int32_t level = 1; level < vstorage->NumberLevels(); ++level) { + const auto& level_files = vstorage->LevelFiles(level); + if (level_files.empty()) { level_iters_.push_back(nullptr); } else { @@ -450,7 +454,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } void ForwardIterator::ResetIncompleteIterators() { - const auto& l0_files = sv_->current->LevelFiles(0); + const auto& l0_files = sv_->current->GetStorageInfo()->LevelFiles(0); for (uint32_t i = 0; i < l0_iters_.size(); ++i) { assert(i < l0_files.size()); if (!l0_iters_[i]->status().IsIncomplete()) { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index cfeb9c00d..ca0a8d62c 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -169,7 +169,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, const Slice& property, std::string* value) { assert(value != nullptr); - Version* current = cfd_->current(); + auto* current = cfd_->current(); + auto* vstorage = current->GetStorageInfo(); Slice in = property; switch (property_type) { @@ -182,7 +183,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - current->NumLevelFiles(static_cast(level))); + vstorage->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -196,8 +197,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, for (int level = 0; level < number_levels_; level++) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - current->NumLevelFiles(level), - current->NumLevelBytes(level) / kMB); + vstorage->NumLevelFiles(level), + 
vstorage->NumLevelBytes(level) / kMB); value->append(buf); } return true; @@ -229,7 +230,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, bool InternalStats::GetIntProperty(DBPropertyType property_type, uint64_t* value, DBImpl* db) const { - Version* current = cfd_->current(); + auto* vstorage = cfd_->current()->GetStorageInfo(); switch (property_type) { case kNumImmutableMemTable: @@ -242,7 +243,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. // 0 otherwise, - *value = (current->NeedsCompaction() ? 1 : 0); + *value = (vstorage->NeedsCompaction() ? 1 : 0); return true; case kBackgroundErrors: // Accumulated number of errors in background flushes or compactions. @@ -270,7 +271,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, // Use estimated entries in tables + total entries in memtables. *value = cfd_->mem()->GetNumEntries() + cfd_->imm()->current()->GetTotalNumEntries() + - current->GetEstimatedActiveKeys(); + vstorage->GetEstimatedActiveKeys(); return true; #ifndef ROCKSDB_LITE case kIsFileDeletionEnabled: @@ -365,24 +366,25 @@ void InternalStats::DumpDBStats(std::string* value) { } void InternalStats::DumpCFStats(std::string* value) { - Version* current = cfd_->current(); + VersionStorageInfo* vstorage = cfd_->current()->GetStorageInfo(); int num_levels_to_check = (cfd_->options()->compaction_style != kCompactionStyleUniversal && cfd_->options()->compaction_style != kCompactionStyleFIFO) - ? current->NumberLevels() - 1 + ? vstorage->NumberLevels() - 1 : 1; + // Compaction scores are sorted base on its value. Restore them to the // level order std::vector compaction_score(number_levels_, 0); for (int i = 0; i < num_levels_to_check; ++i) { - compaction_score[current->compaction_level_[i]] = - current->compaction_score_[i]; + compaction_score[vstorage->compaction_level_[i]] = + vstorage->compaction_score_[i]; } // Count # of files being compacted for each level std::vector files_being_compacted(number_levels_, 0); for (int level = 0; level < num_levels_to_check; ++level) { - for (auto* f : current->files_[level]) { + for (auto* f : vstorage->files_[level]) { if (f->being_compacted) { ++files_being_compacted[level]; } @@ -405,7 +407,7 @@ void InternalStats::DumpCFStats(std::string* value) { uint64_t total_stall_count = 0; double total_stall_us = 0; for (int level = 0; level < number_levels_; level++) { - int files = current->NumLevelFiles(level); + int files = vstorage->NumLevelFiles(level); total_files += files; total_files_being_compacted += files_being_compacted[level]; if (comp_stats_[level].micros > 0 || files > 0) { @@ -424,7 +426,7 @@ void InternalStats::DumpCFStats(std::string* value) { stall_leveln_slowdown_hard_[level]); stats_sum.Add(comp_stats_[level]); - total_file_size += current->NumLevelBytes(level); + total_file_size += vstorage->NumLevelBytes(level); total_stall_us += stall_us; total_stall_count += stalls; total_slowdown_soft += stall_leveln_slowdown_soft_[level]; @@ -439,10 +441,10 @@ void InternalStats::DumpCFStats(std::string* value) { double w_amp = (comp_stats_[level].bytes_readn == 0) ? 
0.0 : comp_stats_[level].bytes_written / static_cast(comp_stats_[level].bytes_readn); - PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), - files, files_being_compacted[level], current->NumLevelBytes(level), - compaction_score[level], rw_amp, w_amp, stall_us, stalls, - comp_stats_[level]); + PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), files, + files_being_compacted[level], + vstorage->NumLevelBytes(level), compaction_score[level], + rw_amp, w_amp, stall_us, stalls, comp_stats_[level]); value->append(buf); } } diff --git a/db/version_set.cc b/db/version_set.cc index 6a68c373e..0069ef6b0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -305,6 +305,8 @@ class FilePicker { }; } // anonymous namespace +VersionStorageInfo::~VersionStorageInfo() { delete[] files_; } + Version::~Version() { assert(refs_ == 0); @@ -313,9 +315,9 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < num_levels_; level++) { - for (size_t i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; + for (int level = 0; level < vstorage_.num_levels_; level++) { + for (size_t i = 0; i < vstorage_.files_[level].size(); i++) { + FileMetaData* f = vstorage_.files_[level][i]; assert(f->refs > 0); f->refs--; if (f->refs <= 0) { @@ -327,7 +329,6 @@ Version::~Version() { } } } - delete[] files_; } int FindFile(const InternalKeyComparator& icmp, @@ -564,8 +565,8 @@ Status Version::GetTableProperties(std::shared_ptr* tp, } Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { - for (int level = 0; level < num_levels_; level++) { - for (const auto& file_meta : files_[level]) { + for (int level = 0; level < vstorage_.num_levels_; level++) { + for (const auto& file_meta : vstorage_.files_[level]) { auto fname = TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); @@ -586,7 +587,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { size_t Version::GetMemoryUsageByTableReaders() { size_t total_usage = 0; - for (auto& file_level : level_files_brief_) { + for (auto& file_level : vstorage_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( vset_->env_options_, cfd_->internal_comparator(), @@ -596,7 +597,7 @@ size_t Version::GetMemoryUsageByTableReaders() { return total_usage; } -uint64_t Version::GetEstimatedActiveKeys() { +uint64_t VersionStorageInfo::GetEstimatedActiveKeys() { // Estimation will be not accurate when: // (1) there is merge keys // (2) keys are directly overwritten @@ -619,11 +620,11 @@ uint64_t Version::GetEstimatedActiveKeys() { void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { - assert(finalized_); + assert(vstorage_.finalized_); // Merge all level zero files together since they may overlap - for (size_t i = 0; i < level_files_brief_[0].num_files; i++) { - const auto& file = level_files_brief_[0].files[i]; + for (size_t i = 0; i < vstorage_.level_files_brief_[0].num_files; i++) { + const auto& file = vstorage_.level_files_brief_[0].files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, false, merge_iter_builder->GetArena())); @@ -632,50 +633,36 @@ void Version::AddIterators(const ReadOptions& read_options, // For levels > 0, we can use a concatenating 
iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < num_levels_; level++) { - if (level_files_brief_[level].num_files != 0) { + for (int level = 1; level < vstorage_.num_levels_; level++) { + if (vstorage_.level_files_brief_[level].num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), - &level_files_brief_[level]), merge_iter_builder->GetArena())); + &vstorage_.level_files_brief_[level]), + merge_iter_builder->GetArena())); } } } - - -Version::Version(ColumnFamilyData* cfd, VersionSet* vset, - uint64_t version_number) - : cfd_(cfd), - internal_comparator_((cfd == nullptr) ? nullptr - : &cfd->internal_comparator()), - user_comparator_( - (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), - table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), - merge_operator_((cfd == nullptr) ? nullptr - : cfd->ioptions()->merge_operator), - info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log), - db_statistics_((cfd == nullptr) ? nullptr - : cfd->ioptions()->statistics), +VersionStorageInfo::VersionStorageInfo( + const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage) + : internal_comparator_(internal_comparator), + user_comparator_(user_comparator), // cfd is nullptr if Version is dummy - num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), + num_levels_(num_levels), num_non_empty_levels_(num_levels_), - file_indexer_(cfd == nullptr - ? nullptr - : cfd->internal_comparator().user_comparator()), - vset_(vset), - next_(this), - prev_(this), - refs_(0), + file_indexer_(user_comparator), + compaction_style_(compaction_style), files_(new std::vector[num_levels_]), files_by_size_(num_levels_), next_file_to_compact_by_size_(num_levels_), compaction_score_(num_levels_), compaction_level_(num_levels_), - version_number_(version_number), accumulated_file_size_(0), accumulated_raw_key_size_(0), accumulated_raw_value_size_(0), @@ -683,18 +670,39 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, accumulated_num_deletions_(0), num_samples_(0), finalized_(false) { - if (cfd != nullptr && cfd->current() != nullptr) { - accumulated_file_size_ = cfd->current()->accumulated_file_size_; - accumulated_raw_key_size_ = cfd->current()->accumulated_raw_key_size_; - accumulated_raw_value_size_ = - cfd->current()->accumulated_raw_value_size_; - accumulated_num_non_deletions_ = - cfd->current()->accumulated_num_non_deletions_; - accumulated_num_deletions_ = cfd->current()->accumulated_num_deletions_; - num_samples_ = cfd->current()->num_samples_; + if (ref_vstorage != nullptr) { + accumulated_file_size_ = ref_vstorage->accumulated_file_size_; + accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; + accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_; + accumulated_num_non_deletions_ = + ref_vstorage->accumulated_num_non_deletions_; + accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_; + num_samples_ = ref_vstorage->num_samples_; } } +Version::Version(ColumnFamilyData* cfd, VersionSet* vset, + uint64_t version_number) + : cfd_(cfd), + info_log_((cfd == nullptr) ? 
nullptr : cfd->ioptions()->info_log), + db_statistics_((cfd == nullptr) ? nullptr : cfd->ioptions()->statistics), + table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), + merge_operator_((cfd == nullptr) ? nullptr + : cfd->ioptions()->merge_operator), + vstorage_((cfd == nullptr) ? nullptr : &cfd->internal_comparator(), + (cfd == nullptr) ? nullptr : cfd->user_comparator(), + cfd == nullptr ? 0 : cfd->NumberLevels(), + cfd == nullptr ? kCompactionStyleLevel + : cfd->ioptions()->compaction_style, + (cfd == nullptr || cfd->current() == nullptr) + ? nullptr + : cfd->current()->GetStorageInfo()), + vset_(vset), + next_(this), + prev_(this), + refs_(0), + version_number_(version_number) {} + void Version::Get(const ReadOptions& read_options, const LookupKey& k, std::string* value, @@ -706,16 +714,17 @@ void Version::Get(const ReadOptions& read_options, assert(status->ok() || status->IsMergeInProgress()); - GetContext get_context(user_comparator_, merge_operator_, info_log_, - db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, - user_key, value, value_found, merge_context); + GetContext get_context( + GetUserComparator(), merge_operator_, info_log_, db_statistics_, + status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, + value, value_found, merge_context); - FilePicker fp(files_, user_key, ikey, &level_files_brief_, - num_non_empty_levels_, &file_indexer_, user_comparator_, - internal_comparator_); + FilePicker fp(vstorage_.files_, user_key, ikey, &vstorage_.level_files_brief_, + vstorage_.num_non_empty_levels_, &vstorage_.file_indexer_, + GetUserComparator(), GetInternalComparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { - *status = table_cache_->Get(read_options, *internal_comparator_, f->fd, + *status = table_cache_->Get(read_options, *GetInternalComparator(), f->fd, ikey, &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { @@ -763,7 +772,7 @@ void Version::Get(const ReadOptions& read_options, } } -void Version::GenerateLevelFilesBrief() { +void VersionStorageInfo::GenerateLevelFilesBrief() { level_files_brief_.resize(num_non_empty_levels_); for (int level = 0; level < num_non_empty_levels_; level++) { DoGenerateLevelFilesBrief( @@ -774,11 +783,13 @@ void Version::GenerateLevelFilesBrief() { void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted) { UpdateAccumulatedStats(); - ComputeCompactionScore(mutable_cf_options, size_being_compacted); - UpdateFilesBySize(); - UpdateNumNonEmptyLevels(); - file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); - GenerateLevelFilesBrief(); + vstorage_.ComputeCompactionScore(mutable_cf_options, + cfd_->ioptions()->compaction_options_fifo, + size_being_compacted); + vstorage_.UpdateFilesBySize(); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { @@ -804,7 +815,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { return true; } -void Version::UpdateAccumulatedStats(FileMetaData* file_meta) { +void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) { assert(file_meta->init_stats_from_file); accumulated_file_size_ += file_meta->fd.GetFileSize(); accumulated_raw_key_size_ += file_meta->raw_key_size; @@ -816,8 +827,6 @@ void Version::UpdateAccumulatedStats(FileMetaData* file_meta) { } void Version::UpdateAccumulatedStats() { - 
static const int kDeletionWeightOnCompaction = 2; - // maximum number of table properties loaded from files. const int kMaxInitCount = 20; int init_count = 0; @@ -832,11 +841,11 @@ void Version::UpdateAccumulatedStats() { // will be triggered, which creates higher-level files whose num_deletions // will be updated here. for (int level = 0; - level < num_levels_ && init_count < kMaxInitCount; ++level) { - for (auto* file_meta : files_[level]) { + level < vstorage_.num_levels_ && init_count < kMaxInitCount; ++level) { + for (auto* file_meta : vstorage_.files_[level]) { if (MaybeInitializeFileMetaData(file_meta)) { // each FileMeta will be initialized only once. - UpdateAccumulatedStats(file_meta); + vstorage_.UpdateAccumulatedStats(file_meta); if (++init_count >= kMaxInitCount) { break; } @@ -846,16 +855,21 @@ void Version::UpdateAccumulatedStats() { // In case all sampled-files contain only deletion entries, then we // load the table-property of a file in higher-level to initialize // that value. - for (int level = num_levels_ - 1; - accumulated_raw_value_size_ == 0 && level >= 0; --level) { - for (int i = static_cast(files_[level].size()) - 1; - accumulated_raw_value_size_ == 0 && i >= 0; --i) { - if (MaybeInitializeFileMetaData(files_[level][i])) { - UpdateAccumulatedStats(files_[level][i]); + for (int level = vstorage_.num_levels_ - 1; + vstorage_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { + for (int i = static_cast(vstorage_.files_[level].size()) - 1; + vstorage_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { + if (MaybeInitializeFileMetaData(vstorage_.files_[level][i])) { + vstorage_.UpdateAccumulatedStats(vstorage_.files_[level][i]); } } } + vstorage_.ComputeCompensatedSizes(); +} + +void VersionStorageInfo::ComputeCompensatedSizes() { + static const int kDeletionWeightOnCompaction = 2; uint64_t average_value_size = GetAverageValueSize(); // compute the compensated size @@ -872,15 +886,21 @@ void Version::UpdateAccumulatedStats() { } } -void Version::ComputeCompactionScore( +int VersionStorageInfo::MaxInputLevel() const { + if (compaction_style_ == kCompactionStyleLevel) { + return NumberLevels() - 2; + } + return 0; +} + +void VersionStorageInfo::ComputeCompactionScore( const MutableCFOptions& mutable_cf_options, + const CompactionOptionsFIFO& compaction_options_fifo, std::vector& size_being_compacted) { double max_score = 0; int max_score_level = 0; - int max_input_level = - cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); - for (int level = 0; level <= max_input_level; level++) { + for (int level = 0; level <= MaxInputLevel(); level++) { double score; if (level == 0) { // We treat level-0 specially by bounding the number of files @@ -902,9 +922,9 @@ void Version::ComputeCompactionScore( numfiles++; } } - if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) { + if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / - cfd_->ioptions()->compaction_options_fifo.max_table_files_size; + compaction_options_fifo.max_table_files_size; } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; @@ -967,7 +987,7 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } // anonymous namespace -void Version::UpdateNumNonEmptyLevels() { +void VersionStorageInfo::UpdateNumNonEmptyLevels() { num_non_empty_levels_ = num_levels_; for (int i = num_levels_ - 1; i >= 0; i--) { if (files_[i].size() != 0) { @@ 
-978,9 +998,9 @@ void Version::UpdateNumNonEmptyLevels() { } } -void Version::UpdateFilesBySize() { - if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO || - cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { +void VersionStorageInfo::UpdateFilesBySize() { + if (compaction_style_ == kCompactionStyleFIFO || + compaction_style_ == kCompactionStyleUniversal) { // don't need this return; } @@ -997,8 +1017,8 @@ void Version::UpdateFilesBySize() { temp[i].file = files[i]; } - // sort the top kNumberFilesToSort based on file size - size_t num = Version::kNumberFilesToSort; + // sort the top number_of_files_to_sort_ based on file size + size_t num = VersionStorageInfo::kNumberFilesToSort; if (num > temp.size()) { num = temp.size(); } @@ -1029,7 +1049,7 @@ bool Version::Unref() { return false; } -bool Version::NeedsCompaction() const { +bool VersionStorageInfo::NeedsCompaction() const { // In universal compaction case, this check doesn't really // check the compaction condition, but checks num of files threshold // only. We are not going to miss any compaction opportunity @@ -1037,10 +1057,7 @@ bool Version::NeedsCompaction() const { // ending up with nothing to do. We can improve it later. // TODO(sdong): improve this function to be accurate for universal // compactions. - int max_input_level = - cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); - - for (int i = 0; i <= max_input_level; i++) { + for (int i = 0; i <= MaxInputLevel(); i++) { if (compaction_score_[i] >= 1) { return true; } @@ -1048,17 +1065,16 @@ bool Version::NeedsCompaction() const { return false; } -bool Version::OverlapInLevel(int level, - const Slice* smallest_user_key, - const Slice* largest_user_key) { - return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0), +bool VersionStorageInfo::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(*internal_comparator_, (level > 0), level_files_brief_[level], smallest_user_key, largest_user_key); } -int Version::PickLevelForMemTableOutput( - const MutableCFOptions& mutable_cf_options, - const Slice& smallest_user_key, +int VersionStorageInfo::PickLevelForMemTableOutput( + const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key, const Slice& largest_user_key) { int level = 0; if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { @@ -1092,12 +1108,9 @@ int Version::PickLevelForMemTableOutput( // If hint_index is specified, then it points to a file in the // overlapping range. // The file_index returns a pointer to any file in an overlapping range. 
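The files_by_size_ ordering that UpdateFilesBySize maintains only matters at its front: compaction picking looks at the few largest files (by compensated size), so the method partial-sorts the top kNumberFilesToSort entries and does not order the rest. A condensed sketch of that idea, assuming a plain vector of file sizes in place of FileMetaData (TopKBySize is an illustrative name, not a RocksDB function):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Return file indices ordered so that the k largest files come first;
// entries past the first k are left in unspecified order, mirroring how
// files_by_size_ only needs its head to be sorted.
std::vector<size_t> TopKBySize(const std::vector<uint64_t>& file_sizes,
                               size_t k) {
  std::vector<size_t> indices(file_sizes.size());
  for (size_t i = 0; i < indices.size(); ++i) {
    indices[i] = i;
  }
  k = std::min(k, indices.size());
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](size_t a, size_t b) {
                      return file_sizes[a] > file_sizes[b];
                    });
  return indices;
}
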
-void Version::GetOverlappingInputs(int level, - const InternalKey* begin, - const InternalKey* end, - std::vector* inputs, - int hint_index, - int* file_index) { +void VersionStorageInfo::GetOverlappingInputs( + int level, const InternalKey* begin, const InternalKey* end, + std::vector* inputs, int hint_index, int* file_index) { inputs->clear(); Slice user_begin, user_end; if (begin != nullptr) { @@ -1109,7 +1122,7 @@ void Version::GetOverlappingInputs(int level, if (file_index) { *file_index = -1; } - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + const Comparator* user_cmp = user_comparator_; if (begin != nullptr && end != nullptr && level > 0) { GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, hint_index, file_index); @@ -1149,19 +1162,15 @@ void Version::GetOverlappingInputs(int level, // Employ binary search to find at least one file that overlaps the // specified range. From that file, iterate backwards and // forwards to find all overlapping files. -void Version::GetOverlappingInputsBinarySearch( - int level, - const Slice& user_begin, - const Slice& user_end, - std::vector* inputs, - int hint_index, - int* file_index) { +void VersionStorageInfo::GetOverlappingInputsBinarySearch( + int level, const Slice& user_begin, const Slice& user_end, + std::vector* inputs, int hint_index, int* file_index) { assert(level > 0); int min = 0; int mid = 0; int max = files_[level].size() -1; bool foundOverlap = false; - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + const Comparator* user_cmp = user_comparator_; // if the caller already knows the index of a file that has overlap, // then we can skip the binary search. @@ -1200,15 +1209,12 @@ void Version::GetOverlappingInputsBinarySearch( // The midIndex specifies the index of at least one file that // overlaps the specified range. From that file, iterate backward // and forward to find all overlapping files. -// Use LevelFilesBrief in searching, make it faster -void Version::ExtendOverlappingInputs( - int level, - const Slice& user_begin, - const Slice& user_end, - std::vector* inputs, - unsigned int midIndex) { - - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); +// Use FileLevel in searching, make it faster +void VersionStorageInfo::ExtendOverlappingInputs( + int level, const Slice& user_begin, const Slice& user_end, + std::vector* inputs, unsigned int midIndex) { + + const Comparator* user_cmp = user_comparator_; const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG { @@ -1264,9 +1270,8 @@ void Version::ExtendOverlappingInputs( // an overlapping user key to the file "just outside" of it (i.e. // just after the last file, or just before the first file) // REQUIRES: "*inputs" is a sorted list of non-overlapping files -bool Version::HasOverlappingUserKey( - const std::vector* inputs, - int level) { +bool VersionStorageInfo::HasOverlappingUserKey( + const std::vector* inputs, int level) { // If inputs empty, there is no overlap. // If level == 0, it is assumed that all needed files were already included. 
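The overlap search that moves into VersionStorageInfo here relies on levels above 0 holding files that are sorted by key and mutually non-overlapping: a binary search finds one file whose range touches [begin, end], and the set of overlapping files is then widened outward from it. A minimal self-contained sketch of the same idea over simplified key ranges (FileRange and GetOverlapping are illustrative stand-ins, not the RocksDB types):

#include <cstddef>
#include <string>
#include <vector>

struct FileRange {
  std::string smallest;  // smallest user key in the file
  std::string largest;   // largest user key in the file
};

// Files must be sorted by key and non-overlapping, as in a level > 0.
// Returns the indices of every file whose range intersects [begin, end].
std::vector<size_t> GetOverlapping(const std::vector<FileRange>& files,
                                   const std::string& begin,
                                   const std::string& end) {
  std::vector<size_t> result;
  if (files.empty() || begin > end) {
    return result;
  }
  // Binary search for the first file whose largest key is >= begin.
  size_t lo = 0, hi = files.size();
  while (lo < hi) {
    size_t mid = (lo + hi) / 2;
    if (files[mid].largest < begin) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }
  // Because the files are disjoint and sorted, all overlapping files are
  // contiguous: extend forward while a file still starts at or before end.
  for (size_t i = lo; i < files.size() && files[i].smallest <= end; ++i) {
    result.push_back(i);
  }
  return result;
}
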
@@ -1274,13 +1279,13 @@ bool Version::HasOverlappingUserKey( return false; } - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); - const LevelFilesBrief& file_level = level_files_brief_[level]; + const Comparator* user_cmp = user_comparator_; + const rocksdb::LevelFilesBrief& file_level = level_files_brief_[level]; const FdWithKeyRange* files = level_files_brief_[level].files; const size_t kNumFiles = file_level.num_files; // Check the last file in inputs against the file after it - size_t last_file = FindFile(cfd_->internal_comparator(), file_level, + size_t last_file = FindFile(*internal_comparator_, file_level, inputs->back()->largest.Encode()); assert(last_file < kNumFiles); // File should exist! if (last_file < kNumFiles-1) { // If not the last file @@ -1295,7 +1300,7 @@ bool Version::HasOverlappingUserKey( } // Check the first file in inputs against the file just before it - size_t first_file = FindFile(cfd_->internal_comparator(), file_level, + size_t first_file = FindFile(*internal_comparator_, file_level, inputs->front()->smallest.Encode()); assert(first_file <= last_file); // File should exist! if (first_file > 0) { // If not first file @@ -1312,13 +1317,14 @@ bool Version::HasOverlappingUserKey( return false; } -uint64_t Version::NumLevelBytes(int level) const { +uint64_t VersionStorageInfo::NumLevelBytes(int level) const { assert(level >= 0); assert(level < NumberLevels()); return TotalFileSize(files_[level]); } -const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { +const char* VersionStorageInfo::LevelSummary( + LevelSummaryStorage* scratch) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); for (int i = 0; i < NumberLevels(); i++) { int sz = sizeof(scratch->buffer) - len; @@ -1334,8 +1340,8 @@ const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } -const char* Version::LevelFileSummary(FileSummaryStorage* scratch, - int level) const { +const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); for (const auto& f : files_[level]) { int sz = sizeof(scratch->buffer) - len; @@ -1357,7 +1363,7 @@ const char* Version::LevelFileSummary(FileSummaryStorage* scratch, return scratch->buffer; } -int64_t Version::MaxNextLevelOverlappingBytes() { +int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; for (int level = 1; level < NumberLevels() - 1; level++) { @@ -1373,8 +1379,8 @@ int64_t Version::MaxNextLevelOverlappingBytes() { } void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = files_[level]; + for (int level = 0; level < vstorage_.NumberLevels(); level++) { + const std::vector& files = vstorage_.files_[level]; for (const auto& file : files) { live->push_back(file->fd); } @@ -1383,7 +1389,7 @@ void Version::AddLiveFiles(std::vector* live) { std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < num_levels_; level++) { + for (int level = 0; level < vstorage_.num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 
'd'] @@ -1393,7 +1399,7 @@ std::string Version::DebugString(bool hex) const { r.append(" --- version# "); AppendNumberTo(&r, version_number_); r.append(" ---\n"); - const std::vector& files = files_[level]; + const std::vector& files = vstorage_.files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); AppendNumberTo(&r, files[i]->fd.GetNumber()); @@ -1427,7 +1433,7 @@ struct VersionSet::ManifestWriter { // Versions that contain full copies of the intermediate state. class VersionSet::Builder { private: - // Helper to sort v->files_ + // Helper to sort files_ in v // kLevel0 -- NewestFirstBySeqNo // kLevelNon0 -- BySmallestKey struct FileComparator { @@ -1464,19 +1470,21 @@ class VersionSet::Builder { public: Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) { base_->Ref(); - levels_ = new LevelState[base_->NumberLevels()]; + levels_ = new LevelState[base_->GetStorageInfo()->NumberLevels()]; level_zero_cmp_.sort_method = FileComparator::kLevel0; level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator(); levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_->NumberLevels(); level++) { + for (int level = 1; level < base_->GetStorageInfo()->NumberLevels(); + level++) { levels_[level].added_files = new FileSet(level_nonzero_cmp_); } } ~Builder() { - for (int level = 0; level < base_->NumberLevels(); level++) { + for (int level = 0; level < base_->GetStorageInfo()->NumberLevels(); + level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -1505,10 +1513,11 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG // make sure the files are sorted correctly - for (int level = 0; level < v->NumberLevels(); level++) { - for (size_t i = 1; i < v->files_[level].size(); i++) { - auto f1 = v->files_[level][i - 1]; - auto f2 = v->files_[level][i]; + auto* files = v->GetFiles(); + for (int level = 0; level < v->GetStorageInfo()->NumberLevels(); level++) { + for (size_t i = 1; i < files[level].size(); i++) { + auto f1 = files[level][i - 1]; + auto f2 = files[level][i]; if (level == 0) { assert(level_zero_cmp_(f1, f2)); assert(f1->largest_seqno > f2->largest_seqno); @@ -1534,8 +1543,10 @@ class VersionSet::Builder { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < base_->NumberLevels(); l++) { - const std::vector& base_files = base_->files_[l]; + auto* files = base_->GetFiles(); + for (int l = 0; !found && l < base_->GetStorageInfo()->NumberLevels(); + l++) { + const std::vector& base_files = files[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; if (f->fd.GetNumber() == number) { @@ -1547,7 +1558,8 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level+1; !found && l < base_->NumberLevels(); l++) { + for (int l = level + 1; + !found && l < base_->GetStorageInfo()->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1607,15 +1619,17 @@ class VersionSet::Builder { CheckConsistency(base_); CheckConsistency(v); - for (int level = 0; level < base_->NumberLevels(); level++) { + auto* out_files = v->GetFiles(); 
+ for (int level = 0; level < base_->GetStorageInfo()->NumberLevels(); + level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. - const auto& base_files = base_->files_[level]; + const auto& base_files = base_->GetStorageInfo()->LevelFiles(level); auto base_iter = base_files.begin(); auto base_end = base_files.end(); const auto& added_files = *levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added_files.size()); + out_files[level].reserve(base_files.size() + added_files.size()); for (const auto& added : added_files) { // Add all smaller files listed in base_ @@ -1642,7 +1656,7 @@ class VersionSet::Builder { for (auto& file_meta : *(levels_[level].added_files)) { assert (!file_meta->table_reader_handle); cfd_->table_cache()->FindTable( - base_->vset_->env_options_, cfd_->internal_comparator(), + base_->GetVersionSet()->env_options_, cfd_->internal_comparator(), file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader @@ -1658,14 +1672,16 @@ class VersionSet::Builder { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { // File is deleted: do nothing } else { - auto* files = &v->files_[level]; - if (level > 0 && !files->empty()) { + auto* files = v->GetFiles(); + auto* level_files = &files[level]; + if (level > 0 && !level_files->empty()) { // Must not overlap assert(cfd_->internal_comparator().Compare( - (*files)[files->size() - 1]->largest, f->smallest) < 0); + (*level_files)[level_files->size() - 1]->largest, + f->smallest) < 0); } f->refs++; - files->push_back(f); + level_files->push_back(f); } } }; @@ -1701,7 +1717,7 @@ VersionSet::~VersionSet() { void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { // Mark v finalized - v->finalized_ = true; + v->vstorage_.SetFinalized(); // Make "v" current assert(v->refs_ == 0); @@ -1812,7 +1828,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, { std::vector size_being_compacted; if (!edit->IsColumnFamilyManipulation()) { - size_being_compacted.resize(v->NumberLevels() - 1); + size_being_compacted.resize(v->GetStorageInfo()->NumberLevels() - 1); // calculate the amount of data being compacted at every level column_family_data->compaction_picker()->SizeBeingCompacted( size_being_compacted); @@ -2172,7 +2188,8 @@ Status VersionSet::Recover( cfd = column_family_set_->GetColumnFamily(edit.column_family_); // this should never happen since cf_in_builders is true assert(cfd != nullptr); - if (edit.max_level_ >= cfd->current()->NumberLevels()) { + if (edit.max_level_ >= + cfd->current()->GetStorageInfo()->NumberLevels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -2275,7 +2292,8 @@ Status VersionSet::Recover( builder->SaveTo(v); // Install recovered version - std::vector size_being_compacted(v->NumberLevels() - 1); + std::vector size_being_compacted( + v->GetStorageInfo()->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); AppendVersion(cfd, v); @@ -2407,7 +2425,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Version* current_version = versions.GetColumnFamilySet()->GetDefault()->current(); - int current_levels = current_version->NumberLevels(); + auto* vstorage = 
current_version->GetStorageInfo(); + int current_levels = vstorage->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -2418,7 +2437,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = current_version->NumLevelFiles(i); + int file_num = vstorage->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -2435,7 +2454,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } } - std::vector* old_files_list = current_version->files_; + std::vector* old_files_list = vstorage->GetFiles(); // we need to allocate an array with the old number of levels size to // avoid SIGSEGV in WriteSnapshot() // however, all levels bigger or equal to new_levels will be empty @@ -2449,9 +2468,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; } - delete[] current_version->files_; - current_version->files_ = new_files_list; - current_version->num_levels_ = new_levels; + delete[] vstorage -> files_; + vstorage->files_ = new_files_list; + vstorage->num_levels_ = new_levels; MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options)); VersionEdit ve; @@ -2609,7 +2628,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Version* v = new Version(cfd, this, current_version_number_++); builder->SaveTo(v); - std::vector size_being_compacted(v->NumberLevels() - 1); + std::vector size_being_compacted( + v->GetStorageInfo()->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); delete builder; @@ -2686,7 +2706,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.SetColumnFamily(cfd->GetID()); for (int level = 0; level < cfd->NumberLevels(); level++) { - for (const auto& f : cfd->current()->files_[level]) { + auto* files = cfd->current()->GetFiles(); + for (const auto& f : files[level]) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); @@ -2741,8 +2762,9 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < v->NumberLevels(); level++) { - const std::vector& files = v->files_[level]; + auto* vstorage = v->GetStorageInfo(); + for (int level = 0; level < vstorage->NumberLevels(); level++) { + const std::vector& files = vstorage->LevelFiles(level); for (size_t i = 0; i < files.size(); i++) { if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= 0) { @@ -2781,8 +2803,9 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - total_files += v->files_[level].size(); + auto* vstorage = v->GetStorageInfo(); + for (int level = 0; level < vstorage->NumberLevels(); level++) { + total_files += vstorage->LevelFiles(level).size(); } } } @@ -2794,8 +2817,9 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != 
dummy_versions; v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - for (const auto& f : v->files_[level]) { + auto* vstorage = v->GetStorageInfo(); + for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (const auto& f : vstorage->LevelFiles(level)) { live_list->push_back(f->fd); } } @@ -2851,6 +2875,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); + VersionStorageInfo* vstorage = version->GetStorageInfo(); if (c->input_version() != version) { Log(db_options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", @@ -2864,8 +2889,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // look for this file in the current version bool found = false; - for (unsigned int j = 0; j < version->files_[level].size(); j++) { - FileMetaData* f = version->files_[level][j]; + for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { + FileMetaData* f = vstorage->files_[level][j]; if (f->fd.GetNumber() == number) { found = true; break; @@ -2882,8 +2907,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // look for this file in the current version bool found = false; - for (unsigned int j = 0; j < version->files_[level].size(); j++) { - FileMetaData* f = version->files_[level][j]; + for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { + FileMetaData* f = vstorage->files_[level][j]; if (f->fd.GetNumber() == number) { found = true; break; @@ -2902,8 +2927,9 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, ColumnFamilyData** cfd) { for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); - for (int level = 0; level < version->NumberLevels(); level++) { - for (const auto& file : version->files_[level]) { + auto* vstorage = version->GetStorageInfo(); + for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (const auto& file : vstorage->LevelFiles(level)) { if (file->fd.GetNumber() == number) { *meta = file; *filelevel = level; @@ -2918,8 +2944,9 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (auto cfd : *column_family_set_) { + auto* files = cfd->current()->GetFiles(); for (int level = 0; level < cfd->NumberLevels(); level++) { - for (const auto& file : cfd->current()->files_[level]) { + for (const auto& file : files[level]) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); diff --git a/db/version_set.h b/db/version_set.h index 5a11a2f1c..98ce172e3 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -38,7 +38,9 @@ namespace rocksdb { -namespace log { class Writer; } +namespace log { +class Writer; +} class Compaction; class Iterator; @@ -81,45 +83,45 @@ extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); -class Version { +class VersionStorageInfo { public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. 
- // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - MergeIteratorBuilder* merger_iter_builder); + VersionStorageInfo(const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, + VersionStorageInfo* src_vstorage); + ~VersionStorageInfo(); - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later - // REQUIRES: lock is not held - void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, MergeContext* merge_context, - bool* value_found = nullptr); + void SetFinalized() { finalized_ = true; } + + // Update num_non_empty_levels_. + void UpdateNumNonEmptyLevels(); + + void GenerateFileIndexer() { + file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); + } + + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); + + void ComputeCompensatedSizes(); // Updates internal structures that keep track of compaction scores // We use compaction scores to figure out which compaction to do next // REQUIRES: If Version is not yet saved to current_, it can be called without // a lock. Once a version is saved to current_, call only with mutex held + // TODO find a better way to pass compaction_options_fifo. void ComputeCompactionScore( const MutableCFOptions& mutable_cf_options, + const CompactionOptionsFIFO& compaction_options_fifo, std::vector& size_being_compacted); // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); - // Update scores, pre-calculated variables. It needs to be called before - // applying the version to the version set. - void PrepareApply( - const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted); - - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - // Decrease reference count. Delete the object if no reference left - // and return true. Otherwise, return false. - bool Unref(); + int MaxInputLevel() const; // Returns true iff some level needs a compaction. 
bool NeedsCompaction() const; @@ -137,34 +139,30 @@ class Version { double CompactionScore(int idx) const { return compaction_score_[idx]; } void GetOverlappingInputs( - int level, - const InternalKey* begin, // nullptr means before all keys - const InternalKey* end, // nullptr means after all keys + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys std::vector* inputs, - int hint_index = -1, // index of overlap file - int* file_index = nullptr); // return index of overlap file + int hint_index = -1, // index of overlap file + int* file_index = nullptr); // return index of overlap file void GetOverlappingInputsBinarySearch( - int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys + int level, const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys std::vector* inputs, - int hint_index, // index of overlap file - int* file_index); // return index of overlap file + int hint_index, // index of overlap file + int* file_index); // return index of overlap file void ExtendOverlappingInputs( - int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys + int level, const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys std::vector* inputs, - unsigned int index); // start extending from this index + unsigned int index); // start extending from this index // Returns true iff some file in the specified level overlaps // some part of [*smallest_user_key,*largest_user_key]. // smallest_user_key==NULL represents a key smaller than all keys in the DB. // largest_user_key==NULL represents a key largest than all keys in the DB. - bool OverlapInLevel(int level, - const Slice* smallest_user_key, + bool OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key); // Returns true iff the first or last file in inputs contains @@ -174,7 +172,6 @@ class Version { bool HasOverlappingUserKey(const std::vector* inputs, int level); - // Return the level at which we should place a new memtable compaction // result that covers the range [smallest_user_key,largest_user_key]. int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options, @@ -198,6 +195,47 @@ class Version { // Return the combined file size of all files at the specified level. uint64_t NumLevelBytes(int level) const; + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& LevelFiles(int level) const { + assert(finalized_); + return files_[level]; + } + + const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { + return level_files_brief_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& FilesBySize(int level) const { + assert(finalized_); + return files_by_size_[level]; + } + + // REQUIRES: lock is held + // Set the index that is used to offset into files_by_size_ to find + // the next compaction candidate file. 
+ void SetNextCompactionIndex(int level, int index) { + next_file_to_compact_by_size_[level] = index; + } + + // REQUIRES: lock is held + int NextCompactionIndex(int level) const { + return next_file_to_compact_by_size_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const FileIndexer& GetIndexer() const { + assert(finalized_); + return file_indexer_; + } + + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t kNumberFilesToSort = 50; + // Return a human-readable short (single-line) summary of the number // of files per level. Uses *scratch as backing store. struct LevelSummaryStorage { @@ -211,31 +249,146 @@ class Version { // in a specified level. Uses *scratch as backing store. const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + std::vector* GetFiles() { return files_; } + // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. int64_t MaxNextLevelOverlappingBytes(); - // Add all files listed in the current version to *live. - void AddLiveFiles(std::vector* live); - // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; - // Returns the version nuber of this version - uint64_t GetVersionNumber() const { return version_number_; } - uint64_t GetAverageValueSize() const { if (accumulated_num_non_deletions_ == 0) { return 0; } assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); assert(accumulated_file_size_ > 0); - return accumulated_raw_value_size_ / - accumulated_num_non_deletions_ * + return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * accumulated_file_size_ / (accumulated_raw_key_size_ + accumulated_raw_value_size_); } + uint64_t GetEstimatedActiveKeys(); + + // re-initializes the index that is used to offset into files_by_size_ + // to find the next compaction candidate file. + void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + private: + const InternalKeyComparator* internal_comparator_; + const Comparator* user_comparator_; + int num_levels_; // Number of levels + int num_non_empty_levels_; // Number of levels. Any level larger than it + // is guaranteed to be empty. + // A short brief metadata of files per level + autovector level_files_brief_; + FileIndexer file_indexer_; + Arena arena_; // Used to allocate space for file_levels_ + + CompactionStyle compaction_style_; + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. + std::vector> files_by_size_; + + // An index into files_by_size_ that specifies the first + // file that is not yet compacted + std::vector next_file_to_compact_by_size_; + + // Only the first few entries of files_by_size_ are sorted. 
+ // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t number_of_files_to_sort_ = 50; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). + // The most critical level to be compacted is listed first + // These are used to pick the best compaction level + std::vector compaction_score_; + std::vector compaction_level_; + double max_compaction_score_ = 0.0; // max score in l1 to ln-1 + int max_compaction_score_level_ = 0; // level on which max score occurs + + // the following are the sampled temporary stats. + // the current accumulated size of sampled files. + uint64_t accumulated_file_size_; + // the current accumulated size of all raw keys based on the sampled files. + uint64_t accumulated_raw_key_size_; + // the current accumulated size of all raw keys based on the sampled files. + uint64_t accumulated_raw_value_size_; + // total number of non-deletion entries + uint64_t accumulated_num_non_deletions_; + // total number of deletion entries + uint64_t accumulated_num_deletions_; + // the number of samples + uint64_t num_samples_; + + bool finalized_; + + friend class Version; + friend class VersionSet; + friend class DBImpl; + friend class InternalStats; + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; +}; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const EnvOptions& soptions, + MergeIteratorBuilder* merger_iter_builder); + + // Lookup the value for key. If found, store it in *val and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later + // REQUIRES: lock is not held + void Get(const ReadOptions&, const LookupKey& key, std::string* val, + Status* status, MergeContext* merge_context, + bool* value_found = nullptr); + + // Update scores, pre-calculated variables. It needs to be called before + // applying the version to the version set. + void PrepareApply(const MutableCFOptions& mutable_cf_options, + std::vector& size_being_compacted); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); + + std::vector* GetFiles() { return vstorage_.GetFiles(); } + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::vector* live); + + // Return a human readable string that describes this version's contents. + std::string DebugString(bool hex = false) const; + + // Returns the version nuber of this version + uint64_t GetVersionNumber() const { return version_number_; } + + uint64_t GetAverageValueSize() const { + return vstorage_.GetAverageValueSize(); + } + // REQUIRES: lock is held // On success, "tp" will contains the table properties of the file // specified in "file_meta". 
If the file name of "file_meta" is @@ -251,77 +404,40 @@ class Version { // tables' propertis, represented as shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - uint64_t GetEstimatedActiveKeys(); + uint64_t GetEstimatedActiveKeys() { + return vstorage_.GetEstimatedActiveKeys(); + } size_t GetMemoryUsageByTableReaders(); ColumnFamilyData* cfd() const { return cfd_; } - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - const std::vector& LevelFiles(int level) const { - assert(finalized_); - return files_[level]; - } - - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - const std::vector& FilesBySize(int level) const { - assert(finalized_); - return files_by_size_[level]; - } - - const LevelFilesBrief& GetLevelFilesBrief(int level) const { - return level_files_brief_[level]; - } - - // REQUIRES: lock is held - // Set the index that is used to offset into files_by_size_ to find - // the next compaction candidate file. - void SetNextCompactionIndex(int level, int index) { - next_file_to_compact_by_size_[level] = index; - } - - // REQUIRES: lock is held - int NextCompactionIndex(int level) const { - return next_file_to_compact_by_size_[level]; - } - - // Only the first few entries of files_by_size_ are sorted. - // There is no need to sort all the files because it is likely - // that on a running system, we need to look at only the first - // few largest files because a new version is created every few - // seconds/minutes (because of concurrent compactions). - static const size_t kNumberFilesToSort = 50; // Return the next Version in the linked list. Used for debug only Version* TEST_Next() const { return next_; } - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - const FileIndexer& GetIndexer() const { - assert(finalized_); - return file_indexer_; - } + VersionStorageInfo* GetStorageInfo() { return &vstorage_; } private: friend class VersionSet; - friend class DBImpl; - friend class InternalStats; + + const InternalKeyComparator* GetInternalComparator() const { + return vstorage_.internal_comparator_; + } + const Comparator* GetUserComparator() const { + return vstorage_.user_comparator_; + } bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, const Slice& internal_prefix) const; - // Update num_non_empty_levels_. - void UpdateNumNonEmptyLevels(); - // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_mata from its associated TableProperties. // Returns true if it does initialize FileMetaData. bool MaybeInitializeFileMetaData(FileMetaData* file_meta); - // Update the accumulated stats from a file-meta. - void UpdateAccumulatedStats(FileMetaData* file_meta); - // Update the accumulated stats associated with the current version. // This accumulated stats will be used in compaction. void UpdateAccumulatedStats(); @@ -330,74 +446,26 @@ class Version { // record results in files_by_size_. The largest files are listed first. 
void UpdateFilesBySize(); + VersionSet* GetVersionSet() { return vset_; } + ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs - const InternalKeyComparator* internal_comparator_; - const Comparator* user_comparator_; + Logger* info_log_; + Statistics* db_statistics_; TableCache* table_cache_; const MergeOperator* merge_operator_; - // A short brief metadata of files per level - autovector level_files_brief_; - Logger* info_log_; - Statistics* db_statistics_; - int num_levels_; // Number of levels - int num_non_empty_levels_; // Number of levels. Any level larger than it - // is guaranteed to be empty. - FileIndexer file_indexer_; + VersionStorageInfo vstorage_; VersionSet* vset_; // VersionSet to which this Version belongs - Arena arena_; // Used to allocate space for level_files_brief_ Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version - // List of files per level, files in each level are arranged - // in increasing order of keys - std::vector* files_; - - // A list for the same set of files that are stored in files_, - // but files in each level are now sorted based on file - // size. The file with the largest size is at the front. - // This vector stores the index of the file from files_. - std::vector> files_by_size_; - - // An index into files_by_size_ that specifies the first - // file that is not yet compacted - std::vector next_file_to_compact_by_size_; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). - // The most critical level to be compacted is listed first - // These are used to pick the best compaction level - std::vector compaction_score_; - std::vector compaction_level_; - double max_compaction_score_ = 0.0; // max score in l1 to ln-1 - int max_compaction_score_level_ = 0; // level on which max score occurs - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); - // the following are the sampled temporary stats. - // the current accumulated size of sampled files. - uint64_t accumulated_file_size_; - // the current accumulated size of all raw keys based on the sampled files. - uint64_t accumulated_raw_key_size_; - // the current accumulated size of all raw keys based on the sampled files. - uint64_t accumulated_raw_value_size_; - // total number of non-deletion entries - uint64_t accumulated_num_non_deletions_; - // total number of deletion entries - uint64_t accumulated_num_deletions_; - // the number of samples - uint64_t num_samples_; - - // Used to assert APIs that are only safe to use after the version - // is finalized - bool finalized_; - ~Version(); // No copying allowed @@ -418,13 +486,12 @@ class VersionSet { // column_family_options has to be set if edit is column family add // REQUIRES: *mu is held on entry. 
// REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, - VersionEdit* edit, - port::Mutex* mu, Directory* db_directory = nullptr, - bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = - nullptr); + Status LogAndApply( + ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, VersionEdit* edit, + port::Mutex* mu, Directory* db_directory = nullptr, + bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families @@ -530,8 +597,7 @@ class VersionSet { Status GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** metadata, ColumnFamilyData** cfd); - void GetLiveFilesMetaData( - std::vector *metadata); + void GetLiveFilesMetaData(std::vector* metadata); void GetObsoleteFiles(std::vector* files); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 70f0c6a94..3ff31359b 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1125,7 +1125,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int max = -1; auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); for (int i = 0; i < default_cfd->NumberLevels(); i++) { - if (default_cfd->current()->NumLevelFiles(i)) { + if (default_cfd->current()->GetStorageInfo()->NumLevelFiles(i)) { max = i; } } diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index a253153ae..455b312fa 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -104,28 +104,29 @@ Status CompactedDBImpl::Init(const Options& options) { } version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); - const LevelFilesBrief& l0 = version_->GetLevelFilesBrief(0); + auto* vstorage = version_->GetStorageInfo(); + const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); // L0 should not have files if (l0.num_files > 1) { return Status::NotSupported("L0 contain more than 1 file"); } if (l0.num_files == 1) { - if (version_->NumNonEmptyLevels() > 1) { + if (vstorage->NumNonEmptyLevels() > 1) { return Status::NotSupported("Both L0 and other level contain files"); } files_ = l0; return Status::OK(); } - for (int i = 1; i < version_->NumNonEmptyLevels() - 1; ++i) { - if (version_->GetLevelFilesBrief(i).num_files > 0) { + for (int i = 1; i < vstorage->NumNonEmptyLevels() - 1; ++i) { + if (vstorage->LevelFilesBrief(i).num_files > 0) { return Status::NotSupported("Other levels also contain files"); } } - int level = version_->NumNonEmptyLevels() - 1; - if (version_->GetLevelFilesBrief(level).num_files > 0) { - files_ = version_->GetLevelFilesBrief(level); + int level = vstorage->NumNonEmptyLevels() - 1; + if (vstorage->LevelFilesBrief(level).num_files > 0) { + files_ = vstorage->LevelFilesBrief(level); return Status::OK(); } return Status::NotSupported("no file exists"); From c2999f54bd775ede3a37b9648b263b608f9b31fa Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 29 Oct 2014 15:29:33 -0700 Subject: [PATCH 348/829] Revert "tmp" This reverts commit 9ab0132360fbf68eb0561f7525e726d4d3a4c0f7. 
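The file this patch deletes below implemented an in-memory Env by subclassing EnvWrapper, which forwards every call to a wrapped base Env so a subclass only overrides the entry points it cares about. As a rough illustration of that wrapper pattern (not the reverted implementation, and assuming the NewWritableFile signature visible in the deleted code), a shim that merely counts writable-file creations could look like this:

#include <atomic>
#include <cstdint>
#include <memory>
#include <string>

#include "rocksdb/env.h"

namespace rocksdb {

// Illustrative only: forwards everything to the base Env and counts how
// many writable files callers open through it.
class CountingEnv : public EnvWrapper {
 public:
  explicit CountingEnv(Env* base_env) : EnvWrapper(base_env) {}

  virtual Status NewWritableFile(const std::string& fname,
                                 std::unique_ptr<WritableFile>* result,
                                 const EnvOptions& options) {
    ++new_writable_files_;
    return EnvWrapper::NewWritableFile(fname, result, options);
  }

  uint64_t new_writable_files() const { return new_writable_files_.load(); }

 private:
  std::atomic<uint64_t> new_writable_files_{0};
};

}  // namespace rocksdb
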
--- include/rocksdb/env.h | 1 - util/env_mem.cc | 367 ------------------------------------------ util/env_mem_test.cc | 231 -------------------------- 3 files changed, 599 deletions(-) delete mode 100644 util/env_mem.cc delete mode 100644 util/env_mem_test.cc diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index b0cd40ddd..70244bb31 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -798,7 +798,6 @@ class EnvWrapper : public Env { // when it is no longer needed. // *base_env must remain live while the result is in use. Env* NewMemEnv(Env* base_env); -Env* NewTestMemEnv(Env* base_env); } // namespace rocksdb diff --git a/util/env_mem.cc b/util/env_mem.cc deleted file mode 100644 index 43337da7e..000000000 --- a/util/env_mem.cc +++ /dev/null @@ -1,367 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" -#include -#include -#include -#include - -namespace rocksdb { - -namespace { - -class MemFile { - public: - enum Mode { - READ = 0, - WRITE = 1, - }; - - MemFile(Mode mode) : mode_(mode), refs_(0) {} - - void Ref() { - MutexLock lock(&mutex_); - ++refs_; - } - - void Unref() { - bool do_delete = false; - { - MutexLock lock(&mutex_); - --refs_; - assert(refs_ >= 0); - if (refs_ <= 0) { - do_delete = true; - } - } - - if (do_delete) { - delete this; - } - } - - void SetMode(Mode mode) { - mode_ = mode; - } - - uint64_t Size() const { return data_.size(); } - - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { - assert(mode_ == READ); - if (offset > Size()) { - return Status::IOError("Offset greater than file size."); - } - const uint64_t available = Size() - offset; - if (n > available) { - n = available; - } - if (n == 0) { - *result = Slice(); - return Status::OK(); - } - if (scratch) { - memcpy(scratch, &(data_[offset]), n); - *result = Slice(scratch, n); - } else { - *result = Slice(&(data_[offset]), n); - } - return Status::OK(); - } - - Status Append(const Slice& data) { - assert(mode_ == WRITE); - data_.append(data.data(), data.size()); - return Status::OK(); - } - - Status Fsync() { - return Status::OK(); - } - - private: - // Private since only Unref() should be used to delete it. - ~MemFile() { - assert(refs_ == 0); - } - - // No copying allowed. 
- MemFile(const MemFile&); - void operator=(const MemFile&); - - Mode mode_; - port::Mutex mutex_; - int refs_; // Protected by mutex_; - - std::string data_; -}; - -class SequentialFileImpl : public SequentialFile { - public: - explicit SequentialFileImpl(MemFile* file) : file_(file), pos_(0) { - file_->Ref(); - } - - ~SequentialFileImpl() { - file_->Unref(); - } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - Status s = file_->Read(pos_, n, result, scratch); - if (s.ok()) { - pos_ += result->size(); - } - return s; - } - - virtual Status Skip(uint64_t n) { - if (pos_ > file_->Size()) { - return Status::IOError("pos_ > file_->Size()"); - } - const size_t available = file_->Size() - pos_; - if (n > available) { - n = available; - } - pos_ += n; - return Status::OK(); - } - - private: - MemFile* file_; - size_t pos_; -}; - -class RandomAccessFileImpl : public RandomAccessFile { - public: - explicit RandomAccessFileImpl(MemFile* file) : file_(file) { - file_->Ref(); - } - - ~RandomAccessFileImpl() { - file_->Unref(); - } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - return file_->Read(offset, n, result, scratch); - } - - private: - MemFile* file_; -}; - -class WritableFileImpl : public WritableFile { - public: - WritableFileImpl(MemFile* file) : file_(file) { - file_->Ref(); - } - - ~WritableFileImpl() { - file_->Unref(); - } - - virtual Status Append(const Slice& data) { - return file_->Append(data); - } - - virtual Status Close() { - return Status::OK(); - } - - virtual Status Flush() { - return Status::OK(); - } - - virtual Status Sync() { - return file_->Fsync(); - } - - private: - MemFile* file_; -}; - -class TestMemDirectory : public Directory { - public: - virtual Status Fsync() { return Status::OK(); } -}; - -class TestMemEnv : public EnvWrapper { - public: - explicit TestMemEnv(Env* base_env) : EnvWrapper(base_env) { } - - virtual ~TestMemEnv() { - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ - i->second->Unref(); - } - } - - // Partial implementation of the Env interface. 
- virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { - MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { - *result = NULL; - return Status::IOError(fname, "File not found"); - } - auto* f = file_map_[fname]; - f->SetMode(MemFile::READ); - result->reset(new SequentialFileImpl(f)); - return Status::OK(); - } - - virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { - MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { - *result = NULL; - return Status::IOError(fname, "File not found"); - } - auto* f = file_map_[fname]; - f->SetMode(MemFile::READ); - result->reset(new RandomAccessFileImpl(f)); - return Status::OK(); - } - - virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { - MutexLock lock(&mutex_); - if (file_map_.find(fname) != file_map_.end()) { - DeleteFileInternal(fname); - } - MemFile* file = new MemFile(MemFile::WRITE); - file->Ref(); - file_map_[fname] = file; - - result->reset(new WritableFileImpl(file)); - return Status::OK(); - } - - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { - return Status::OK(); - } - - virtual Status NewDirectory(const std::string& name, - unique_ptr* result) { - result->reset(new TestMemDirectory()); - return Status::OK(); - } - - virtual bool FileExists(const std::string& fname) { - MutexLock lock(&mutex_); - return file_map_.find(fname) != file_map_.end(); - } - - virtual Status GetChildren(const std::string& dir, - std::vector* result) { - MutexLock lock(&mutex_); - result->clear(); - - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){ - const std::string& filename = i->first; - - if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' && - Slice(filename).starts_with(Slice(dir))) { - result->push_back(filename.substr(dir.size() + 1)); - } - } - - return Status::OK(); - } - - void DeleteFileInternal(const std::string& fname) { - if (file_map_.find(fname) == file_map_.end()) { - return; - } - - file_map_[fname]->Unref(); - file_map_.erase(fname); - } - - virtual Status DeleteFile(const std::string& fname) { - MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { - return Status::IOError(fname, "File not found"); - } - - DeleteFileInternal(fname); - return Status::OK(); - } - - virtual Status CreateDir(const std::string& dirname) { - return Status::OK(); - } - - virtual Status CreateDirIfMissing(const std::string& dirname) { - return Status::OK(); - } - - virtual Status DeleteDir(const std::string& dirname) { - return Status::OK(); - } - - virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) { - MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { - return Status::IOError(fname, "File not found"); - } - - *file_size = file_map_[fname]->Size(); - return Status::OK(); - } - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* time) { - return Status::NotSupported("getFileMTime", "Not supported in MemEnv"); - } - - virtual Status RenameFile(const std::string& src, - const std::string& target) { - MutexLock lock(&mutex_); - if (file_map_.find(src) == file_map_.end()) { - return Status::IOError(src, "File not found"); - } - - DeleteFileInternal(target); - file_map_[target] = file_map_[src]; - file_map_.erase(src); - return Status::OK(); - } - 
- virtual Status LockFile(const std::string& fname, FileLock** lock) { - *lock = new FileLock; - return Status::OK(); - } - - virtual Status UnlockFile(FileLock* lock) { - delete lock; - return Status::OK(); - } - - virtual Status GetTestDirectory(std::string* path) { - *path = "/test"; - return Status::OK(); - } - - private: - // Map from filenames to MemFile objects, representing a simple file system. - typedef std::map FileSystem; - port::Mutex mutex_; - FileSystem file_map_; // Protected by mutex_. -}; - -} // namespace - -Env* NewTestMemEnv(Env* base_env) { - return new TestMemEnv(base_env); -} - -} // namespace rocksdb diff --git a/util/env_mem_test.cc b/util/env_mem_test.cc deleted file mode 100644 index ea3ed61a0..000000000 --- a/util/env_mem_test.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_impl.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/testharness.h" -#include -#include -#include - -namespace rocksdb { - -class MemEnvTest { - public: - Env* env_; - const EnvOptions soptions_; - - MemEnvTest() - : env_(NewMemEnv(Env::Default())) { - } - ~MemEnvTest() { - delete env_; - } -}; - -TEST(MemEnvTest, Basics) { - uint64_t file_size; - unique_ptr writable_file; - std::vector children; - - ASSERT_OK(env_->CreateDir("/dir")); - - // Check that the directory is empty. - ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); - ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); - ASSERT_OK(env_->GetChildren("/dir", &children)); - ASSERT_EQ(0U, children.size()); - - // Create a file. - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); - writable_file.reset(); - - // Check that the file exists. - ASSERT_TRUE(env_->FileExists("/dir/f")); - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); - ASSERT_EQ(0U, file_size); - ASSERT_OK(env_->GetChildren("/dir", &children)); - ASSERT_EQ(1U, children.size()); - ASSERT_EQ("f", children[0]); - - // Write to the file. - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); - ASSERT_OK(writable_file->Append("abc")); - writable_file.reset(); - - // Check for expected size. - ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); - ASSERT_EQ(3U, file_size); - - // Check that renaming works. - ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); - ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); - ASSERT_TRUE(!env_->FileExists("/dir/f")); - ASSERT_TRUE(env_->FileExists("/dir/g")); - ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); - ASSERT_EQ(3U, file_size); - - // Check that opening non-existent file fails. - unique_ptr seq_file; - unique_ptr rand_file; - ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file, - soptions_).ok()); - ASSERT_TRUE(!seq_file); - ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file, - soptions_).ok()); - ASSERT_TRUE(!rand_file); - - // Check that deleting works. 
- ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); - ASSERT_OK(env_->DeleteFile("/dir/g")); - ASSERT_TRUE(!env_->FileExists("/dir/g")); - ASSERT_OK(env_->GetChildren("/dir", &children)); - ASSERT_EQ(0U, children.size()); - ASSERT_OK(env_->DeleteDir("/dir")); -} - -TEST(MemEnvTest, ReadWrite) { - unique_ptr writable_file; - unique_ptr seq_file; - unique_ptr rand_file; - Slice result; - char scratch[100]; - - ASSERT_OK(env_->CreateDir("/dir")); - - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); - ASSERT_OK(writable_file->Append("hello ")); - ASSERT_OK(writable_file->Append("world")); - writable_file.reset(); - - // Read sequentially. - ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); - ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". - ASSERT_EQ(0, result.compare("hello")); - ASSERT_OK(seq_file->Skip(1)); - ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". - ASSERT_EQ(0, result.compare("world")); - ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. - ASSERT_EQ(0U, result.size()); - ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. - ASSERT_OK(seq_file->Read(1000, &result, scratch)); - ASSERT_EQ(0U, result.size()); - - // Random reads. - ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); - ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". - ASSERT_EQ(0, result.compare("world")); - ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". - ASSERT_EQ(0, result.compare("hello")); - ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". - ASSERT_EQ(0, result.compare("d")); - - // Too high offset. - ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); -} - -TEST(MemEnvTest, Locks) { - FileLock* lock; - - // These are no-ops, but we test they return success. - ASSERT_OK(env_->LockFile("some file", &lock)); - ASSERT_OK(env_->UnlockFile(lock)); -} - -TEST(MemEnvTest, Misc) { - std::string test_dir; - ASSERT_OK(env_->GetTestDirectory(&test_dir)); - ASSERT_TRUE(!test_dir.empty()); - - unique_ptr writable_file; - ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); - - // These are no-ops, but we test they return success. - ASSERT_OK(writable_file->Sync()); - ASSERT_OK(writable_file->Flush()); - ASSERT_OK(writable_file->Close()); - writable_file.reset(); -} - -TEST(MemEnvTest, LargeWrite) { - const size_t kWriteSize = 300 * 1024; - char* scratch = new char[kWriteSize * 2]; - - std::string write_data; - for (size_t i = 0; i < kWriteSize; ++i) { - write_data.append(1, static_cast(i)); - } - - unique_ptr writable_file; - ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); - ASSERT_OK(writable_file->Append("foo")); - ASSERT_OK(writable_file->Append(write_data)); - writable_file.reset(); - - unique_ptr seq_file; - Slice result; - ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); - ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
- ASSERT_EQ(0, result.compare("foo")); - - size_t read = 0; - std::string read_data; - while (read < kWriteSize) { - ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); - read_data.append(result.data(), result.size()); - read += result.size(); - } - ASSERT_TRUE(write_data == read_data); - delete [] scratch; -} - -TEST(MemEnvTest, DBTest) { - Options options; - options.create_if_missing = true; - options.env = env_; - DB* db; - - const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; - const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - - ASSERT_OK(DB::Open(options, "/dir/db", &db)); - for (size_t i = 0; i < 3; ++i) { - ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); - } - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - - Iterator* iterator = db->NewIterator(ReadOptions()); - iterator->SeekToFirst(); - for (size_t i = 0; i < 3; ++i) { - ASSERT_TRUE(iterator->Valid()); - ASSERT_TRUE(keys[i] == iterator->key()); - ASSERT_TRUE(vals[i] == iterator->value()); - iterator->Next(); - } - ASSERT_TRUE(!iterator->Valid()); - delete iterator; - - DBImpl* dbi = reinterpret_cast(db); - ASSERT_OK(dbi->TEST_FlushMemTable()); - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - - delete db; -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} From 17be187ff9f57dede2ee152f07a17a1d100fd4cb Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 15:38:34 -0700 Subject: [PATCH 349/829] dummy var to suppress compiler warning/error Summary: Revmoed this in D25641, causing compiler complain. put it back Test Plan: make release Reviewers: igor, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27891 --- util/env_posix.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 84c9e558e..e44ebc83e 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -739,7 +739,8 @@ class PosixWritableFile : public WritableFile { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, // but it will be nice to log these errors. - ftruncate(fd_, filesize_); + int dummy __attribute__((unused)); + dummy = ftruncate(fd_, filesize_); #ifdef ROCKSDB_FALLOCATE_PRESENT // in some file systems, ftruncate only trims trailing space if the // new file size is smaller than the current size. Calling fallocate From 86de2007b8f2068899f8f1d3aa5bd8b6bd11f9d5 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 29 Oct 2014 13:49:45 -0700 Subject: [PATCH 350/829] Add ComparatorDBTest to test non-default comparators Summary: Add some helper functions to make sure DB works well for non-default comparators. Add a test for SimpleSuffixReverseComparator. 
Test Plan: Run the new test Reviewers: ljin, rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27831 --- Makefile | 4 + db/comparator_db_test.cc | 260 ++++++++++++++++++++++++++++++++++++++ db/plain_table_db_test.cc | 32 +---- util/testutil.h | 30 +++++ 4 files changed, 296 insertions(+), 30 deletions(-) create mode 100644 db/comparator_db_test.cc diff --git a/Makefile b/Makefile index 6b11012c2..52019a17f 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,7 @@ TESTS = \ redis_test \ reduce_levels_test \ plain_table_db_test \ + comparator_db_test \ prefix_test \ skiplist_test \ stringappend_test \ @@ -384,6 +385,9 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc new file mode 100644 index 000000000..ea24a30a5 --- /dev/null +++ b/db/comparator_db_test.cc @@ -0,0 +1,260 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { +namespace { + +static const Comparator* comparator; + +// A comparator for std::map, using comparator +struct MapComparator { + bool operator()(const std::string& a, const std::string& b) const { + return comparator->Compare(a, b) < 0; + } +}; + +typedef std::map KVMap; + +class KVIter : public Iterator { + public: + explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); } + virtual void Next() { ++iter_; } + virtual void Prev() { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + + private: + const KVMap* const map_; + KVMap::const_iterator iter_; +}; + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} + +// Measuring operations on DB (expect to be empty). +// source_strings are candidate keys +void DoRandomIteraratorTest(DB* db, std::vector source_strings, + Random* rnd, int num_writes, int num_iter_ops, + int num_trigger_flush) { + KVMap map; + + for (int i = 0; i < num_writes; i++) { + if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) { + db->Flush(FlushOptions()); + } + + int type = rnd->Uniform(2); + int index = rnd->Uniform(source_strings.size()); + auto& key = source_strings[index]; + switch (type) { + case 0: + // put + map[key] = key; + ASSERT_OK(db->Put(WriteOptions(), key, key)); + break; + case 1: + // delete + if (map.find(key) != map.end()) { + map.erase(key); + } + ASSERT_OK(db->Delete(WriteOptions(), key)); + break; + default: + assert(false); + } + } + + std::unique_ptr iter(db->NewIterator(ReadOptions())); + std::unique_ptr result_iter(new KVIter(&map)); + + bool is_valid = false; + for (int i = 0; i < num_iter_ops; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd->Uniform(6); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd->Uniform(source_strings.size()); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + case 4: + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + default: { + assert(type == 5); + auto key_idx = rnd->Uniform(source_strings.size()); + auto key = source_strings[key_idx]; + std::string result; + auto status = db->Get(ReadOptions(), key, &result); + if (map.find(key) 
== map.end()) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_EQ(map[key], result); + } + break; + } + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } +} +} // namespace + +class ComparatorDBTest { + private: + std::string dbname_; + Env* env_; + DB* db_; + Options last_options_; + std::unique_ptr comparator_guard; + + public: + ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { + comparator = BytewiseComparator(); + dbname_ = test::TmpDir() + "/comparator_db_test"; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + ~ComparatorDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + comparator = BytewiseComparator(); + } + + DB* GetDB() { return db_; } + + void SetOwnedComparator(const Comparator* cmp) { + comparator_guard.reset(cmp); + comparator = cmp; + last_options_.comparator = cmp; + } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } +}; + +TEST(ComparatorDBTest, Bytewise) { + for (int rand_seed = 301; rand_seed < 306; rand_seed++) { + DestroyAndReopen(); + Random rnd(rand_seed); + DoRandomIteraratorTest(GetDB(), + {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd, + 8, 100, 3); + } +} + +TEST(ComparatorDBTest, SimpleSuffixReverseComparator) { + SetOwnedComparator(new test::SimpleSuffixReverseComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + std::vector source_prefixes; + // Randomly generate 5 prefixes + for (int i = 0; i < 5; i++) { + source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); + } + for (int j = 0; j < 20; j++) { + int prefix_index = rnd.Uniform(source_prefixes.size()); + std::string key = source_prefixes[prefix_index] + + test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); + source_strings.push_back(key); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66); + } +} +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 1750d265c..81a5d9989 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -696,40 +696,12 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { delete iter; } -// A test comparator which compare two strings in this way: -// (1) first compare prefix of 8 bytes in alphabet order, -// (2) if two strings share the same prefix, sort the other part of the string -// in the reverse alphabet order. 
-class SimpleSuffixReverseComparator : public Comparator { - public: - SimpleSuffixReverseComparator() {} - - virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } - - virtual int Compare(const Slice& a, const Slice& b) const { - Slice prefix_a = Slice(a.data(), 8); - Slice prefix_b = Slice(b.data(), 8); - int prefix_comp = prefix_a.compare(prefix_b); - if (prefix_comp != 0) { - return prefix_comp; - } else { - Slice suffix_a = Slice(a.data() + 8, a.size() - 8); - Slice suffix_b = Slice(b.data() + 8, b.size() - 8); - return -(suffix_a.compare(suffix_b)); - } - } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const {} - - virtual void FindShortSuccessor(std::string* key) const {} -}; - TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { Options options = CurrentOptions(); options.create_if_missing = true; // Set only one bucket to force bucket conflict. // Test index interval for the same prefix to be 1, 2 and 4 - SimpleSuffixReverseComparator comp; + test::SimpleSuffixReverseComparator comp; options.comparator = ∁ DestroyAndReopen(&options); @@ -892,7 +864,7 @@ TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { for (unsigned char i = 1; i <= 3; i++) { Options options = CurrentOptions(); options.create_if_missing = true; - SimpleSuffixReverseComparator comp; + test::SimpleSuffixReverseComparator comp; options.comparator = ∁ // Set only one bucket to force bucket conflict. // Test index interval for the same prefix to be 1, 2 and 4 diff --git a/util/testutil.h b/util/testutil.h index eff0d7e7d..b489e9175 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -78,6 +78,36 @@ class PlainInternalKeyComparator : public InternalKeyComparator { } }; +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. +// This helps simulate the case of compounded key of [entity][timestamp] and +// latest timestamp first. +class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + // Returns a user key comparator that can be used for comparing two uint64_t // slices. Instead of comparing slices byte-wise, it compares all the 8 bytes // at once. Assumes same endian-ness is used though the database's lifetime. From 5c82a8837e226e6f6fe40405908c3a0a424930b4 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 29 Oct 2014 16:45:07 -0700 Subject: [PATCH 351/829] Add a test in compaction_picker_test to test the max score Summary: Add a new unit test in compaction_picker_test to make sure level-based compaction to pick up the level with the largest score. 
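As a worked example of the score computation the new test exercises (file sizes are the ones added in the diff below; the per-level targets assume max_bytes_for_level_base = 10MB with max_bytes_for_level_multiplier = 10, which is what the in-test score comments imply):

    L1: 6MB + 6MB           = 12MB     vs 10MB target    -> score ~1.2
    L2: 60MB + 60MB + 60MB  ~= 180MB   vs 100MB target   -> score ~1.8 (highest)
    L3: 4 x 260MB           ~= 1040MB  vs 1000MB target  -> score ~1.04

L2 therefore has the largest score, and within L2 file 7 (60000001 bytes, the largest file in that level) is expected as the compaction input, which is exactly what the new assertions verify.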
Test Plan: Run the new test Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27933 --- db/compaction_picker_test.cc | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 81bffe0af..f4417b8b5 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -129,9 +129,9 @@ TEST(CompactionPickerTest, Level1Trigger) { TEST(CompactionPickerTest, Level1Trigger2) { Add(1, 66U, "150", "200", 1000000000U); Add(1, 88U, "201", "300", 1000000000U); - Add(2, 6U, "150", "180", 1000000000U); + Add(2, 6U, "150", "179", 1000000000U); Add(2, 7U, "180", "220", 1000000000U); - Add(2, 8U, "220", "300", 1000000000U); + Add(2, 8U, "221", "300", 1000000000U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( @@ -144,6 +144,31 @@ TEST(CompactionPickerTest, Level1Trigger2) { ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); } +TEST(CompactionPickerTest, LevelMaxScore) { + mutable_cf_options.target_file_size_base = 10000000; + mutable_cf_options.target_file_size_multiplier = 10; + Add(0, 1U, "150", "200", 1000000000U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 6000000U); + Add(1, 88U, "201", "300", 6000000U); + // Level 2 score 1.8. File 7 is the largest. Should be picked + Add(2, 6U, "150", "179", 60000000U); + Add(2, 7U, "180", "220", 60000001U); + Add(2, 8U, "221", "300", 60000000U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name, mutable_cf_options, &vstorage, &log_buffer)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From 065766b8d21ffee012cf7dd2c25404057449a54e Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 29 Oct 2014 17:02:21 -0700 Subject: [PATCH 352/829] DynamicCompactionOptions: relax the check bound a little Summary: Increase the level size so that impact of a single file is smaller. 
Also relax the bound Test Plan: ran locally Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27939 --- db/db_test.cc | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index b79758b0d..477c6c812 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8638,7 +8638,7 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) { count++; } - ASSERT_TRUE(count > (128 * 0.9) && count < (128 * 1.1)); + ASSERT_TRUE(count > (128 * 0.8) && count < (128 * 1.2)); sleeping_task_low1.WakeUp(); sleeping_task_low1.WaitUntilDone(); @@ -8657,7 +8657,7 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { count++; } - ASSERT_TRUE(count > (512 * 0.9) && count < (512 * 1.1)); + ASSERT_TRUE(count > (512 * 0.8) && count < (512 * 1.2)); sleeping_task_low2.WakeUp(); sleeping_task_low2.WaitUntilDone(); @@ -8675,7 +8675,7 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { count++; } - ASSERT_TRUE(count > (256 * 0.9) && count < (256 * 1.1)); + ASSERT_TRUE(count > (256 * 0.8) && count < (256 * 1.2)); sleeping_task_low3.WakeUp(); sleeping_task_low3.WaitUntilDone(); } @@ -8685,7 +8685,7 @@ TEST(DBTest, DynamicCompactionOptions) { const uint64_t k32KB = 1 << 15; const uint64_t k64KB = 1 << 16; const uint64_t k128KB = 1 << 17; - const uint64_t k256KB = 1 << 18; + const uint64_t k1MB = 1 << 20; const uint64_t k4KB = 1 << 12; Options options; options.env = env_; @@ -8763,19 +8763,19 @@ TEST(DBTest, DynamicCompactionOptions) { // fill L1 and L2. L1 size should be around 256KB while L2 size should be // around 256KB x 4. ASSERT_TRUE(dbfull()->SetOptions({ - {"max_bytes_for_level_base", std::to_string(k256KB) } + {"max_bytes_for_level_base", std::to_string(k1MB) } })); - // writing 24 x 64KB => 6 * 256KB - // (L1 + L2) = (1 + 4) * 256KB - for (int i = 0; i < 24; ++i) { - gen_l0_kb(i, 64, 32); + // writing 96 x 64KB => 6 * 1024KB + // (L1 + L2) = (1 + 4) * 1024KB + for (int i = 0; i < 96; ++i) { + gen_l0_kb(i, 64, 96); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > k256KB * 0.8 && - SizeAtLevel(1) < k256KB * 1.2); - ASSERT_TRUE(SizeAtLevel(2) > 4 * k256KB * 0.8 && - SizeAtLevel(2) < 4 * k256KB * 1.2); + ASSERT_TRUE(SizeAtLevel(1) > k1MB * 0.5 && + SizeAtLevel(1) < k1MB * 1.5); + ASSERT_TRUE(SizeAtLevel(2) > 4 * k1MB * 0.5 && + SizeAtLevel(2) < 4 * k1MB * 1.5); // Test max_bytes_for_level_multiplier and // max_bytes_for_level_base. Now, reduce both mulitplier and level base, @@ -8792,9 +8792,9 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) < k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(2) < 2 * k128KB * 1.2); - ASSERT_TRUE(SizeAtLevel(3) < 4 * k128KB * 1.2); + uint64_t total_size = + SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); + ASSERT_TRUE(total_size < k128KB * 7 * 1.5); // Test level0_stop_writes_trigger. // Clean up memtable and L0. Block compaction threads. If continue to write @@ -8883,7 +8883,7 @@ TEST(DBTest, DynamicCompactionOptions) { // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction // at the same time, we should see some level with score greater than 2. 
ASSERT_TRUE(dbfull()->SetOptions({ - {"max_bytes_for_level_base", std::to_string(k256KB) } + {"max_bytes_for_level_base", std::to_string(k1MB) } })); // writing 40 x 64KB = 10 x 256KB // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB @@ -8891,12 +8891,12 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 64, 32); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > k256KB * 0.8 && - SizeAtLevel(1) < k256KB * 1.2); - ASSERT_TRUE(SizeAtLevel(2) > 2 * k256KB * 0.8 && - SizeAtLevel(2) < 2 * k256KB * 1.2); - ASSERT_TRUE(SizeAtLevel(3) > 4 * k256KB * 0.8 && - SizeAtLevel(3) < 4 * k256KB * 1.2); + ASSERT_TRUE((SizeAtLevel(1) > k1MB * 0.8 && + SizeAtLevel(1) < k1MB * 1.2) || + (SizeAtLevel(2) > 2 * k1MB * 0.8 && + SizeAtLevel(2) < 2 * k1MB * 1.2) || + (SizeAtLevel(3) > 4 * k1MB * 0.8 && + SizeAtLevel(3) < 4 * k1MB * 1.2)); // Reduce max_bytes_for_level_base and disable compaction at the same time // This should cause score to increase ASSERT_TRUE(dbfull()->SetOptions({ From bbd9c53457141e96db841af3cf284595f66346de Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:08:20 -0700 Subject: [PATCH 353/829] Apply InfoLogLevel to the logs in table/block_based_table_builder.cc Summary: Apply InfoLogLevel to the logs in table/block_based_table_builder.cc Test Plan: make Reviewers: igor, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27921 --- table/block_based_table_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 9e4328cd4..c053e7e4f 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -790,7 +790,7 @@ Status BlockBasedTableBuilder::Finish() { } } - Log(r->ioptions.info_log, + Log(InfoLogLevel::INFO_LEVEL, r->ioptions.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", From e7ad69b9fe4ac6ca2b24a4cb5237d2e7832a4a7f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:08:40 -0700 Subject: [PATCH 354/829] Apply InfoLogLevel to the logs in table/plain_table_index.cc Summary: Apply InfoLogLevel to the logs in table/plain_table_index.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27909 --- table/plain_table_index.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index 61f9e335b..8f68525c4 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -93,7 +93,8 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. 
@@ -147,7 +148,8 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(ioptions_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, + "Reserving %zu bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( @@ -191,7 +193,8 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(ioptions_.info_log, "hash table size: %d, suffix_map length %zu", + Log(InfoLogLevel::DEBUG_INFO, ioptions_.info_log, + "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } From 6afafa36949e0ea72f1be1ed80f28507bcc05835 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:10:21 -0700 Subject: [PATCH 355/829] Apply InfoLogLevel to the logs in utilities/merge_operators/uint64add.cc Summary: Apply InfoLogLevel to the logs and add missing copy-right information to utilities/merge_operators/uint64add.cc. Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27897 --- utilities/merge_operators/uint64add.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index 9d78651ec..d5083e300 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -1,3 +1,8 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + #include #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -45,7 +50,8 @@ class UInt64AddOperator : public AssociativeMergeOperator { result = DecodeFixed64(value.data()); } else if (logger != nullptr) { // If value is corrupted, treat it as 0 - Log(logger, "uint64 value corruption, size: %zu > %zu", + Log(InfoLogLevel::ERROR_LEVEL, logger, + "uint64 value corruption, size: %zu > %zu", value.size(), sizeof(uint64_t)); } From c3dd0f75da05b3dfdea7e7d9b6add457f5af0923 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 29 Oct 2014 16:17:28 -0700 Subject: [PATCH 356/829] comparator_db_test to cover more irregular comparators Summary: comparator_db_test now adds verification for three more comparators: (1) one that store double as string (2) one that cast uint64 to string (3) one that concatenate two strings, prefixing their sizes. 
(4) one that order by hash of the string Test Plan: Run ./comparator_db_test Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27927 --- db/comparator_db_test.cc | 174 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index ea24a30a5..548c495cb 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -9,6 +9,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "util/hash.h" #include "util/testharness.h" #include "util/testutil.h" #include "utilities/merge_operators.h" @@ -165,6 +166,84 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, is_valid = iter->Valid(); } } + +class DoubleComparator : public Comparator { + public: + DoubleComparator() {} + + virtual const char* Name() const { return "DoubleComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + double da = std::stod(a.ToString()); + double db = std::stod(b.ToString()); + if (da == db) { + return a.compare(b); + } else if (da > db) { + return 1; + } else { + return -1; + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +class HashComparator : public Comparator { + public: + HashComparator() {} + + virtual const char* Name() const { return "HashComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + uint32_t ha = Hash(a.data(), a.size(), 66); + uint32_t hb = Hash(b.data(), b.size(), 66); + if (ha == hb) { + return a.compare(b); + } else if (ha > hb) { + return 1; + } else { + return -1; + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +class TwoStrComparator : public Comparator { + public: + TwoStrComparator() {} + + virtual const char* Name() const { return "TwoStrComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + assert(a.size() >= 2); + assert(b.size() >= 2); + size_t size_a1 = static_cast(a[0]); + size_t size_b1 = static_cast(b[0]); + size_t size_a2 = static_cast(a[1]); + size_t size_b2 = static_cast(b[1]); + assert(size_a1 + size_a2 + 2 == a.size()); + assert(size_b1 + size_b2 + 2 == b.size()); + + Slice a1 = Slice(a.data() + 2, size_a1); + Slice b1 = Slice(b.data() + 2, size_b1); + Slice a2 = Slice(a.data() + 2 + size_a1, size_a2); + Slice b2 = Slice(b.data() + 2 + size_b1, size_b2); + + if (a1 != b1) { + return a1.compare(b1); + } + return a2.compare(b2); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; } // namespace class ComparatorDBTest { @@ -255,6 +334,101 @@ TEST(ComparatorDBTest, SimpleSuffixReverseComparator) { DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66); } } + +TEST(ComparatorDBTest, Uint64Comparator) { + SetOwnedComparator(test::Uint64Comparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + Random64 rnd64(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint64_t r = rnd64.Next(); + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&r), 8); + 
source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, DoubleComparator) { + SetOwnedComparator(new DoubleComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint32_t r = rnd.Next(); + uint32_t divide_order = rnd.Uniform(8); + double to_divide = 1.0; + for (uint32_t j = 0; j < divide_order; j++) { + to_divide *= 10.0; + } + source_strings.push_back(std::to_string(r / to_divide)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, HashComparator) { + SetOwnedComparator(new HashComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + source_strings.push_back(test::RandomKey(&rnd, 8)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, TwoStrComparator) { + SetOwnedComparator(new TwoStrComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + std::string str; + uint32_t size1 = rnd.Uniform(8); + uint32_t size2 = rnd.Uniform(8); + str.append(1, static_cast(size1)); + str.append(1, static_cast(size2)); + str.append(test::RandomKey(&rnd, size1)); + str.append(test::RandomKey(&rnd, size2)); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From fd95745a59f5fc6e0e76b1395314097162486f7b Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:42:38 -0700 Subject: [PATCH 357/829] Fix compile error in table/plain_table_index.cc Summary: Fix compile error in table/plain_table_index.cc Test Plan: make --- table/plain_table_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index 8f68525c4..b5e3981c1 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -193,7 +193,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(InfoLogLevel::DEBUG_INFO, ioptions_.info_log, + Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); From 635905481d5d0f29010dfadfda20b6a632215a3f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 29 Oct 2014 17:43:37 -0700 Subject: [PATCH 358/829] WalManager Summary: Decoupling code that deals with archived log files outside of DBImpl. That will make this code easier to reason about and test. It will also make the code easier to improve, because an improver doesn't have to understand DBImpl code in entirety. 
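Read as an interface, the new class is roughly the following; the signatures are reconstructed from the DBImpl call sites visible in the diff below, so the exact declarations in db/wal_manager.h may differ slightly:

    class WalManager {
     public:
      WalManager(const DBOptions& db_options, const EnvOptions& env_options);

      // Backs DBImpl::GetSortedWalFiles(): lists live plus archived WAL files.
      Status GetSortedWalFiles(VectorLogPtr& files);

      // Backs DBImpl::GetUpdatesSince(): picks the probable WAL files and returns
      // a TransactionLogIterator positioned at seq.
      Status GetUpdatesSince(SequenceNumber seq,
                             std::unique_ptr<TransactionLogIterator>* iter,
                             const TransactionLogIterator::ReadOptions& read_options,
                             VersionSet* version_set);

      // Called from PurgeObsoleteFiles() when WAL_ttl_seconds or WAL_size_limit_MB
      // is set: moves an obsolete WAL into the archive directory.
      void ArchiveWALFile(const std::string& fname, uint64_t number);

      // The TTL- and size-based archive cleanup that previously lived in DBImpl.
      void PurgeObsoleteWALFiles();
    };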
Test Plan: added test Reviewers: ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27873 --- Makefile | 6 +- db/db_filesnapshot.cc | 51 +---- db/db_impl.cc | 366 ++---------------------------- db/db_impl.h | 53 +---- db/db_impl_debug.cc | 13 -- db/db_test.cc | 191 +--------------- db/transaction_log_impl.cc | 27 ++- db/transaction_log_impl.h | 8 +- db/wal_manager.cc | 445 +++++++++++++++++++++++++++++++++++++ db/wal_manager.h | 95 ++++++++ db/wal_manager_test.cc | 279 +++++++++++++++++++++++ 11 files changed, 873 insertions(+), 661 deletions(-) create mode 100644 db/wal_manager.cc create mode 100644 db/wal_manager.h create mode 100644 db/wal_manager_test.cc diff --git a/Makefile b/Makefile index 52019a17f..8642834b8 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,8 @@ TESTS = \ cuckoo_table_reader_test \ cuckoo_table_db_test \ write_batch_with_index_test \ - flush_job_test + flush_job_test \ + wal_manager_test TOOLS = \ sst_dump \ @@ -421,6 +422,9 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index eeee99c1b..48819e766 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -132,57 +132,8 @@ Status DBImpl::GetLiveFiles(std::vector& ret, } Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { - // First get sorted files in db dir, then get sorted files from archived - // dir, to avoid a race condition where a log file is moved to archived - // dir in between. - Status s; - // list wal files in main db dir. - VectorLogPtr logs; - s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); - if (!s.ok()) { - return s; - } - - // Reproduce the race condition where a log file is moved - // to archived dir, between these two sync points, used in - // (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1"); - TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2"); - - files.clear(); - // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(db_options_.wal_dir); - if (env_->FileExists(archivedir)) { - s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); - if (!s.ok()) { - return s; - } - } - - uint64_t latest_archived_log_number = 0; - if (!files.empty()) { - latest_archived_log_number = files.back()->LogNumber(); - Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "Latest Archived log: %" PRIu64, latest_archived_log_number); - } - - files.reserve(files.size() + logs.size()); - for (auto& log : logs) { - if (log->LogNumber() > latest_archived_log_number) { - files.push_back(std::move(log)); - } else { - // When the race condition happens, we could see the - // same log in both db dir and archived dir. Simply - // ignore the one in db dir. Note that, if we read - // archived dir first, we would have missed the log file. 
- Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, - "%s already moved to archive", log->PathName().c_str()); - } - } - - return s; + return wal_manager_.GetSortedWalFiles(files); } - } #endif // ROCKSDB_LITE diff --git a/db/db_impl.cc b/db/db_impl.cc index 345188703..78fb4ce13 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -342,11 +342,12 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) manual_compaction_(nullptr), disable_delete_obsolete_files_(0), delete_obsolete_files_last_run_(options.env->NowMicros()), - purge_wal_files_last_run_(0), last_stats_dump_time_microsec_(0), - default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), env_options_(options), +#ifndef ROCKSDB_LITE + wal_manager_(db_options_, env_options_), +#endif // ROCKSDB_LITE bg_work_gate_closed_(false), refitting_level_(false), opened_successfully_(false) { @@ -738,23 +739,20 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { db_options_.wal_dir : dbname_) + "/" + to_delete; } - if (type == kLogFile && - (db_options_.WAL_ttl_seconds > 0 || - db_options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); - // The sync point below is used in (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); - Status s = env_->RenameFile(fname, archived_log_name); - // The sync point below is used in (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(db_options_.info_log, - "Move log file %s to %s -- %s\n", - fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); +#ifdef ROCKSDB_LITE + Status s = env_->DeleteFile(fname); + Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + fname.c_str(), type, number, s.ToString().c_str()); +#else // not ROCKSDB_LITE + if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 || + db_options_.WAL_size_limit_MB > 0)) { + wal_manager_.ArchiveWALFile(fname, number); } else { Status s = env_->DeleteFile(fname); Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } +#endif // ROCKSDB_LITE } // Delete old info log files. @@ -775,7 +773,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { } } } - PurgeObsoleteWALFiles(); +#ifndef ROCKSDB_LITE + wal_manager_.PurgeObsoleteWALFiles(); +#endif // ROCKSDB_LITE LogFlush(db_options_.info_log); } @@ -788,324 +788,6 @@ void DBImpl::DeleteObsoleteFiles() { } } -#ifndef ROCKSDB_LITE -// 1. Go through all archived files and -// a. if ttl is enabled, delete outdated files -// b. if archive size limit is enabled, delete empty files, -// compute file number and size. -// 2. If size limit is enabled: -// a. compute how many files should be deleted -// b. get sorted non-empty archived logs -// c. delete what should be deleted -void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; - if (!ttl_enabled && !size_limit_enabled) { - return; - } - - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); - if (!s.ok()) { - Log(db_options_.info_log, "Can't get current time: %s", - s.ToString().c_str()); - assert(false); - return; - } - uint64_t const now_seconds = static_cast(current_time); - uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? 
- db_options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; - - if (purge_wal_files_last_run_ + time_to_check > now_seconds) { - return; - } - - purge_wal_files_last_run_ = now_seconds; - - std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); - std::vector files; - s = env_->GetChildren(archival_dir, &files); - if (!s.ok()) { - Log(db_options_.info_log, "Can't get archive files: %s", - s.ToString().c_str()); - assert(false); - return; - } - - size_t log_files_num = 0; - uint64_t log_file_size = 0; - - for (auto& f : files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { - std::string const file_path = archival_dir + "/" + f; - if (ttl_enabled) { - uint64_t file_m_time; - Status const s = env_->GetFileModificationTime(file_path, - &file_m_time); - if (!s.ok()) { - Log(db_options_.info_log, "Can't get file mod time: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } - if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { - Status const s = env_->DeleteFile(file_path); - if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(number); - } - continue; - } - } - - if (size_limit_enabled) { - uint64_t file_size; - Status const s = env_->GetFileSize(file_path, &file_size); - if (!s.ok()) { - Log(db_options_.info_log, "Can't get file size: %s: %s", - file_path.c_str(), s.ToString().c_str()); - return; - } else { - if (file_size > 0) { - log_file_size = std::max(log_file_size, file_size); - ++log_files_num; - } else { - Status s = env_->DeleteFile(file_path); - if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(number); - } - } - } - } - } - } - - if (0 == log_files_num || !size_limit_enabled) { - return; - } - - size_t const files_keep_num = db_options_.WAL_size_limit_MB * - 1024 * 1024 / log_file_size; - if (log_files_num <= files_keep_num) { - return; - } - - size_t files_del_num = log_files_num - files_keep_num; - VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { - Log(db_options_.info_log, "Trying to delete more archived log files than " - "exist. 
Deleting all"); - files_del_num = archived_logs.size(); - } - - for (size_t i = 0; i < files_del_num; ++i) { - std::string const file_path = archived_logs[i]->PathName(); - Status const s = DeleteFile(file_path); - if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(archived_logs[i]->LogNumber()); - } - } -} - -namespace { -struct CompareLogByPointer { - bool operator()(const unique_ptr& a, const unique_ptr& b) { - LogFileImpl* a_impl = dynamic_cast(a.get()); - LogFileImpl* b_impl = dynamic_cast(b.get()); - return *a_impl < *b_impl; - } -}; -} - -Status DBImpl::GetSortedWalsOfType(const std::string& path, - VectorLogPtr& log_files, - WalFileType log_type) { - std::vector all_files; - const Status status = env_->GetChildren(path, &all_files); - if (!status.ok()) { - return status; - } - log_files.reserve(all_files.size()); - for (const auto& f : all_files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { - SequenceNumber sequence; - Status s = ReadFirstRecord(log_type, number, &sequence); - if (!s.ok()) { - return s; - } - if (sequence == 0) { - // empty file - continue; - } - - // Reproduce the race condition where a log file is moved - // to archived dir, between these two sync points, used in - // (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:1"); - TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:2"); - - uint64_t size_bytes; - s = env_->GetFileSize(LogFileName(path, number), &size_bytes); - // re-try in case the alive log file has been moved to archive. - if (!s.ok() && log_type == kAliveLogFile && - env_->FileExists(ArchivedLogFileName(path, number))) { - s = env_->GetFileSize(ArchivedLogFileName(path, number), &size_bytes); - } - if (!s.ok()) { - return s; - } - - log_files.push_back(std::move(unique_ptr( - new LogFileImpl(number, log_type, sequence, size_bytes)))); - } - } - CompareLogByPointer compare_log_files; - std::sort(log_files.begin(), log_files.end(), compare_log_files); - return status; -} - -Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs, - const SequenceNumber target) { - int64_t start = 0; // signed to avoid overflow when target is < first file. - int64_t end = static_cast(all_logs.size()) - 1; - // Binary Search. avoid opening all files. - while (end >= start) { - int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); - if (current_seq_num == target) { - end = mid; - break; - } else if (current_seq_num < target) { - start = mid + 1; - } else { - end = mid - 1; - } - } - // end could be -ve. 
- size_t start_index = std::max(static_cast(0), end); - // The last wal file is always included - all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); - return Status::OK(); -} - -Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence) { - if (type != kAliveLogFile && type != kArchivedLogFile) { - return Status::NotSupported("File Type Not Known " + std::to_string(type)); - } - { - MutexLock l(&read_first_record_cache_mutex_); - auto itr = read_first_record_cache_.find(number); - if (itr != read_first_record_cache_.end()) { - *sequence = itr->second; - return Status::OK(); - } - } - Status s; - if (type == kAliveLogFile) { - std::string fname = LogFileName(db_options_.wal_dir, number); - s = ReadFirstLine(fname, sequence); - if (env_->FileExists(fname) && !s.ok()) { - // return any error that is not caused by non-existing file - return s; - } - } - - if (type == kArchivedLogFile || !s.ok()) { - // check if the file got moved to archive. - std::string archived_file = - ArchivedLogFileName(db_options_.wal_dir, number); - s = ReadFirstLine(archived_file, sequence); - } - - if (s.ok() && *sequence != 0) { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.insert({number, *sequence}); - } - return s; -} - -// the function returns status.ok() and sequence == 0 if the file exists, but is -// empty -Status DBImpl::ReadFirstLine(const std::string& fname, - SequenceNumber* sequence) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - const char* fname; - - Status* status; - bool ignore_error; // true if db_options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", - (this->ignore_error ? "(ignoring error) " : ""), fname, - static_cast(bytes), s.ToString().c_str()); - if (this->status->ok()) { - // only keep the first error - *this->status = s; - } - } - }; - - unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, env_options_); - - if (!status.ok()) { - return status; - } - - LogReporter reporter; - reporter.env = env_; - reporter.info_log = db_options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = &status; - reporter.ignore_error = !db_options_.paranoid_checks; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, - 0 /*initial_offset*/); - std::string scratch; - Slice record; - - if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !db_options_.paranoid_checks)) { - if (record.size() < 12) { - reporter.Corruption(record.size(), - Status::Corruption("log record too small")); - // TODO read record's till the first no corrupt entry? - } else { - WriteBatch batch; - WriteBatchInternal::SetContents(&batch, record); - *sequence = WriteBatchInternal::Sequence(&batch); - return Status::OK(); - } - } - - // ReadRecord returns false on EOF, which means that the log file is empty. we - // return status.ok() in that case and set sequence number to 0 - *sequence = 0; - return status; -} - -#endif // ROCKSDB_LITE - Status DBImpl::Recover( const std::vector& column_families, bool read_only, bool error_if_log_file_exist) { @@ -4304,23 +3986,7 @@ Status DBImpl::GetUpdatesSince( if (seq > versions_->LastSequence()) { return Status::NotFound("Requested sequence not yet written in the db"); } - // Get all sorted Wal Files. - // Do binary search and open files and find the seq number. 
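For reference, GetUpdatesSince() is the public API this code path serves: it returns a TransactionLogIterator positioned at the write batch that contains the requested sequence number (or the first batch after it). A minimal consumer might look roughly like the sketch below (illustrative only; the optional ReadOptions argument and real error handling are omitted):

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    // Illustrative sketch: walk every write batch persisted at or after `since`.
    void TailWal(rocksdb::DB* db, rocksdb::SequenceNumber since) {
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      rocksdb::Status s = db->GetUpdatesSince(since, &iter);
      if (!s.ok()) {
        return;  // e.g. the requested sequence is no longer covered by the WAL
      }
      rocksdb::BatchResult res;
      while (iter->Valid()) {
        res = iter->GetBatch();
        // res.sequence is the starting sequence number of this batch and
        // res.writeBatchPtr owns the WriteBatch contents.
        iter->Next();
      }
      // iter->status() reports whether iteration ended cleanly.
    }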
- - std::unique_ptr wal_files(new VectorLogPtr); - Status s = GetSortedWalFiles(*wal_files); - if (!s.ok()) { - return s; - } - - s = RetainProbableWalFiles(*wal_files, seq); - if (!s.ok()) { - return s; - } - iter->reset(new TransactionLogIteratorImpl(db_options_.wal_dir, &db_options_, - read_options, env_options_, - seq, std::move(wal_files), this)); - return (*iter)->status(); + return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); } Status DBImpl::DeleteFile(std::string name) { diff --git a/db/db_impl.h b/db/db_impl.h index 15205d90b..547a85da5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -21,6 +21,7 @@ #include "db/snapshot.h" #include "db/column_family.h" #include "db/version_edit.h" +#include "db/wal_manager.h" #include "memtable_list.h" #include "port/port.h" #include "rocksdb/db.h" @@ -193,25 +194,12 @@ class DBImpl : public DB { // Return the current manifest file no. uint64_t TEST_Current_Manifest_FileNo(); - // Trigger's a background call for testing. - void TEST_PurgeObsoleteteWAL(); - // get total level0 file size. Only for testing. uint64_t TEST_GetLevel0TotalSize(); - void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) - { - default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; - } - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, std::vector>* metadata); - Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence); - - Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); - void TEST_LockMutex(); void TEST_UnlockMutex(); @@ -355,30 +343,6 @@ class DBImpl : public DB { void AllocateCompactionOutputFileNumbers(CompactionState* compact); void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); -#ifdef ROCKSDB_LITE - void PurgeObsoleteWALFiles() { - // this function is used for archiving WAL files. we don't need this in - // ROCKSDB_LITE - } -#else - void PurgeObsoleteWALFiles(); - - Status GetSortedWalsOfType(const std::string& path, - VectorLogPtr& log_files, - WalFileType type); - - // Requires: all_logs should be sorted with earliest log file first - // Retains all log files in all_logs which contain updates with seq no. - // Greater Than or Equal to the requested SequenceNumber. - Status RetainProbableWalFiles(VectorLogPtr& all_logs, - const SequenceNumber target); - - Status ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence); - - Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); -#endif // ROCKSDB_LITE - void PrintStatistics(); // dump rocksdb.stats to LOG @@ -453,10 +417,6 @@ class DBImpl : public DB { SnapshotList snapshots_; - // cache for ReadFirstRecord() calls - std::unordered_map read_first_record_cache_; - port::Mutex read_first_record_cache_mutex_; - // Set of table files to protect from deletion because they are // part of ongoing compactions. // map from pending file number ID to their path IDs. @@ -506,16 +466,9 @@ class DBImpl : public DB { // last time when DeleteObsoleteFiles was invoked uint64_t delete_obsolete_files_last_run_; - // last time when PurgeObsoleteWALFiles ran. - uint64_t purge_wal_files_last_run_; - // last time stats were dumped to LOG std::atomic last_stats_dump_time_microsec_; - // obsolete files will be deleted every this seconds if ttl deletion is - // enabled and archive size_limit is disabled. 
- uint64_t default_interval_to_delete_obsolete_WAL_; - bool flush_on_destroy_; // Used when disableWAL is true. static const int KEEP_LOG_FILE_NUM = 1000; @@ -524,6 +477,10 @@ class DBImpl : public DB { // The options to access storage files const EnvOptions env_options_; +#ifndef ROCKSDB_LITE + WalManager wal_manager_; +#endif // ROCKSDB_LITE + // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index a7be59313..2d67167ba 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -13,8 +13,6 @@ namespace rocksdb { -void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } - uint64_t DBImpl::TEST_GetLevel0TotalSize() { MutexLock l(&mutex_); return default_cf_handle_->cfd()->current()->GetStorageInfo()->NumLevelBytes( @@ -122,17 +120,6 @@ Status DBImpl::TEST_WaitForCompact() { return bg_error_; } -Status DBImpl::TEST_ReadFirstRecord(const WalFileType type, - const uint64_t number, - SequenceNumber* sequence) { - return ReadFirstRecord(type, number, sequence); -} - -Status DBImpl::TEST_ReadFirstLine(const std::string& fname, - SequenceNumber* sequence) { - return ReadFirstLine(fname, sequence); -} - void DBImpl::TEST_LockMutex() { mutex_.Lock(); } diff --git a/db/db_test.cc b/db/db_test.cc index 477c6c812..59b611c65 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -6438,10 +6438,6 @@ std::vector ListSpecificFiles( return std::move(file_numbers); } -std::vector ListLogFiles(Env* env, const std::string& path) { - return ListSpecificFiles(env, path, kLogFile); -} - std::vector ListTableFiles(Env* env, const std::string& path) { return ListSpecificFiles(env, path, kTableFile); } @@ -6593,114 +6589,6 @@ TEST(DBTest, RecoverCheckFileAmount) { } } -TEST(DBTest, WALArchivalTtl) { - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.WAL_ttl_seconds = 1000; - DestroyAndReopen(options); - - // TEST : Create DB with a ttl and no size limit. - // Put some keys. Count the log files present in the DB just after insert. - // Re-open db. Causes deletion/archival to take place. - // Assert that the files moved under "/archive". - // Reopen db with small ttl. - // Assert that archive was removed. 
- - std::string archiveDir = ArchivalDirectory(dbname_); - - for (int i = 0; i < 10; ++i) { - for (int j = 0; j < 10; ++j) { - ASSERT_OK(Put(Key(10 * i + j), DummyString(1024))); - } - - std::vector log_files = ListLogFiles(env_, dbname_); - - options.create_if_missing = false; - Reopen(options); - - std::vector logs = ListLogFiles(env_, archiveDir); - std::set archivedFiles(logs.begin(), logs.end()); - - for (auto& log : log_files) { - ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end()); - } - } - - std::vector log_files = ListLogFiles(env_, archiveDir); - ASSERT_TRUE(log_files.size() > 0); - - options.WAL_ttl_seconds = 1; - env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(options); - - log_files = ListLogFiles(env_, archiveDir); - ASSERT_TRUE(log_files.empty()); - } while (ChangeCompactOptions()); -} - -namespace { -uint64_t GetLogDirSize(std::string dir_path, SpecialEnv* env) { - uint64_t dir_size = 0; - std::vector files; - env->GetChildren(dir_path, &files); - for (auto& f : files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { - std::string const file_path = dir_path + "/" + f; - uint64_t file_size; - env->GetFileSize(file_path, &file_size); - dir_size += file_size; - } - } - return dir_size; -} -} // namespace - -TEST(DBTest, WALArchivalSizeLimit) { - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.WAL_ttl_seconds = 0; - options.WAL_size_limit_MB = 1000; - - // TEST : Create DB with huge size limit and no ttl. - // Put some keys. Count the archived log files present in the DB - // just after insert. Assert that there are many enough. - // Change size limit. Re-open db. - // Assert that archive is not greater than WAL_size_limit_MB. - // Set ttl and time_to_check_ to small values. Re-open db. - // Assert that there are no archived logs left. 
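These archival tests exercise WAL_ttl_seconds and WAL_size_limit_MB together. For reference, a minimal configuration sketch (the values are arbitrary examples, not recommendations):

    #include "rocksdb/options.h"

    // Example only: keep consumed WAL files in the archive for up to an hour,
    // and additionally trim the archive once it grows past roughly 512 MB.
    rocksdb::Options MakeWalArchivingOptions() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.WAL_ttl_seconds = 3600;   // 0 disables the TTL check
      options.WAL_size_limit_MB = 512;  // 0 disables the size check
      return options;
    }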
- - DestroyAndReopen(options); - for (int i = 0; i < 128 * 128; ++i) { - ASSERT_OK(Put(Key(i), DummyString(1024))); - } - Reopen(options); - - std::string archive_dir = ArchivalDirectory(dbname_); - std::vector log_files = ListLogFiles(env_, archive_dir); - ASSERT_TRUE(log_files.size() > 2); - - options.WAL_size_limit_MB = 8; - Reopen(options); - dbfull()->TEST_PurgeObsoleteteWAL(); - - uint64_t archive_size = GetLogDirSize(archive_dir, env_); - ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024); - - options.WAL_ttl_seconds = 1; - dbfull()->TEST_SetDefaultTimeToCheck(1); - env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(options); - dbfull()->TEST_PurgeObsoleteteWAL(); - - log_files = ListLogFiles(env_, archive_dir); - ASSERT_TRUE(log_files.empty()); - } while (ChangeCompactOptions()); -} - TEST(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; @@ -6804,11 +6692,13 @@ TEST(DBTest, TransactionLogIterator) { #ifndef NDEBUG // sync point is not included with DNDEBUG build TEST(DBTest, TransactionLogIteratorRace) { static const int LOG_ITERATOR_RACE_TEST_COUNT = 2; - static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = - { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1", - "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" }, - { "DBImpl::GetSortedWalsOfType:1", "DBImpl::PurgeObsoleteFiles:1", - "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalsOfType:2" }}; + static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = { + {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1", + "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"}, + {"WalManager::GetSortedWalsOfType:1", + "WalManager::PurgeObsoleteFiles:1", + "WalManager::PurgeObsoleteFiles:2", + "WalManager::GetSortedWalsOfType:2"}}; for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) { // Setup sync point dependency to reproduce the race condition of // a log file moved to archived dir, in the middle of GetSortedWalFiles @@ -6856,24 +6746,6 @@ TEST(DBTest, TransactionLogIteratorRace) { } #endif -TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { - do { - Options options = OptionsForLogIterTest(); - DestroyAndReopen(options); - CreateAndReopenWithCF({"pikachu"}, options); - // Do a plain Reopen. - Put(1, "key1", DummyString(1024)); - // Two reopens should create a zero record WAL file. 
- ReopenWithColumnFamilies({"default", "pikachu"}, options); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - Put(1, "key2", DummyString(1024)); - - auto iter = OpenTransactionLogIter(0); - ExpectRecords(2, iter); - } while (ChangeCompactOptions()); -} - TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { do { Options options = OptionsForLogIterTest(); @@ -6892,17 +6764,6 @@ TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { } while (ChangeCompactOptions()); } -TEST(DBTest, TransactionLogIteratorJustEmptyFile) { - do { - Options options = OptionsForLogIterTest(); - DestroyAndReopen(options); - unique_ptr iter; - Status status = dbfull()->GetUpdatesSince(0, &iter); - // Check that an empty iterator is returned - ASSERT_TRUE(!iter->Valid()); - } while (ChangeCompactOptions()); -} - TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { do { Options options = OptionsForLogIterTest(); @@ -7013,44 +6874,6 @@ TEST(DBTest, TransactionLogIteratorBlobs) { handler.seen); } -TEST(DBTest, ReadFirstRecordCache) { - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - DestroyAndReopen(options); - - std::string path = dbname_ + "/000001.log"; - unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); - - SequenceNumber s; - ASSERT_OK(dbfull()->TEST_ReadFirstLine(path, &s)); - ASSERT_EQ(s, 0U); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - ASSERT_EQ(s, 0U); - - log::Writer writer(std::move(file)); - WriteBatch batch; - batch.Put("foo", "bar"); - WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); - - env_->count_sequential_reads_ = true; - // sequential_read_counter_ sanity test - ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - ASSERT_EQ(s, 10U); - // did a read - ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - ASSERT_EQ(s, 10U); - // no new reads since the value is cached - ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); -} - // Multi-threaded test: namespace { diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index bfcf7b328..6fc9fbaae 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,6 +4,11 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" @@ -13,7 +18,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( const std::string& dir, const DBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, - std::unique_ptr files, DBImpl const* const dbimpl) + std::unique_ptr files, VersionSet const* const versions) : dir_(dir), options_(options), read_options_(read_options), @@ -25,9 +30,9 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( currentFileIndex_(0), currentBatchSeq_(0), currentLastSeq_(0), - dbimpl_(dbimpl) { + versions_(versions) { assert(files_ != nullptr); - assert(dbimpl_ != nullptr); + assert(versions_ != nullptr); reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); @@ -74,7 +79,7 @@ bool TransactionLogIteratorImpl::RestrictedRead( Slice* record, std::string* scratch) { // Don't read if no more complete entries to read from logs - if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) { + if (currentLastSeq_ >= versions_->LastSequence()) { return false; } return currentLogReader_->ReadRecord(record, scratch); @@ -185,7 +190,7 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { } } else { isValid_ = false; - if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) { + if (currentLastSeq_ == versions_->LastSequence()) { currentStatus_ = Status::OK(); } else { currentStatus_ = Status::Corruption("NO MORE DATA LEFT"); @@ -203,12 +208,10 @@ bool TransactionLogIteratorImpl::IsBatchExpected( if (batchSeq != expectedSeq) { char buf[200]; snprintf(buf, sizeof(buf), - "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, " - "Last flushed seq=%lu.Log iterator will reseek the correct " - "batch.", - (unsigned long)batchSeq, - (unsigned long)expectedSeq, - (unsigned long)dbimpl_->GetLatestSequenceNumber()); + "Discontinuity in log records. 
Got seq=%" PRIu64 + ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64 + ".Log iterator will reseek the correct batch.", + batchSeq, expectedSeq, versions_->LastSequence()); reporter_.Info(buf); return false; } @@ -240,7 +243,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { currentLastSeq_ = currentBatchSeq_ + WriteBatchInternal::Count(batch.get()) - 1; // currentBatchSeq_ can only change here - assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber()); + assert(currentLastSeq_ <= versions_->LastSequence()); currentBatch_ = move(batch); isValid_ = true; diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 1c7ab78d9..a0b7c9d3c 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -11,7 +11,7 @@ #include "rocksdb/options.h" #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" -#include "db/db_impl.h" +#include "db/version_set.h" #include "db/log_reader.h" #include "db/filename.h" @@ -73,7 +73,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { const std::string& dir, const DBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seqNum, - std::unique_ptr files, DBImpl const* const dbimpl); + std::unique_ptr files, VersionSet const* const versions); virtual bool Valid(); @@ -100,7 +100,9 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { LogReporter reporter_; SequenceNumber currentBatchSeq_; // sequence number at start of current batch SequenceNumber currentLastSeq_; // last sequence in the current batch - DBImpl const * const dbimpl_; // The db on whose log files this iterates + // Used only to get latest seq. num + // TODO(icanadi) can this be just a callback? + VersionSet const* const versions_; // Reads from transaction log only if the writebatch record has been written bool RestrictedRead(Slice* record, std::string* scratch); diff --git a/db/wal_manager.cc b/db/wal_manager.cc new file mode 100644 index 000000000..c08b3b220 --- /dev/null +++ b/db/wal_manager.cc @@ -0,0 +1,445 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/wal_manager.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include + +#include "db/filename.h" +#include "db/transaction_log_impl.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { + // First get sorted files in db dir, then get sorted files from archived + // dir, to avoid a race condition where a log file is moved to archived + // dir in between. + Status s; + // list wal files in main db dir. 
+ VectorLogPtr logs; + s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); + if (!s.ok()) { + return s; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2"); + + files.clear(); + // list wal files in archive dir. + std::string archivedir = ArchivalDirectory(db_options_.wal_dir); + if (env_->FileExists(archivedir)) { + s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); + if (!s.ok()) { + return s; + } + } + + uint64_t latest_archived_log_number = 0; + if (!files.empty()) { + latest_archived_log_number = files.back()->LogNumber(); + Log(db_options_.info_log, "Latest Archived log: %" PRIu64, + latest_archived_log_number); + } + + files.reserve(files.size() + logs.size()); + for (auto& log : logs) { + if (log->LogNumber() > latest_archived_log_number) { + files.push_back(std::move(log)); + } else { + // When the race condition happens, we could see the + // same log in both db dir and archived dir. Simply + // ignore the one in db dir. Note that, if we read + // archived dir first, we would have missed the log file. + Log(db_options_.info_log, "%s already moved to archive", + log->PathName().c_str()); + } + } + + return s; +} + +Status WalManager::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options, + VersionSet* version_set) { + + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. + + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl( + db_options_.wal_dir, &db_options_, read_options, env_options_, seq, + std::move(wal_files), version_set)); + return (*iter)->status(); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void WalManager::PurgeObsoleteWALFiles() { + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + Log(db_options_.info_log, "Can't get current time: %s", + s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) + ? 
db_options_.WAL_ttl_seconds / 2 + : kDefaultIntervalToDeleteObsoleteWAL; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + Log(db_options_.info_log, "Can't get archive files: %s", + s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + Status const s = env_->GetFileModificationTime(file_path, &file_m_time); + if (!s.ok()) { + Log(db_options_.info_log, "Can't get file mod time: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + Status const s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(db_options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + Status const s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + Log(db_options_.info_log, "Can't get file size: %s: %s", + file_path.c_str(), s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + Status s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(db_options_.info_log, "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + + size_t const files_keep_num = + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + + if (files_del_num > archived_logs.size()) { + Log(db_options_.info_log, + "Trying to delete more archived log files than " + "exist. 
Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + Status const s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); + if (!s.ok()) { + Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); + Log(db_options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), + archived_log_name.c_str(), s.ToString().c_str()); +} + +namespace { +struct CompareLogByPointer { + bool operator()(const std::unique_ptr& a, + const std::unique_ptr& b) { + LogFileImpl* a_impl = dynamic_cast(a.get()); + LogFileImpl* b_impl = dynamic_cast(b.get()); + return *a_impl < *b_impl; + } +}; +} + +Status WalManager::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + if (!s.ok() && log_type == kAliveLogFile && + env_->FileExists(ArchivedLogFileName(path, number))) { + s = env_->GetFileSize(ArchivedLogFileName(path, number), &size_bytes); + } + if (!s.ok()) { + return s; + } + + log_files.push_back(std::move(std::unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes)))); + } + } + CompareLogByPointer compare_log_files; + std::sort(log_files.begin(), log_files.end(), compare_log_files); + return status; +} + +Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + int64_t start = 0; // signed to avoid overflow when target is < first file. + int64_t end = static_cast(all_logs.size()) - 1; + // Binary Search. avoid opening all files. + while (end >= start) { + int64_t mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + // end could be -ve. 
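The loop above is in effect a lower-bound search over the files' start sequences, and the clamp-and-erase that follows keeps every file that might still contain the target. A self-contained model of the same step, using plain integers instead of LogFile objects (purely illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Illustrative model: retain every log whose contents may include sequence
    // numbers >= target. `starts` holds the first sequence number of each log,
    // already sorted ascending.
    std::vector<uint64_t> RetainProbable(std::vector<uint64_t> starts,
                                         uint64_t target) {
      int64_t lo = 0;
      int64_t hi = static_cast<int64_t>(starts.size()) - 1;
      while (hi >= lo) {
        int64_t mid = lo + (hi - lo) / 2;
        if (starts[mid] == target) {
          hi = mid;
          break;
        } else if (starts[mid] < target) {
          lo = mid + 1;
        } else {
          hi = mid - 1;
        }
      }
      // hi can end at -1 when target precedes the first file; clamping to 0
      // keeps everything, and the last file is always retained.
      size_t first_kept = static_cast<size_t>(std::max<int64_t>(0, hi));
      starts.erase(starts.begin(), starts.begin() + first_kept);
      return starts;
    }
    // Example: starts = {10, 25, 40, 60}, target = 30 keeps {25, 40, 60};
    // replay then begins inside the file whose first sequence number is 25.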
+ size_t start_index = std::max(static_cast(0), end); + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +Status WalManager::ReadFirstRecord(const WalFileType type, + const uint64_t number, + SequenceNumber* sequence) { + if (type != kAliveLogFile && type != kArchivedLogFile) { + return Status::NotSupported("File Type Not Known " + std::to_string(type)); + } + { + MutexLock l(&read_first_record_cache_mutex_); + auto itr = read_first_record_cache_.find(number); + if (itr != read_first_record_cache_.end()) { + *sequence = itr->second; + return Status::OK(); + } + } + Status s; + if (type == kAliveLogFile) { + std::string fname = LogFileName(db_options_.wal_dir, number); + s = ReadFirstLine(fname, sequence); + if (env_->FileExists(fname) && !s.ok()) { + // return any error that is not caused by non-existing file + return s; + } + } + + if (type == kArchivedLogFile || !s.ok()) { + // check if the file got moved to archive. + std::string archived_file = + ArchivedLogFileName(db_options_.wal_dir, number); + s = ReadFirstLine(archived_file, sequence); + } + + if (s.ok() && *sequence != 0) { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.insert({number, *sequence}); + } + return s; +} + +// the function returns status.ok() and sequence == 0 if the file exists, but is +// empty +Status WalManager::ReadFirstLine(const std::string& fname, + SequenceNumber* sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + + Status* status; + bool ignore_error; // true if db_options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(info_log, "%s%s: dropping %d bytes; %s", + (this->ignore_error ? "(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (this->status->ok()) { + // only keep the first error + *this->status = s; + } + } + }; + + std::unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, env_options_); + + if (!status.ok()) { + return status; + } + + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = &status; + reporter.ignore_error = !db_options_.paranoid_checks; + log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + std::string scratch; + Slice record; + + if (reader.ReadRecord(&record, &scratch) && + (status.ok() || !db_options_.paranoid_checks)) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + // TODO read record's till the first no corrupt entry? + } else { + WriteBatch batch; + WriteBatchInternal::SetContents(&batch, record); + *sequence = WriteBatchInternal::Sequence(&batch); + return Status::OK(); + } + } + + // ReadRecord returns false on EOF, which means that the log file is empty. we + // return status.ok() in that case and set sequence number to 0 + *sequence = 0; + return status; +} + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/db/wal_manager.h b/db/wal_manager.h new file mode 100644 index 000000000..493c426e3 --- /dev/null +++ b/db/wal_manager.h @@ -0,0 +1,95 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/status.h" + +#include "db/version_set.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +class WalManager { + public: + WalManager(const DBOptions& db_options, const EnvOptions& env_options) + : db_options_(db_options), + env_options_(env_options), + env_(db_options.env), + purge_wal_files_last_run_(0) {} + + virtual Status GetSortedWalFiles(VectorLogPtr& files); + + virtual Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options, + VersionSet* version_set); + + void PurgeObsoleteWALFiles(); + + void ArchiveWALFile(const std::string& fname, uint64_t number); + + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence) { + return ReadFirstRecord(type, number, sequence); + } + + Status TEST_ReadFirstLine(const std::string& fname, + SequenceNumber* sequence) { + return ReadFirstLine(fname, sequence); + } + + private: + Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files, + WalFileType type); + // Requires: all_logs should be sorted with earliest log file first + // Retains all log files in all_logs which contain updates with seq no. + // Greater Than or Equal to the requested SequenceNumber. + Status RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target); + + Status ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence); + + Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + + // ------- state from DBImpl ------ + const DBOptions& db_options_; + const EnvOptions& env_options_; + Env* env_; + + // ------- WalManager state ------- + // cache for ReadFirstRecord() calls + std::unordered_map read_first_record_cache_; + port::Mutex read_first_record_cache_mutex_; + + // last time when PurgeObsoleteWALFiles ran. + uint64_t purge_wal_files_last_run_; + + // obsolete files will be deleted every this seconds if ttl deletion is + // enabled and archive size_limit is disabled. + static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; +}; + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc new file mode 100644 index 000000000..1f609d083 --- /dev/null +++ b/db/wal_manager_test.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/write_batch.h" + +#include "db/wal_manager.h" +#include "db/log_writer.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "table/mock_table.h" +#include "db/db_impl.h" + +namespace rocksdb { + +// TODO(icanadi) mock out VersionSet +// TODO(icanadi) move other WalManager-specific tests from db_test here +class WalManagerTest { + public: + WalManagerTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/wal_manager_test"), + table_cache_(NewLRUCache(50000, 16, 8)), + current_log_number_(0) { + DestroyDB(dbname_, Options()); + } + + void Init() { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_))); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.wal_dir = dbname_; + + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_controller_)); + + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + void Reopen() { + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + // NOT thread safe + void Put(const std::string& key, const std::string& value) { + assert(current_log_writer_.get() != nullptr); + uint64_t seq = versions_->LastSequence() + 1; + WriteBatch batch; + batch.Put(key, value); + WriteBatchInternal::SetSequence(&batch, seq); + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + versions_->SetLastSequence(seq); + } + + // NOT thread safe + void RollTheLog(bool archived) { + current_log_number_++; + std::string fname = ArchivedLogFileName(dbname_, current_log_number_); + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); + current_log_writer_.reset(new log::Writer(std::move(file))); + } + + void CreateArchiveLogs(int num_logs, int entries_per_log) { + for (int i = 1; i <= num_logs; ++i) { + RollTheLog(true); + for (int k = 0; k < entries_per_log; ++k) { + Put(std::to_string(k), std::string(1024, 'a')); + } + } + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = wal_manager_->GetUpdatesSince( + seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); + ASSERT_OK(status); + return std::move(iter); + } + + Env* env_; + std::string dbname_; + WriteController write_controller_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + DBOptions db_options_; + std::unique_ptr versions_; + std::unique_ptr wal_manager_; + + std::unique_ptr current_log_writer_; + uint64_t current_log_number_; +}; + +TEST(WalManagerTest, ReadFirstRecordCache) { + Init(); + std::string path = dbname_ + "/000001.log"; + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + + SequenceNumber s; + ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, &s)); + ASSERT_EQ(s, 0U); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 0U); + + log::Writer writer(std::move(file)); + WriteBatch batch; + batch.Put("foo", "bar"); + WriteBatchInternal::SetSequence(&batch, 10); + writer.AddRecord(WriteBatchInternal::Contents(&batch)); + + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. 
+ // Waiting for lei to finish with db_test + // env_->count_sequential_reads_ = true; + // sequential_read_counter_ sanity test + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // did a read + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // no new reads since the value is cached + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); +} + +namespace { +uint64_t GetLogDirSize(std::string dir_path, Env* env) { + uint64_t dir_size = 0; + std::vector files; + env->GetChildren(dir_path, &files); + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = dir_path + "/" + f; + uint64_t file_size; + env->GetFileSize(file_path, &file_size); + dir_size += file_size; + } + } + return dir_size; +} +std::vector ListSpecificFiles( + Env* env, const std::string& path, const FileType expected_file_type) { + std::vector files; + std::vector file_numbers; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == expected_file_type) { + file_numbers.push_back(number); + } + } + } + return std::move(file_numbers); +} + +int CountRecords(TransactionLogIterator* iter) { + int count = 0; + SequenceNumber lastSequence = 0; + BatchResult res; + while (iter->Valid()) { + res = iter->GetBatch(); + ASSERT_TRUE(res.sequence > lastSequence); + ++count; + lastSequence = res.sequence; + ASSERT_OK(iter->status()); + iter->Next(); + } + return count; +} +} // namespace + +TEST(WalManagerTest, WALArchivalSizeLimit) { + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; + Init(); + + // TEST : Create WalManager with huge size limit and no ttl. + // Create some archived files and call PurgeObsoleteWALFiles(). + // Count the archived log files that survived. + // Assert that all of them did. + // Change size limit. Re-open WalManager. + // Assert that archive is not greater than WAL_size_limit_MB after + // PurgeObsoleteWALFiles() + // Set ttl and time_to_check_ to small values. Re-open db. + // Assert that there are no archived logs left. + + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_EQ(log_files.size(), 20U); + + db_options_.WAL_size_limit_MB = 8; + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + uint64_t archive_size = GetLogDirSize(archive_dir, env_); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST(WalManagerTest, WALArchivalTtl) { + db_options_.WAL_ttl_seconds = 1000; + Init(); + + // TEST : Create WalManager with a ttl and no size limit. + // Create some archived log files and call PurgeObsoleteWALFiles(). + // Assert that files are not deleted + // Reopen db with small ttl. 
+ // Assert that all archived logs was removed. + + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_GT(log_files.size(), 0U); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(3 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) { + Init(); + RollTheLog(false); + Put("key1", std::string(1024, 'a')); + // Create a zero record WAL file. + RollTheLog(false); + RollTheLog(false); + + Put("key2", std::string(1024, 'a')); + + auto iter = OpenTransactionLogIter(0); + ASSERT_EQ(2, CountRecords(iter.get())); +} + +TEST(WalManagerTest, TransactionLogIteratorJustEmptyFile) { + Init(); + RollTheLog(false); + auto iter = OpenTransactionLogIter(0); + // Check that an empty iterator is returned + ASSERT_TRUE(!iter->Valid()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From 217cc217d7235f8229a3942e5a879d5f8420cc5b Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:55:19 -0700 Subject: [PATCH 359/829] Apply InfoLogLevel to the logs in table/meta_blocks.cc Summary: Apply InfoLogLevel to the logs in table/meta_blocks.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27903 --- table/meta_blocks.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 5aabffcb0..04b68eb95 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -86,9 +86,9 @@ void LogPropertiesCollectionError( assert(method == "Add" || method == "Finish"); std::string msg = - "[Warning] encountered error when calling TablePropertiesCollector::" + + "Encountered error when calling TablePropertiesCollector::" + method + "() with collector name: " + name; - Log(info_log, "%s", msg.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, info_log, "%s", msg.c_str()); } bool NotifyCollectTableCollectorsOnAdd( @@ -192,9 +192,9 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, if (!GetVarint64(&raw_val, &val)) { // skip malformed value auto error_msg = - "[Warning] detect malformed value in properties meta-block:" + "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(logger, "%s", error_msg.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; From 37e9b637018ee39739080c5123f073f85e98723d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 29 Oct 2014 17:57:00 -0700 Subject: [PATCH 360/829] Apply InfoLogLevel to the logs in utilities/ttl/db_ttl_impl.h Summary: Apply InfoLogLevel to the logs in utilities/ttl/db_ttl_impl.h Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27885 --- utilities/ttl/db_ttl_impl.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 92b8eab7f..6ca1ac157 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -219,7 +219,8 @@ class TtlMergeOperator : public MergeOperator { override { const uint32_t ts_len = 
DBWithTTLImpl::kTSLength; if (existing_value && existing_value->size() < ts_len) { - Log(logger, "Error: Could not remove timestamp from existing value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from existing value."); return false; } @@ -227,7 +228,8 @@ class TtlMergeOperator : public MergeOperator { std::deque operands_without_ts; for (const auto& operand : operands) { if (operand.size() < ts_len) { - Log(logger, "Error: Could not remove timestamp from operand value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from operand value."); return false; } operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len)); @@ -253,7 +255,7 @@ class TtlMergeOperator : public MergeOperator { // Augment the *new_value with the ttl time-stamp int64_t curtime; if (!env_->GetCurrentTime(&curtime).ok()) { - Log(logger, + Log(InfoLogLevel::ERROR_LEVEL, logger, "Error: Could not get current time to be attached internally " "to the new value."); return false; @@ -274,7 +276,8 @@ class TtlMergeOperator : public MergeOperator { for (const auto& operand : operand_list) { if (operand.size() < ts_len) { - Log(logger, "Error: Could not remove timestamp from value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from value."); return false; } @@ -292,7 +295,7 @@ class TtlMergeOperator : public MergeOperator { // Augment the *new_value with the ttl time-stamp int64_t curtime; if (!env_->GetCurrentTime(&curtime).ok()) { - Log(logger, + Log(InfoLogLevel::ERROR_LEVEL, logger, "Error: Could not get current time to be attached internally " "to the new value."); return false; From c5db7f26059e8429f77e4307b7d3540a6aa3e29e Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 29 Oct 2014 21:03:45 -0700 Subject: [PATCH 361/829] Fix CompactionPickerTest.Level1Trigger2 Summary: CompactionPickerTest.Level1Trigger2 now depends on the STL implementation to be correct. Fix it. 
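Most likely the fragility is the tie: the test's two level-1 files had identical sizes, so any ordering the picker derives from a non-stable sort over them is implementation-defined, and bumping one file to 1000000001 bytes simply breaks the tie. A hypothetical snippet (not RocksDB code) illustrating why equal keys make sorted order unpredictable:

    #include <algorithm>
    #include <cstdint>
    #include <string>
    #include <vector>

    struct FileCandidate {
      std::string name;
      uint64_t size;
    };

    int main() {
      std::vector<FileCandidate> files = {{"66", 1000000000U},
                                          {"88", 1000000000U}};
      // std::sort makes no stability guarantee: when the sizes compare equal,
      // whether "66" or "88" comes first depends on the standard library, so a
      // test asserting which file gets picked can pass on one platform and
      // fail on another. With 1000000001 vs 1000000000 the order is fixed.
      std::sort(files.begin(), files.end(),
                [](const FileCandidate& a, const FileCandidate& b) {
                  return a.size > b.size;
                });
      return 0;
    }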
Test Plan: Run the test Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27963 --- db/compaction_picker_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index f4417b8b5..c302d2a2a 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -127,7 +127,7 @@ TEST(CompactionPickerTest, Level1Trigger) { } TEST(CompactionPickerTest, Level1Trigger2) { - Add(1, 66U, "150", "200", 1000000000U); + Add(1, 66U, "150", "200", 1000000001U); Add(1, 88U, "201", "300", 1000000000U); Add(2, 6U, "150", "179", 1000000000U); Add(2, 7U, "180", "220", 1000000000U); From 41af0f56b039fbb068707e1731ee199a1e450fee Mon Sep 17 00:00:00 2001 From: Damian Lezama Date: Thu, 30 Oct 2014 10:36:13 -0700 Subject: [PATCH 362/829] Fix build break because of unsigned/signed mismatch --- util/env_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/env_test.cc b/util/env_test.cc index 48e7d353d..f9c2336db 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -540,7 +540,7 @@ TEST(EnvPosixTest, AllocateTest) { stat(fname.c_str(), &f_stat); ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); // verify that preallocated blocks were deallocated on file close - ASSERT_EQ((f_stat.st_size + kBlockSize - 1) / kBlockSize, f_stat.st_blocks); + ASSERT_EQ((f_stat.st_size + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks); } #endif // ROCKSDB_FALLOCATE_PRESENT From 2c1bd8846f2abea1375d37fbb24e64c811b30e0f Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 20 Oct 2014 22:42:32 +0200 Subject: [PATCH 363/829] BlockBasedTableConfig & PlainTableConfig enhancements Summary: BlockBasedTableConfig - ported Checksum - ported IndexType PlainTableConfig - added missing options - added EncodingType Test Plan: make rocksdbjava make jtest Differential Revision: https://reviews.facebook.net/D26595 --- java/org/rocksdb/BlockBasedTableConfig.java | 61 ++++++-- java/org/rocksdb/ChecksumType.java | 39 ++++++ java/org/rocksdb/EncodingType.java | 55 ++++++++ java/org/rocksdb/IndexType.java | 37 +++++ java/org/rocksdb/PlainTableConfig.java | 148 ++++++++++++++++++-- java/rocksjni/table.cc | 19 ++- 6 files changed, 336 insertions(+), 23 deletions(-) create mode 100644 java/org/rocksdb/ChecksumType.java create mode 100644 java/org/rocksdb/EncodingType.java create mode 100644 java/org/rocksdb/IndexType.java diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index 2f9f0ac64..d236b1a39 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -22,6 +22,8 @@ public class BlockBasedTableConfig extends TableFormatConfig { cacheIndexAndFilterBlocks_ = false; hashIndexAllowCollision_ = true; blockCacheCompressedSize_ = 0; + checksumType_ = ChecksumType.kCRC32c; + indexType_ = IndexType.kBinarySearch; } /** @@ -293,6 +295,44 @@ public class BlockBasedTableConfig extends TableFormatConfig { return this; } + /** + * Sets the checksum type to be used with this table. + * + * @param checksumType {@link org.rocksdb.ChecksumType} value. + * @return the reference to the current option. 
+ */ + public BlockBasedTableConfig setChecksumType(ChecksumType checksumType) { + checksumType_ = checksumType; + return this; + } + + /** + * + * @return the currently set checksum type + */ + public ChecksumType checksumType() { + return checksumType_; + } + + /** + * Sets the index type to used with this table. + * + * @param indexType {@link org.rocksdb.IndexType} value + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexType(IndexType indexType) { + indexType_ = indexType; + return this; + } + + /** + * + * @return the currently set index type + */ + public IndexType indexType() { + return indexType_; + } + @Override protected long newTableFactoryHandle() { long filterHandle = 0; if (filter_ != null) { @@ -304,7 +344,8 @@ public class BlockBasedTableConfig extends TableFormatConfig { blockRestartInterval_, wholeKeyFiltering_, filterHandle, cacheIndexAndFilterBlocks_, hashIndexAllowCollision_, blockCacheCompressedSize_, - blockCacheCompressedNumShardBits_); + blockCacheCompressedNumShardBits_, + checksumType_.getValue(), indexType_.getValue()); } private native long newTableFactoryHandle( @@ -312,19 +353,21 @@ public class BlockBasedTableConfig extends TableFormatConfig { long blockSize, int blockSizeDeviation, int blockRestartInterval, boolean wholeKeyFiltering, long filterPolicyHandle, boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, - long blockCacheCompressedSize, int blockCacheCompressedNumShardBits); + long blockCacheCompressedSize, int blockCacheCompressedNumShardBits, + byte checkSumType, byte indexType); + private boolean cacheIndexAndFilterBlocks_; + private IndexType indexType_; + private boolean hashIndexAllowCollision_; + private ChecksumType checksumType_; private boolean noBlockCache_; + private long blockSize_; private long blockCacheSize_; private int blockCacheNumShardBits_; - private long shard; - private long blockSize_; + private long blockCacheCompressedSize_; + private int blockCacheCompressedNumShardBits_; private int blockSizeDeviation_; private int blockRestartInterval_; - private boolean wholeKeyFiltering_; private Filter filter_; - private boolean cacheIndexAndFilterBlocks_; - private boolean hashIndexAllowCollision_; - private long blockCacheCompressedSize_; - private int blockCacheCompressedNumShardBits_; + private boolean wholeKeyFiltering_; } diff --git a/java/org/rocksdb/ChecksumType.java b/java/org/rocksdb/ChecksumType.java new file mode 100644 index 000000000..40ba032b3 --- /dev/null +++ b/java/org/rocksdb/ChecksumType.java @@ -0,0 +1,39 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Checksum types used in conjunction with BlockBasedTable.. + */ +public enum ChecksumType { + /** + * Not implemented yet. 
+ */ + kNoChecksum((byte) 0), + /** + * CRC32 Checksum + */ + kCRC32c((byte)1), + /** + * XX Hash + */ + kxxHash((byte)2); + + private final byte value_; + + private ChecksumType(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } +} diff --git a/java/org/rocksdb/EncodingType.java b/java/org/rocksdb/EncodingType.java new file mode 100644 index 000000000..1d0a36c37 --- /dev/null +++ b/java/org/rocksdb/EncodingType.java @@ -0,0 +1,55 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * EncodingType + * + *
        The value will determine how to encode keys + * when writing to a new SST file.
        + * + *
        This value will be stored + * inside the SST file which will be used when reading from + * the file, which makes it possible for users to choose + * different encoding type when reopening a DB. Files with + * different encoding types can co-exist in the same DB and + * can be read.

        + */ +public enum EncodingType { + /** + * Always write full keys without any special encoding. + */ + kPlain((byte)0), + /** + *

        Find opportunity to write the same prefix once for multiple rows. + * In some cases, when a key follows a previous key with the same prefix, + * instead of writing out the full key, it just writes out the size of the + * shared prefix, as well as other bytes, to save some bytes.

        + * + *

        When using this option, the user is required to use the same prefix + * extractor to make sure the same prefix will be extracted from the same key. + * The Name() value of the prefix extractor will be stored in the file. When + * reopening the file, the name of the options.prefix_extractor given will be + * bitwise compared to the prefix extractors stored in the file. An error + * will be returned if the two don't match.

        + */ + kPrefix((byte)1); + + private final byte value_; + + private EncodingType(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } +} diff --git a/java/org/rocksdb/IndexType.java b/java/org/rocksdb/IndexType.java new file mode 100644 index 000000000..47912f7b6 --- /dev/null +++ b/java/org/rocksdb/IndexType.java @@ -0,0 +1,37 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * IndexType used in conjunction with BlockBasedTable. + */ +public enum IndexType { + /** + * A space efficient index block that is optimized for + * binary-search-based index. + */ + kBinarySearch((byte) 0), + /** + * The hash index, if enabled, will do the hash lookup when + * {@code Options.prefix_extractor} is provided. + */ + kHashSearch((byte)1); + + private final byte value_; + + private IndexType(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } +} diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java index 554ce3840..bb44e1ac1 100644 --- a/java/org/rocksdb/PlainTableConfig.java +++ b/java/org/rocksdb/PlainTableConfig.java @@ -7,28 +7,43 @@ package org.rocksdb; /** * The config for plain table sst format. * - * PlainTable is a RocksDB's SST file format optimized for low query latency - * on pure-memory or really low-latency media. It also support prefix - * hash feature. + *
+ * <p>PlainTable is a RocksDB's SST file format optimized for low query
+ * latency on pure-memory or really low-latency media.</p>
+ *
+ * <p>It also supports the prefix hash feature.</p>
        */ public class PlainTableConfig extends TableFormatConfig { public static final int VARIABLE_LENGTH = 0; public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10; public static final double DEFAULT_HASH_TABLE_RATIO = 0.75; public static final int DEFAULT_INDEX_SPARSENESS = 16; + public static final int DEFAULT_HUGE_TLB_SIZE = 0; + public static final EncodingType DEFAULT_ENCODING_TYPE = + EncodingType.kPlain; + public static final boolean DEFAULT_FULL_SCAN_MODE = false; + public static final boolean DEFAULT_STORE_INDEX_IN_FILE + = false; public PlainTableConfig() { keySize_ = VARIABLE_LENGTH; bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY; hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO; indexSparseness_ = DEFAULT_INDEX_SPARSENESS; + hugePageTlbSize_ = DEFAULT_HUGE_TLB_SIZE; + encodingType_ = DEFAULT_ENCODING_TYPE; + fullScanMode_ = DEFAULT_FULL_SCAN_MODE; + storeIndexInFile_ = DEFAULT_STORE_INDEX_IN_FILE; } /** - * Set the length of the user key. If it is set to be VARIABLE_LENGTH, - * then it indicates the user keys are variable-lengthed. Otherwise, - * all the keys need to have the same length in byte. - * DEFAULT: VARIABLE_LENGTH + *
+ * <p>Set the length of the user key. If it is set to be
+ * VARIABLE_LENGTH, then it indicates the user keys are
+ * of variable length.</p>
+ *
+ * <p>Otherwise, all the keys need to have the same length
+ * in bytes.</p>
+ *
+ * <p>DEFAULT: VARIABLE_LENGTH</p>
        * * @param keySize the length of the user key. * @return the reference to the current config. @@ -103,21 +118,134 @@ public class PlainTableConfig extends TableFormatConfig { /** * @return the index sparseness. */ - public int indexSparseness() { + public long indexSparseness() { return indexSparseness_; } + /** + *
+ * <p>huge_page_tlb_size: if <=0, allocate hash indexes and blooms
+ * from malloc otherwise from huge page TLB.</p>
+ *
+ * <p>The user needs to reserve huge pages for it to be allocated,
+ * like: {@code sysctl -w vm.nr_hugepages=20}</p>
+ *
+ * <p>See linux doc Documentation/vm/hugetlbpage.txt</p>
        + * + * @param hugePageTlbSize_ + * @return the reference to the current config. + */ + public PlainTableConfig setHugePageTlbSize_(int hugePageTlbSize_) { + this.hugePageTlbSize_ = hugePageTlbSize_; + return this; + } + + /** + * Returns the value for huge page tlb size + * + * @return hugePageTlbSize + */ + public int hugePageTlbSize() { + return hugePageTlbSize_; + } + + /** + * Sets the encoding type. + * + *
+ * <p>This setting determines how to encode
+ * the keys. See enum {@link EncodingType} for
+ * the choices.</p>
+ *
+ * <p>The value will determine how to encode keys
+ * when writing to a new SST file. This value will be stored
+ * inside the SST file which will be used when reading from
+ * the file, which makes it possible for users to choose
+ * different encoding type when reopening a DB. Files with
+ * different encoding types can co-exist in the same DB and
+ * can be read.</p>
        + * + * @param encodingType {@link org.rocksdb.EncodingType} value. + * @return the reference to the current config. + */ + public PlainTableConfig setEncodingType(EncodingType encodingType) { + this.encodingType_ = encodingType; + return this; + } + + /** + * Returns the active EncodingType + * + * @return currently set encoding type + */ + public EncodingType encodingType() { + return encodingType_; + } + + /** + * Set full scan mode, if true the whole file will be read + * one record by one without using the index. + * + * @param fullScanMode boolean value indicating if full + * scan mode shall be enabled. + * @return the reference to the current config. + */ + public PlainTableConfig setFullScanMode(boolean fullScanMode) { + this.fullScanMode_ = fullScanMode; + return this; + } + + /** + * Return if full scan mode is active + * @return boolean value indicating if the full scan mode is + * enabled. + */ + public boolean fullScanMode() { + return fullScanMode_; + } + + /** + *
+ * <p>If set to true: compute plain table index and bloom
+ * filter during file building and store it in file.
+ * When reading file, index will be mmaped instead
+ * of doing recomputation.</p>
        + * + * @param storeIndexInFile value indicating if index shall + * be stored in a file + * @return the reference to the current config. + */ + public PlainTableConfig setStoreIndexInFile(boolean storeIndexInFile) { + this.storeIndexInFile_ = storeIndexInFile; + return this; + } + + /** + * Return a boolean value indicating if index shall be stored + * in a file. + * + * @return currently set value for store index in file. + */ + public boolean storeIndexInFile() { + return storeIndexInFile_; + } + @Override protected long newTableFactoryHandle() { return newTableFactoryHandle(keySize_, bloomBitsPerKey_, - hashTableRatio_, indexSparseness_); + hashTableRatio_, indexSparseness_, hugePageTlbSize_, + encodingType_.getValue(), fullScanMode_, + storeIndexInFile_); } private native long newTableFactoryHandle( int keySize, int bloomBitsPerKey, - double hashTableRatio, int indexSparseness); + double hashTableRatio, int indexSparseness, + int hugePageTlbSize, byte encodingType, + boolean fullScanMode, boolean storeIndexInFile); private int keySize_; private int bloomBitsPerKey_; private double hashTableRatio_; private int indexSparseness_; + private int hugePageTlbSize_; + private EncodingType encodingType_; + private boolean fullScanMode_; + private boolean storeIndexInFile_; } diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1582900f3..1b576a754 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -15,23 +15,30 @@ /* * Class: org_rocksdb_PlainTableConfig * Method: newTableFactoryHandle - * Signature: (IIDI)J + * Signature: (IIDIIBZZ)J */ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key, - jdouble jhash_table_ratio, jint jindex_sparseness) { + jdouble jhash_table_ratio, jint jindex_sparseness, + jint jhuge_page_tlb_size, jbyte jencoding_type, + jboolean jfull_scan_mode, jboolean jstore_index_in_file) { rocksdb::PlainTableOptions options = rocksdb::PlainTableOptions(); options.user_key_len = jkey_size; options.bloom_bits_per_key = jbloom_bits_per_key; options.hash_table_ratio = jhash_table_ratio; options.index_sparseness = jindex_sparseness; + options.huge_page_tlb_size = jhuge_page_tlb_size; + options.encoding_type = static_cast( + jencoding_type); + options.full_scan_mode = jfull_scan_mode; + options.store_index_in_file = jstore_index_in_file; return reinterpret_cast(rocksdb::NewPlainTableFactory(options)); } /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZJI)J + * Signature: (ZJIJIIZIZZJIBB)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, @@ -39,7 +46,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jint block_restart_interval, jboolean whole_key_filtering, jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, - jint block_cache_compressd_num_shard_bits) { + jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, + jbyte jindex_type) { rocksdb::BlockBasedTableOptions options; options.no_block_cache = no_block_cache; @@ -72,6 +80,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); } } + options.checksum = static_cast(jchecksum_type); + options.index_type = static_cast< + 
rocksdb::BlockBasedTableOptions::IndexType>(jindex_type); return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } From b011e201fa5e7a27febe0ea101004b84cb06215c Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 26 Oct 2014 20:47:54 +0100 Subject: [PATCH 364/829] Integrated review comments by ankgup87 - Added tests - Minor code-style changes --- java/Makefile | 2 + java/org/rocksdb/BlockBasedTableConfig.java | 2 + java/org/rocksdb/ChecksumType.java | 4 +- java/org/rocksdb/EncodingType.java | 4 +- java/org/rocksdb/IndexType.java | 2 +- java/org/rocksdb/PlainTableConfig.java | 6 +- .../test/BlockBasedTableConfigTest.java | 64 +++++++++++++++++++ .../rocksdb/test/PlainTableConfigTest.java | 43 +++++++++++++ 8 files changed, 119 insertions(+), 8 deletions(-) create mode 100644 java/org/rocksdb/test/BlockBasedTableConfigTest.java create mode 100644 java/org/rocksdb/test/PlainTableConfigTest.java diff --git a/java/Makefile b/java/Makefile index 6050effde..765ed44fc 100644 --- a/java/Makefile +++ b/java/Makefile @@ -39,11 +39,13 @@ test: java javac org/rocksdb/test/*.java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BlockBasedTableConfigTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.PlainTableConfigTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index d236b1a39..76e930204 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -14,6 +14,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { public BlockBasedTableConfig() { noBlockCache_ = false; blockCacheSize_ = 8 * 1024 * 1024; + blockCacheNumShardBits_ = 0; blockSize_ = 4 * 1024; blockSizeDeviation_ = 10; blockRestartInterval_ = 16; @@ -22,6 +23,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { cacheIndexAndFilterBlocks_ = false; hashIndexAllowCollision_ = true; blockCacheCompressedSize_ = 0; + blockCacheCompressedNumShardBits_ = 0; checksumType_ = ChecksumType.kCRC32c; indexType_ = IndexType.kBinarySearch; } diff --git a/java/org/rocksdb/ChecksumType.java b/java/org/rocksdb/ChecksumType.java index 40ba032b3..a538c4ea6 100644 --- a/java/org/rocksdb/ChecksumType.java +++ b/java/org/rocksdb/ChecksumType.java @@ -16,11 +16,11 @@ public enum ChecksumType { /** * CRC32 Checksum */ - kCRC32c((byte)1), + kCRC32c((byte) 1), /** * XX Hash */ - kxxHash((byte)2); + kxxHash((byte) 2); private final byte value_; diff --git a/java/org/rocksdb/EncodingType.java b/java/org/rocksdb/EncodingType.java index 1d0a36c37..a372b0d0f 
100644 --- a/java/org/rocksdb/EncodingType.java +++ b/java/org/rocksdb/EncodingType.java @@ -22,7 +22,7 @@ public enum EncodingType { /** * Always write full keys without any special encoding. */ - kPlain((byte)0), + kPlain((byte) 0), /** *
 * <p>Find opportunity to write the same prefix once for multiple rows.
 * In some cases, when a key follows a previous key with the same prefix,
@@ -36,7 +36,7 @@ public enum EncodingType {
 * bitwise compared to the prefix extractors stored in the file. An error
 * will be returned if the two don't match.</p>
        */ - kPrefix((byte)1); + kPrefix((byte) 1); private final byte value_; diff --git a/java/org/rocksdb/IndexType.java b/java/org/rocksdb/IndexType.java index 47912f7b6..3399b4452 100644 --- a/java/org/rocksdb/IndexType.java +++ b/java/org/rocksdb/IndexType.java @@ -18,7 +18,7 @@ public enum IndexType { * The hash index, if enabled, will do the hash lookup when * {@code Options.prefix_extractor} is provided. */ - kHashSearch((byte)1); + kHashSearch((byte) 1); private final byte value_; diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java index bb44e1ac1..71d75f72c 100644 --- a/java/org/rocksdb/PlainTableConfig.java +++ b/java/org/rocksdb/PlainTableConfig.java @@ -131,11 +131,11 @@ public class PlainTableConfig extends TableFormatConfig { * *
 * <p>See linux doc Documentation/vm/hugetlbpage.txt</p>
        * - * @param hugePageTlbSize_ + * @param hugePageTlbSize * @return the reference to the current config. */ - public PlainTableConfig setHugePageTlbSize_(int hugePageTlbSize_) { - this.hugePageTlbSize_ = hugePageTlbSize_; + public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) { + this.hugePageTlbSize_ = hugePageTlbSize; return this; } diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java new file mode 100644 index 000000000..1f7a62698 --- /dev/null +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -0,0 +1,64 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.BlockBasedTableConfig; +import org.rocksdb.ChecksumType; +import org.rocksdb.IndexType; + +public class BlockBasedTableConfigTest { + + public static void main(String[] args) { + BlockBasedTableConfig blockBasedTableConfig = + new BlockBasedTableConfig(); + assert(!blockBasedTableConfig.noBlockCache()); + blockBasedTableConfig.setNoBlockCache(true); + assert(blockBasedTableConfig.noBlockCache()); + assert(blockBasedTableConfig.blockCacheSize() == (8*1024*1024)); + blockBasedTableConfig.setBlockCacheSize(8*1024); + assert(blockBasedTableConfig.blockCacheSize() == (8*1024)); + assert(blockBasedTableConfig.blockSizeDeviation() == 10); + blockBasedTableConfig.setBlockSizeDeviation(12); + assert(blockBasedTableConfig.blockSizeDeviation() == 12); + assert(blockBasedTableConfig.blockRestartInterval() == 16); + blockBasedTableConfig.setBlockRestartInterval(15); + assert(blockBasedTableConfig.blockRestartInterval() == 15); + assert(blockBasedTableConfig.wholeKeyFiltering()); + blockBasedTableConfig.setWholeKeyFiltering(false); + assert(!blockBasedTableConfig.wholeKeyFiltering()); + assert(!blockBasedTableConfig.cacheIndexAndFilterBlocks()); + blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); + assert(blockBasedTableConfig.cacheIndexAndFilterBlocks()); + assert(blockBasedTableConfig.hashIndexAllowCollision()); + blockBasedTableConfig.setHashIndexAllowCollision(false); + assert(!blockBasedTableConfig.hashIndexAllowCollision()); + assert(blockBasedTableConfig.blockCacheCompressedSize() == 0); + blockBasedTableConfig.setBlockCacheCompressedSize(40); + assert(blockBasedTableConfig.blockCacheCompressedSize() == 40); + assert(blockBasedTableConfig.checksumType().equals( + ChecksumType.kCRC32c)); + blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); + assert(blockBasedTableConfig.checksumType().equals( + ChecksumType.kNoChecksum)); + blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); + assert(blockBasedTableConfig.checksumType().equals( + ChecksumType.kxxHash)); + assert(blockBasedTableConfig.indexType().equals( + IndexType.kBinarySearch)); + blockBasedTableConfig.setIndexType(IndexType.kHashSearch); + assert(blockBasedTableConfig.indexType().equals( + IndexType.kHashSearch)); + assert(blockBasedTableConfig.blockCacheCompressedNumShardBits() + == 0); + blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); + assert(blockBasedTableConfig.blockCacheCompressedNumShardBits() + == 4); + assert(blockBasedTableConfig.cacheNumShardBits() == 0); + blockBasedTableConfig.setCacheNumShardBits(5); + 
assert(blockBasedTableConfig.cacheNumShardBits() == 5); + System.out.println("BlockBasedTableConfig test passed"); + } +} diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java new file mode 100644 index 000000000..d8edb9618 --- /dev/null +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.EncodingType; +import org.rocksdb.PlainTableConfig; + +public class PlainTableConfigTest { + + public static void main(String[] args) { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + assert(plainTableConfig.keySize() == 0); + plainTableConfig.setKeySize(5); + assert(plainTableConfig.keySize() == 5); + assert(plainTableConfig.bloomBitsPerKey() == 10); + plainTableConfig.setBloomBitsPerKey(11); + assert(plainTableConfig.bloomBitsPerKey() == 11); + assert(plainTableConfig.hashTableRatio() == 0.75); + plainTableConfig.setHashTableRatio(0.95); + assert(plainTableConfig.hashTableRatio() == 0.95); + assert(plainTableConfig.indexSparseness() == 16); + plainTableConfig.setIndexSparseness(18); + assert(plainTableConfig.indexSparseness() == 18); + assert(plainTableConfig.hugePageTlbSize() == 0); + plainTableConfig.setHugePageTlbSize(1); + assert(plainTableConfig.hugePageTlbSize() == 1); + assert(plainTableConfig.encodingType().equals( + EncodingType.kPlain)); + plainTableConfig.setEncodingType(EncodingType.kPrefix); + assert(plainTableConfig.encodingType().equals( + EncodingType.kPrefix)); + assert(!plainTableConfig.fullScanMode()); + plainTableConfig.setFullScanMode(true); + assert(plainTableConfig.fullScanMode()); + assert(!plainTableConfig.storeIndexInFile()); + plainTableConfig.setStoreIndexInFile(true); + assert(plainTableConfig.storeIndexInFile()); + System.out.println("PlainTableConfig test passed"); + } +} From c73d13bb817a512037f94f286ab315d06be20e30 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 18:37:53 +0100 Subject: [PATCH 365/829] [RocksJava] Integrate review comments from yhchiang --- java/org/rocksdb/ChecksumType.java | 14 +++++++------- java/org/rocksdb/EncodingType.java | 12 ++++++------ java/org/rocksdb/IndexType.java | 12 ++++++------ .../rocksdb/test/BlockBasedTableConfigTest.java | 17 ----------------- java/org/rocksdb/test/PlainTableConfigTest.java | 9 --------- 5 files changed, 19 insertions(+), 45 deletions(-) diff --git a/java/org/rocksdb/ChecksumType.java b/java/org/rocksdb/ChecksumType.java index a538c4ea6..e685376bf 100644 --- a/java/org/rocksdb/ChecksumType.java +++ b/java/org/rocksdb/ChecksumType.java @@ -6,7 +6,7 @@ package org.rocksdb; /** - * Checksum types used in conjunction with BlockBasedTable.. + * Checksum types used in conjunction with BlockBasedTable. 
*/ public enum ChecksumType { /** @@ -22,12 +22,6 @@ public enum ChecksumType { */ kxxHash((byte) 2); - private final byte value_; - - private ChecksumType(byte value) { - value_ = value; - } - /** * Returns the byte value of the enumerations value * @@ -36,4 +30,10 @@ public enum ChecksumType { public byte getValue() { return value_; } + + private ChecksumType(byte value) { + value_ = value; + } + + private final byte value_; } diff --git a/java/org/rocksdb/EncodingType.java b/java/org/rocksdb/EncodingType.java index a372b0d0f..d639542aa 100644 --- a/java/org/rocksdb/EncodingType.java +++ b/java/org/rocksdb/EncodingType.java @@ -38,12 +38,6 @@ public enum EncodingType { */ kPrefix((byte) 1); - private final byte value_; - - private EncodingType(byte value) { - value_ = value; - } - /** * Returns the byte value of the enumerations value * @@ -52,4 +46,10 @@ public enum EncodingType { public byte getValue() { return value_; } + + private EncodingType(byte value) { + value_ = value; + } + + private final byte value_; } diff --git a/java/org/rocksdb/IndexType.java b/java/org/rocksdb/IndexType.java index 3399b4452..f3c104566 100644 --- a/java/org/rocksdb/IndexType.java +++ b/java/org/rocksdb/IndexType.java @@ -20,12 +20,6 @@ public enum IndexType { */ kHashSearch((byte) 1); - private final byte value_; - - private IndexType(byte value) { - value_ = value; - } - /** * Returns the byte value of the enumerations value * @@ -34,4 +28,10 @@ public enum IndexType { public byte getValue() { return value_; } + + private IndexType(byte value) { + value_ = value; + } + + private final byte value_; } diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 1f7a62698..3f54d5a78 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -14,49 +14,32 @@ public class BlockBasedTableConfigTest { public static void main(String[] args) { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assert(!blockBasedTableConfig.noBlockCache()); blockBasedTableConfig.setNoBlockCache(true); assert(blockBasedTableConfig.noBlockCache()); - assert(blockBasedTableConfig.blockCacheSize() == (8*1024*1024)); blockBasedTableConfig.setBlockCacheSize(8*1024); assert(blockBasedTableConfig.blockCacheSize() == (8*1024)); - assert(blockBasedTableConfig.blockSizeDeviation() == 10); blockBasedTableConfig.setBlockSizeDeviation(12); assert(blockBasedTableConfig.blockSizeDeviation() == 12); - assert(blockBasedTableConfig.blockRestartInterval() == 16); blockBasedTableConfig.setBlockRestartInterval(15); assert(blockBasedTableConfig.blockRestartInterval() == 15); - assert(blockBasedTableConfig.wholeKeyFiltering()); blockBasedTableConfig.setWholeKeyFiltering(false); assert(!blockBasedTableConfig.wholeKeyFiltering()); - assert(!blockBasedTableConfig.cacheIndexAndFilterBlocks()); blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); assert(blockBasedTableConfig.cacheIndexAndFilterBlocks()); - assert(blockBasedTableConfig.hashIndexAllowCollision()); blockBasedTableConfig.setHashIndexAllowCollision(false); assert(!blockBasedTableConfig.hashIndexAllowCollision()); - assert(blockBasedTableConfig.blockCacheCompressedSize() == 0); blockBasedTableConfig.setBlockCacheCompressedSize(40); assert(blockBasedTableConfig.blockCacheCompressedSize() == 40); - assert(blockBasedTableConfig.checksumType().equals( - ChecksumType.kCRC32c)); 
blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); - assert(blockBasedTableConfig.checksumType().equals( - ChecksumType.kNoChecksum)); blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); assert(blockBasedTableConfig.checksumType().equals( ChecksumType.kxxHash)); - assert(blockBasedTableConfig.indexType().equals( - IndexType.kBinarySearch)); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); assert(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); - assert(blockBasedTableConfig.blockCacheCompressedNumShardBits() - == 0); blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); assert(blockBasedTableConfig.blockCacheCompressedNumShardBits() == 4); - assert(blockBasedTableConfig.cacheNumShardBits() == 0); blockBasedTableConfig.setCacheNumShardBits(5); assert(blockBasedTableConfig.cacheNumShardBits() == 5); System.out.println("BlockBasedTableConfig test passed"); diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index d8edb9618..888f35d81 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -12,30 +12,21 @@ public class PlainTableConfigTest { public static void main(String[] args) { PlainTableConfig plainTableConfig = new PlainTableConfig(); - assert(plainTableConfig.keySize() == 0); plainTableConfig.setKeySize(5); assert(plainTableConfig.keySize() == 5); - assert(plainTableConfig.bloomBitsPerKey() == 10); plainTableConfig.setBloomBitsPerKey(11); assert(plainTableConfig.bloomBitsPerKey() == 11); - assert(plainTableConfig.hashTableRatio() == 0.75); plainTableConfig.setHashTableRatio(0.95); assert(plainTableConfig.hashTableRatio() == 0.95); - assert(plainTableConfig.indexSparseness() == 16); plainTableConfig.setIndexSparseness(18); assert(plainTableConfig.indexSparseness() == 18); - assert(plainTableConfig.hugePageTlbSize() == 0); plainTableConfig.setHugePageTlbSize(1); assert(plainTableConfig.hugePageTlbSize() == 1); - assert(plainTableConfig.encodingType().equals( - EncodingType.kPlain)); plainTableConfig.setEncodingType(EncodingType.kPrefix); assert(plainTableConfig.encodingType().equals( EncodingType.kPrefix)); - assert(!plainTableConfig.fullScanMode()); plainTableConfig.setFullScanMode(true); assert(plainTableConfig.fullScanMode()); - assert(!plainTableConfig.storeIndexInFile()); plainTableConfig.setStoreIndexInFile(true); assert(plainTableConfig.storeIndexInFile()); System.out.println("PlainTableConfig test passed"); From 73605d91798288a85536b4ad6a9000e6644d260f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 30 Oct 2014 13:34:44 -0700 Subject: [PATCH 366/829] Apply InfoLogLevel to the logs in util/db_info_dumper.cc Summary: * Rename util/db_info_dummper.cc to util/db_info_dumper.cc * Apply InfoLogLevel to the logs in util/db_info_dumper.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27915 --- .../{db_info_dummper.cc => db_info_dumper.cc} | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) rename util/{db_info_dummper.cc => db_info_dumper.cc} (78%) diff --git a/util/db_info_dummper.cc b/util/db_info_dumper.cc similarity index 78% rename from util/db_info_dummper.cc rename to util/db_info_dumper.cc index 2e0d34481..7049e6853 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dumper.cc @@ -36,10 +36,11 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& 
dbname) { uint64_t file_size; std::string file_info, wal_info; - Log(options.info_log, "DB SUMMARY\n"); + Log(InfoLogLevel::INFO_LEVEL, options.info_log, "DB SUMMARY\n"); // Get files in dbname dir if (!env->GetChildren(dbname, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", dbname.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, + options.info_log, "Error when reading %s dir\n", dbname.c_str()); } std::sort(files.begin(), files.end()); for (std::string file : files) { @@ -48,14 +49,17 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } switch (type) { case kCurrentFile: - Log(options.info_log, "CURRENT file: %s\n", file.c_str()); + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "CURRENT file: %s\n", file.c_str()); break; case kIdentityFile: - Log(options.info_log, "IDENTITY file: %s\n", file.c_str()); + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "IDENTITY file: %s\n", file.c_str()); break; case kDescriptorFile: env->GetFileSize(dbname + "/" + file, &file_size); - Log(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(), file_size); break; case kLogFile: @@ -79,7 +83,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { for (auto& db_path : options.db_paths) { if (dbname.compare(db_path.path) != 0) { if (!env->GetChildren(db_path.path, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", + Log(InfoLogLevel::ERROR_LEVEL, options.info_log, + "Error when reading %s dir\n", db_path.path.c_str()); continue; } @@ -92,7 +97,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } } } - Log(options.info_log, "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n", db_path.path.c_str(), file_num, file_info.c_str()); file_num = 0; file_info.clear(); @@ -101,7 +107,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { // Get wal file in wal_dir if (dbname.compare(options.wal_dir) != 0) { if (!env->GetChildren(options.wal_dir, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", + Log(InfoLogLevel::ERROR_LEVEL, options.info_log, + "Error when reading %s dir\n", options.wal_dir.c_str()); return; } @@ -118,7 +125,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } } } - Log(options.info_log, "Write Ahead Log file in %s: %s\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "Write Ahead Log file in %s: %s\n", options.wal_dir.c_str(), wal_info.c_str()); } } // namespace rocksdb From 0f7f3b860574a14fd06a6803fd44c40f33ba77b2 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 30 Oct 2014 13:36:18 -0700 Subject: [PATCH 367/829] Check InfoLogLevel earlier in Log functions. Summary: Check InfoLogLevel earlier in Log functions. Test Plan: auto_roll_logger_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27993 --- util/env.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util/env.cc b/util/env.cc index 91ae0784b..6fd020489 100644 --- a/util/env.cc +++ b/util/env.cc @@ -41,7 +41,7 @@ void LogFlush(Logger *info_log) { } void Log(Logger* info_log, const char* format, ...) 
{ - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); @@ -51,7 +51,7 @@ void Log(Logger* info_log, const char* format, ...) { void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= log_level) { va_list ap; va_start(ap, format); info_log->Logv(log_level, format, ap); @@ -60,7 +60,7 @@ void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, } void Debug(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); @@ -69,7 +69,7 @@ void Debug(Logger* info_log, const char* format, ...) { } void Info(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); @@ -78,7 +78,7 @@ void Info(Logger* info_log, const char* format, ...) { } void Warn(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); @@ -86,7 +86,7 @@ void Warn(Logger* info_log, const char* format, ...) { } } void Error(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); @@ -94,7 +94,7 @@ void Error(Logger* info_log, const char* format, ...) { } } void Fatal(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); From 39464a990ff8bdc9a8560469641dec04c13eefe7 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 22:38:08 +0100 Subject: [PATCH 368/829] [RocksJava] Options Refactoring 3.6 Summary: Options extends now two interfaces DBOptionsInterface and ColumnFamilyOptionsInterface. There are also further improvements to the Options bindings: Optimize methods were ported to Java. (OptimizeForPointLookup, OptimizeLevelCompaction, OptimizeUniversalCompaction). To align BuiltinComparator with every other Enum it was moved to a separate file. 
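As an illustrative sketch only (not part of this change set), and assuming the
method names declared in ColumnFamilyOptionsInterface and BuiltinComparator
below, the ported tuning methods might be exercised from Java roughly like
this:

    import org.rocksdb.BuiltinComparator;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;

    public class OptionsTuningSketch {
      public static void main(String[] args) {
        RocksDB.loadLibrary();

        // Preset for level style compaction workloads.
        Options levelStyle = new Options();
        levelStyle.optimizeLevelStyleCompaction();
        levelStyle.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);

        // Preset for point-lookup workloads; the argument is the
        // block cache size in MB.
        Options pointLookup = new Options();
        pointLookup.optimizeForPointLookup(8);

        levelStyle.dispose();
        pointLookup.dispose();
      }
    }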
Test Plan: make rocksdbjava make jtest --- java/Makefile | 2 +- java/org/rocksdb/BuiltinComparator.java | 20 + .../rocksdb/ColumnFamilyOptionsInterface.java | 979 ++++++++ java/org/rocksdb/DBOptionsInterface.java | 763 ++++++ java/org/rocksdb/Options.java | 1993 ++++------------ java/org/rocksdb/RateLimiterConfig.java | 4 +- java/org/rocksdb/test/OptionsTest.java | 13 +- java/rocksjni/options.cc | 2061 ++++++++++++++++- 8 files changed, 4133 insertions(+), 1702 deletions(-) create mode 100644 java/org/rocksdb/BuiltinComparator.java create mode 100644 java/org/rocksdb/ColumnFamilyOptionsInterface.java create mode 100644 java/org/rocksdb/DBOptionsInterface.java diff --git a/java/Makefile b/java/Makefile index 765ed44fc..04eac63dd 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.ColumnFamilyOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) diff --git a/java/org/rocksdb/BuiltinComparator.java b/java/org/rocksdb/BuiltinComparator.java new file mode 100644 index 000000000..ee92e8dd9 --- /dev/null +++ b/java/org/rocksdb/BuiltinComparator.java @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Builtin RocksDB comparators + * + *
+ * <ol>
+ *   <li>BYTEWISE_COMPARATOR - Sorts all keys in ascending bytewise
+ *   order.</li>
+ *   <li>REVERSE_BYTEWISE_COMPARATOR - Sorts all keys in descending bytewise
+ *   order</li>
+ * </ol>
        + */ +public enum BuiltinComparator { + BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR +} diff --git a/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/org/rocksdb/ColumnFamilyOptionsInterface.java new file mode 100644 index 000000000..827fe8c64 --- /dev/null +++ b/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -0,0 +1,979 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public interface ColumnFamilyOptionsInterface { + + /** + * Use this if you don't need to keep the data sorted, i.e. you'll never use + * an iterator, only Put() and Get() API calls + * + * @param blockCacheSizeMb Block cache size in MB + * @return the instance of the current Object. + */ + Object optimizeForPointLookup(long blockCacheSizeMb); + + /** + *
+ * <p>Default values for some parameters in ColumnFamilyOptions are not
+ * optimized for heavy workloads and big datasets, which means you might
+ * observe write stalls under some conditions. As a starting point for tuning
+ * RocksDB options, use the following for level style compaction.</p>
+ *
+ * <p>Make sure to also call IncreaseParallelism(), which will provide the
+ * biggest performance gains.</p>
+ * <p>Note: we might use more memory than memtable_memory_budget during high
+ * write rate period</p>
        + * + * @return the instance of the current Object. + */ + Object optimizeLevelStyleCompaction(); + + /** + *
+ * <p>Default values for some parameters in ColumnFamilyOptions are not
+ * optimized for heavy workloads and big datasets, which means you might
+ * observe write stalls under some conditions. As a starting point for tuning
+ * RocksDB options, use the following for level style compaction.</p>
+ *
+ * <p>Make sure to also call IncreaseParallelism(), which will provide the
+ * biggest performance gains.</p>
+ * <p>Note: we might use more memory than memtable_memory_budget during high
+ * write rate period</p>
        + * + * @param memtableMemoryBudget memory budget in bytes + * @return the instance of the current Object. + */ + Object optimizeLevelStyleCompaction(long memtableMemoryBudget); + + /** + *
+ * <p>Default values for some parameters in ColumnFamilyOptions are not
+ * optimized for heavy workloads and big datasets, which means you might
+ * observe write stalls under some conditions. As a starting point for tuning
+ * RocksDB options, use the following for universal style compaction.</p>
+ *
+ * <p>Universal style compaction is focused on reducing Write Amplification
+ * Factor for big data sets, but increases Space Amplification.</p>
+ *
+ * <p>Make sure to also call IncreaseParallelism(), which will provide the
+ * biggest performance gains.</p>
+ *
+ * <p>Note: we might use more memory than memtable_memory_budget during high
+ * write rate period</p>
        + * + * @return the instance of the current Object. + */ + Object optimizeUniversalStyleCompaction(); + + /** + *
+ * <p>Default values for some parameters in ColumnFamilyOptions are not
+ * optimized for heavy workloads and big datasets, which means you might
+ * observe write stalls under some conditions. As a starting point for tuning
+ * RocksDB options, use the following for universal style compaction.</p>
+ *
+ * <p>Universal style compaction is focused on reducing Write Amplification
+ * Factor for big data sets, but increases Space Amplification.</p>
+ *
+ * <p>Make sure to also call IncreaseParallelism(), which will provide the
+ * biggest performance gains.</p>
+ *
+ * <p>Note: we might use more memory than memtable_memory_budget during high
+ * write rate period</p>
        + * + * @param memtableMemoryBudget memory budget in bytes + * @return the instance of the current Object. + */ + Object optimizeUniversalStyleCompaction(long memtableMemoryBudget); + + /** + * Set {@link BuiltinComparator} to be used with RocksDB. + * + * Note: Comparator can be set once upon database creation. + * + * Default: BytewiseComparator. + * @param builtinComparator a {@link BuiltinComparator} type. + * @return the instance of the current Object. + */ + Object setComparator(BuiltinComparator builtinComparator); + + /** + * Use the specified comparator for key ordering. + * + * Comparator should not be disposed before options instances using this comparator is + * disposed. If dispose() function is not called, then comparator object will be + * GC'd automatically. + * + * Comparator instance can be re-used in multiple options instances. + * + * @param comparator java instance. + * @return the instance of the current Object. + */ + Object setComparator(AbstractComparator comparator); + + /** + *
+ * <p>Set the merge operator to be used for merging two merge operands
+ * of the same key. The merge function is invoked during
+ * compaction and at lookup time, if multiple key/value pairs belonging
+ * to the same key are found in the database.</p>
        + * + * @param name the name of the merge function, as defined by + * the MergeOperators factory (see utilities/MergeOperators.h) + * The merge function is specified by name and must be one of the + * standard merge operators provided by RocksDB. The available + * operators are "put", "uint64add", "stringappend" and "stringappendtest". + * @return the instance of the current Object. + */ + public Object setMergeOperatorName(String name); + + /** + *
+ * <p>Set the merge operator to be used for merging two different key/value
+ * pairs that share the same key. The merge function is invoked during
+ * compaction and at lookup time, if multiple key/value pairs belonging
+ * to the same key are found in the database.</p>
        + * + * @param mergeOperator {@link MergeOperator} instance. + * @return the instance of the current Object. + */ + public Object setMergeOperator(MergeOperator mergeOperator); + + /** + * Amount of data to build up in memory (backed by an unsorted log + * on disk) before converting to a sorted on-disk file. + * + * Larger values increase performance, especially during bulk loads. + * Up to {@code max_write_buffer_number} write buffers may be held in memory + * at the same time, so you may wish to adjust this parameter + * to control memory usage. + * + * Also, a larger write buffer will result in a longer recovery time + * the next time the database is opened. + * + * Default: 4MB + * @param writeBufferSize the size of write buffer. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setWriteBufferSize(long writeBufferSize) + throws RocksDBException; + + /** + * Return size of write buffer size. + * + * @return size of write buffer. + * @see #setWriteBufferSize(long) + */ + long writeBufferSize(); + + /** + * The maximum number of write buffers that are built up in memory. + * The default is 2, so that when 1 write buffer is being flushed to + * storage, new writes can continue to the other write buffer. + * Default: 2 + * + * @param maxWriteBufferNumber maximum number of write buffers. + * @return the instance of the current Object. + */ + Object setMaxWriteBufferNumber( + int maxWriteBufferNumber); + + /** + * Returns maximum number of write buffers. + * + * @return maximum number of write buffers. + * @see #setMaxWriteBufferNumber(int) + */ + int maxWriteBufferNumber(); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. If set to 1, then + * all write buffers are flushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @param minWriteBufferNumberToMerge the minimum number of write buffers + * that will be merged together. + * @return the reference to the current option. + */ + Object setMinWriteBufferNumberToMerge( + int minWriteBufferNumberToMerge); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. If set to 1, then + * all write buffers are flushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @return the minimum number of write buffers that will be merged together. + */ + int minWriteBufferNumberToMerge(); + + /** + * This prefix-extractor uses the first n bytes of a key as its prefix. + * + * In some hash-based memtable representation such as HashLinkedList + * and HashSkipList, prefixes are used to partition the keys into + * several buckets. Prefix extractor is used to specify how to + * extract the prefix given a key. + * + * @param n use the first n bytes of a key as its prefix. + */ + Object useFixedLengthPrefixExtractor(int n); + + /** + * Compress blocks using the specified compression algorithm. This + * parameter can be changed dynamically. 
+ * + * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. + * + * @param compressionType Compression Type. + * @return the reference to the current option. + */ + Object setCompressionType(CompressionType compressionType); + + /** + * Compress blocks using the specified compression algorithm. This + * parameter can be changed dynamically. + * + * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. + * + * @return Compression type. + */ + CompressionType compressionType(); + + /** + * Set the number of levels for this database + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @param numLevels the number of levels. + * @return the reference to the current option. + */ + Object setNumLevels(int numLevels); + + /** + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @return the number of levels. + */ + int numLevels(); + + /** + * Number of files to trigger level-0 compaction. A value < 0 means that + * level-0 compaction will not be triggered by number of files at all. + * Default: 4 + * + * @param numFiles the number of files in level-0 to trigger compaction. + * @return the reference to the current option. + */ + Object setLevelZeroFileNumCompactionTrigger( + int numFiles); + + /** + * The number of files in level 0 to trigger compaction from level-0 to + * level-1. A value < 0 means that level-0 compaction will not be + * triggered by number of files at all. + * Default: 4 + * + * @return the number of files in level 0 to trigger compaction. + */ + int levelZeroFileNumCompactionTrigger(); + + /** + * Soft limit on number of level-0 files. We start slowing down writes at this + * point. A value < 0 means that no writing slow down will be triggered by + * number of files in level-0. + * + * @param numFiles soft limit on number of level-0 files. + * @return the reference to the current option. + */ + Object setLevelZeroSlowdownWritesTrigger( + int numFiles); + + /** + * Soft limit on the number of level-0 files. We start slowing down writes + * at this point. A value < 0 means that no writing slow down will be + * triggered by number of files in level-0. + * + * @return the soft limit on the number of level-0 files. + */ + int levelZeroSlowdownWritesTrigger(); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @param numFiles the hard limit of the number of level-0 files. + * @return the reference to the current option. + */ + Object setLevelZeroStopWritesTrigger(int numFiles); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @return the hard limit of the number of level-0 file. + */ + int levelZeroStopWritesTrigger(); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. We try to push to level 2 to avoid the + * relatively expensive level 0=>1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @param maxMemCompactionLevel the highest level to which a new compacted + * mem-table will be pushed. + * @return the reference to the current option. + */ + Object setMaxMemCompactionLevel( + int maxMemCompactionLevel); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. 
We try to push to level 2 to avoid the + * relatively expensive level 0=>1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @return the highest level where a new compacted memtable will be pushed. + */ + int maxMemCompactionLevel(); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. + * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @param targetFileSizeBase the target size of a level-0 file. + * @return the reference to the current option. + * + * @see #setTargetFileSizeMultiplier(int) + */ + Object setTargetFileSizeBase(long targetFileSizeBase); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. + * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @return the target size of a level-0 file. + * + * @see #targetFileSizeMultiplier() + */ + long targetFileSizeBase(); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-L file and level-(L+1) file. + * By default target_file_size_multiplier is 1, meaning + * files in different levels have the same target. + * + * @param multiplier the size ratio between a level-(L+1) file + * and level-L file. + * @return the reference to the current option. + */ + Object setTargetFileSizeMultiplier(int multiplier); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-(L+1) file and level-L file. + * By default targetFileSizeMultiplier is 1, meaning + * files in different levels have the same target. + * + * @return the size ratio between a level-(L+1) file and level-L file. + */ + int targetFileSizeMultiplier(); + + /** + * The upper-bound of the total size of level-1 files in bytes. + * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @return the reference to the current option. + * @see #setMaxBytesForLevelMultiplier(int) + */ + Object setMaxBytesForLevelBase( + long maxBytesForLevelBase); + + /** + * The upper-bound of the total size of level-1 files in bytes. 
+ * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @return the upper-bound of the total size of leve-1 files in bytes. + * @see #maxBytesForLevelMultiplier() + */ + long maxBytesForLevelBase(); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. + * DEFAULT: 10 + * + * @param multiplier the ratio between the total size of level-(L+1) + * files and the total size of level-L files for all L. + * @return the reference to the current option. + * @see #setMaxBytesForLevelBase(long) + */ + Object setMaxBytesForLevelMultiplier(int multiplier); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. + * DEFAULT: 10 + * + * @return the ratio between the total size of level-(L+1) files and + * the total size of level-L files for all L. + * @see #maxBytesForLevelBase() + */ + int maxBytesForLevelMultiplier(); + + /** + * Maximum number of bytes in all compacted files. We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @param expandedCompactionFactor the maximum number of bytes in all + * compacted files. + * @return the reference to the current option. + * @see #setSourceCompactionFactor(int) + */ + Object setExpandedCompactionFactor(int expandedCompactionFactor); + + /** + * Maximum number of bytes in all compacted files. We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @return the maximum number of bytes in all compacted files. + * @see #sourceCompactionFactor() + */ + int expandedCompactionFactor(); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @param sourceCompactionFactor the maximum number of bytes in all + * source files to be compacted in a single compaction run. + * @return the reference to the current option. + * @see #setExpandedCompactionFactor(int) + */ + Object setSourceCompactionFactor(int sourceCompactionFactor); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @return the maximum number of bytes in all source files to be compactedo. 
+ * @see #expandedCompactionFactor() + */ + int sourceCompactionFactor(); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @param maxGrandparentOverlapFactor maximum bytes of overlaps in + * "grandparent" level. + * @return the reference to the current option. + */ + Object setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @return maximum bytes of overlaps in "grandparent" level. + */ + int maxGrandparentOverlapFactor(); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @param softRateLimit the soft-rate-limit of a compaction score + * for put delay. + * @return the reference to the current option. + */ + Object setSoftRateLimit(double softRateLimit); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @return soft-rate-limit for put delay. + */ + double softRateLimit(); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when <= 1.0. + * Default: 0 (disabled) + * + * @param hardRateLimit the hard-rate-limit of a compaction score for put + * delay. + * @return the reference to the current option. + */ + Object setHardRateLimit(double hardRateLimit); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when <= 1.0. + * Default: 0 (disabled) + * + * @return the hard-rate-limit of a compaction score for put delay. + */ + double hardRateLimit(); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. + * Default: 1000 + * + * @param rateLimitDelayMaxMilliseconds the maximum time interval a put + * will be stalled. + * @return the reference to the current option. + */ + Object setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. + * Default: 1000 + * + * @return the maximum time interval a put will be stalled when + * hard_rate_limit is enforced. + */ + int rateLimitDelayMaxMilliseconds(); + + /** + * The size of one block in arena memory allocation. + * If <= 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). + * + * There are two additonal restriction of the The specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @param arenaBlockSize the size of an arena block + * @return the reference to the current option. 
+ * @throws org.rocksdb.RocksDBException + */ + Object setArenaBlockSize(long arenaBlockSize) + throws RocksDBException; + + /** + * The size of one block in arena memory allocation. + * If <= 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). + * + * There are two additonal restriction of the The specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @return the size of an arena block + */ + long arenaBlockSize(); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @param disableAutoCompactions true if auto-compactions are disabled. + * @return the reference to the current option. + */ + Object setDisableAutoCompactions(boolean disableAutoCompactions); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @return true if auto-compactions are disabled. + */ + boolean disableAutoCompactions(); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @param purgeRedundantKvsWhileFlush true if purging keys is disabled. + * @return the reference to the current option. + */ + Object setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @return true if purging keys is disabled. + */ + boolean purgeRedundantKvsWhileFlush(); + + /** + * Set compaction style for DB. + * + * Default: LEVEL. + * + * @param compactionStyle Compaction style. + * @return the reference to the current option. + */ + Object setCompactionStyle(CompactionStyle compactionStyle); + + /** + * Compaction style for DB. + * + * @return Compaction style. + */ + CompactionStyle compactionStyle(); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @param verifyChecksumsInCompaction true if compaction verifies + * checksum on every read. + * @return the reference to the current option. + */ + Object setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @return true if compaction verifies checksum on every read. + */ + boolean verifyChecksumsInCompaction(); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @param filterDeletes true if filter-deletes behavior is on. + * @return the reference to the current option. + */ + Object setFilterDeletes(boolean filterDeletes); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @return true if filter-deletes behavior is on. 
+ */ + boolean filterDeletes(); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. + * Default: 8 + * + * @param maxSequentialSkipInIterations the number of keys could + * be skipped in a iteration. + * @return the reference to the current option. + */ + Object setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. + * Default: 8 + * + * @return the number of keys could be skipped in a iteration. + */ + long maxSequentialSkipInIterations(); + + /** + * Set the config for mem-table. + * + * @param config the mem-table config. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setMemTableConfig(MemTableConfig config) + throws RocksDBException; + + /** + * Returns the name of the current mem table representation. + * Memtable format can be set using setTableFormatConfig. + * + * @return the name of the currently-used memtable factory. + * @see #setTableFormatConfig(org.rocksdb.TableFormatConfig) + */ + String memTableFactoryName(); + + /** + * Set the config for table format. + * + * @param config the table format config. + * @return the reference of the current Options. + */ + Object setTableFormatConfig(TableFormatConfig config); + + /** + * @return the name of the currently used table factory. + */ + String tableFactoryName(); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) <= sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. + * + * @param inplaceUpdateSupport true if thread-safe inplace updates + * are allowed. + * @return the reference to the current option. + */ + Object setInplaceUpdateSupport(boolean inplaceUpdateSupport); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) <= sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. + * + * @return true if thread-safe inplace updates are allowed. + */ + boolean inplaceUpdateSupport(); + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @param inplaceUpdateNumLocks the number of locks used for + * inplace updates. + * @return the reference to the current option. + * @throws org.rocksdb.RocksDBException + */ + Object setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException; + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @return the number of locks used for inplace update. + */ + long inplaceUpdateNumLocks(); + + /** + * Sets the number of bits used in the prefix bloom filter. 
+ * + * This value will be used only when a prefix-extractor is specified. + * + * @param memtablePrefixBloomBits the number of bits used in the + * prefix bloom filter. + * @return the reference to the current option. + */ + Object setMemtablePrefixBloomBits(int memtablePrefixBloomBits); + + /** + * Returns the number of bits used in the prefix bloom filter. + * + * This value will be used only when a prefix-extractor is specified. + * + * @return the number of bloom-bits. + * @see #useFixedLengthPrefixExtractor(int) + */ + int memtablePrefixBloomBits(); + + /** + * The number of hash probes per key used in the mem-table. + * + * @param memtablePrefixBloomProbes the number of hash probes per key. + * @return the reference to the current option. + */ + Object setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes); + + /** + * The number of hash probes per key used in the mem-table. + * + * @return the number of hash probes per key. + */ + int memtablePrefixBloomProbes(); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @param bloomLocality the level of locality of bloom-filter probes. + * @return the reference to the current option. + */ + Object setBloomLocality(int bloomLocality); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @return the level of locality of bloom-filter probes. + * @see #setMemtablePrefixBloomProbes(int) + */ + int bloomLocality(); + + /** + * Maximum number of successive merge operations on a key in the memtable. + * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. + * + * Default: 0 (disabled) + * + * @param maxSuccessiveMerges the maximum number of successive merges. + * @return the reference to the current option. + * @throws org.rocksdb.RocksDBException + */ + Object setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException; + + /** + * Maximum number of successive merge operations on a key in the memtable. + * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. 
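As a usage illustration for the memtable-related setters documented above (prefix bloom bits and probes, max successive merges), here is a hedged sketch; it assumes Options wires these ColumnFamilyOptionsInterface methods through, as the later Options.java hunk of this patch declares, and the chosen values are arbitrary.

import org.rocksdb.Options;
import org.rocksdb.RocksDBException;

public class MemtableTuningSketch {
  public static void main(String[] args) throws RocksDBException {
    Options options = new Options();
    // Memtable prefix bloom filter: only consulted once a prefix extractor
    // has been configured (see useFixedLengthPrefixExtractor(int)).
    options.setMemtablePrefixBloomBits(8 * 1024);
    options.setMemtablePrefixBloomProbes(6);
    // Collapse long chains of merge operands while they are still in the
    // memtable; setMaxSuccessiveMerges may throw RocksDBException.
    options.setMaxSuccessiveMerges(100);
    options.dispose();
  }
}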
+ * + * Default: 0 (disabled) + * + * @return the maximum number of successive merges. + */ + long maxSuccessiveMerges(); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. + * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @param minPartialMergeOperands min partial merge operands + * @return the reference to the current option. + */ + Object setMinPartialMergeOperands(int minPartialMergeOperands); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. + * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @return min partial merge operands + */ + int minPartialMergeOperands(); + + /** + * Default memtable memory budget used with the following methods: + * + *
<ul>
 + *   <li>{@link #optimizeLevelStyleCompaction()}</li>
 + *   <li>{@link #optimizeUniversalStyleCompaction()}</li>
 + * </ul>
        + */ + long DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET = 512 * 1024 * 1024; +} diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java new file mode 100644 index 000000000..35c65eed2 --- /dev/null +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -0,0 +1,763 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public interface DBOptionsInterface { + + /** + * If this value is set to true, then the database will be created + * if it is missing during {@code RocksDB.open()}. + * Default: false + * + * @param flag a flag indicating whether to create a database the + * specified database in {@link RocksDB#open(org.rocksdb.Options, String)} operation + * is missing. + * @return the instance of the current Options + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object setCreateIfMissing(boolean flag); + + /** + * Return true if the create_if_missing flag is set to true. + * If true, the database will be created if it is missing. + * + * @return true if the createIfMissing option is set to true. + * @see #setCreateIfMissing(boolean) + */ + boolean createIfMissing(); + + /** + *
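To make the create_if_missing contract above concrete, here is a small, hedged usage sketch; the database path is arbitrary and resource cleanup is reduced to the dispose() calls that RocksObject documents elsewhere in this patch.

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class CreateIfMissingSketch {
  public static void main(String[] args) {
    Options options = new Options().setCreateIfMissing(true);
    RocksDB db = null;
    try {
      // Creates the database at the given path if it does not exist yet.
      db = RocksDB.open(options, "/tmp/rocksdb-sketch");
    } catch (RocksDBException e) {
      // Thrown when the database could neither be opened nor created.
      e.printStackTrace();
    } finally {
      if (db != null) {
        db.dispose();
      }
      options.dispose();
    }
  }
}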

If true, missing column families will be automatically created.
 + *
 + * Default: false
        + * + * @param flag a flag indicating if missing column families shall be + * created automatically. + * @return true if missing column families shall be created automatically + * on open. + */ + Object setCreateMissingColumnFamilies(boolean flag); + + /** + * Return true if the create_missing_column_families flag is set + * to true. If true column families be created if missing. + * + * @return true if the createMissingColumnFamilies is set to + * true. + * @see #setCreateMissingColumnFamilies(boolean) + */ + boolean createMissingColumnFamilies(); + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * Default: false + * + * @param errorIfExists if true, an exception will be thrown + * during {@code RocksDB.open()} if the database already exists. + * @return the reference to the current option. + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object setErrorIfExists(boolean errorIfExists); + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * + * @return if true, an error is raised when the specified database + * already exists before open. + */ + boolean errorIfExists(); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors. This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * Default: true + * + * @param paranoidChecks a flag to indicate whether paranoid-check + * is on. + * @return the reference to the current option. + */ + Object setParanoidChecks(boolean paranoidChecks); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors. This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * + * @return a boolean indicating whether paranoid-check is on. + */ + boolean paranoidChecks(); + + /** + * Use to control write rate of flush and compaction. Flush has higher + * priority than compaction. Rate limiting is disabled if nullptr. + * Default: nullptr + * + * @param config rate limiter config. + * @return the instance of the current Object. + */ + Object setRateLimiterConfig(RateLimiterConfig config); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * Default: 5000 + * + * @param maxOpenFiles the maximum number of open files. + * @return the reference to the current DBOptions. + */ + Object setMaxOpenFiles(int maxOpenFiles); + + /** + * Number of open files that can be used by the DB. 
You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * + * @return the maximum number of open files. + */ + int maxOpenFiles(); + + /** + *

Once write-ahead logs exceed this size, we will start forcing the
 + * flush of column families whose memtables are backed by the oldest live
 + * WAL file (i.e. the ones that are causing all the space amplification).
 + *
 + * If set to 0 (default), we will dynamically choose the WAL size limit to
 + * be [sum of all write_buffer_size * max_write_buffer_number] * 2.
 + *
 + * Default: 0
        + */ + Object setMaxTotalWalSize(long maxTotalWalSize); + + /** + *
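Since the dynamically chosen default above is a formula rather than a constant, here is a tiny sketch of the implied limit for a single column family; the write buffer defaults (4MB, 2 buffers) are the ones quoted later in this patch, and the helper is illustrative only.

// Illustrative sketch: the default WAL size limit described above,
// [sum of all write_buffer_size * max_write_buffer_number] * 2,
// evaluated for a single column family with the documented defaults.
public class WalSizeLimitSketch {
  public static void main(String[] args) {
    long writeBufferSize = 4L << 20;   // 4MB, the documented write buffer default
    int maxWriteBufferNumber = 2;      // the documented default number of buffers
    long impliedWalLimit = writeBufferSize * maxWriteBufferNumber * 2;
    System.out.println(impliedWalLimit); // 16777216 bytes, i.e. 16MB
  }
}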

Returns the max total wal size. Once write-ahead logs exceed this size,
 + * we will start forcing the flush of column families whose memtables are
 + * backed by the oldest live WAL file (i.e. the ones that are causing all
 + * the space amplification).
 + *
 + * If set to 0 (default), we will dynamically choose the WAL size limit
 + * to be [sum of all write_buffer_size * max_write_buffer_number] * 2.
        + * + * @return max total wal size + */ + long maxTotalWalSize(); + + /** + *

Creates statistics object which collects metrics about database operations.
 + * Statistics objects should not be shared between DB instances as
 + * it does not use any locks to prevent concurrent updates.
        + * + * @return the instance of the current Object. + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object createStatistics(); + + /** + *

Returns statistics object. Calls {@link #createStatistics()} if
 + * C++ returns {@code nullptr} for statistics.
        + * + * @return the instance of the statistics object. + * @see #createStatistics() + */ + Statistics statisticsPtr(); + + /** + *
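A short, hedged usage sketch for the two statistics methods above; it only enables collection and fetches the handle, since the Statistics API itself is outside this patch.

import org.rocksdb.Options;
import org.rocksdb.Statistics;

public class StatisticsSketch {
  public static void main(String[] args) {
    Options options = new Options();
    // Enable metric collection; per the javadoc, a statistics object must not
    // be shared between DB instances.
    options.createStatistics();
    Statistics stats = options.statisticsPtr();
    // ... open a RocksDB instance with these options and inspect `stats` later.
    options.dispose();
  }
}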

If true, then the contents of manifest and data files are
 + * not synced to stable storage. Their contents remain in the
 + * OS buffers till the OS decides to flush them.
 + *
 + * This option is good for bulk-loading of data.
 + *
 + * Once the bulk-loading is complete, please issue a sync to
 + * the OS to flush all dirty buffers to stable storage.
 + *
 + * Default: false
        + * + * @param disableDataSync a boolean flag to specify whether to + * disable data sync. + * @return the reference to the current DBOptions. + */ + Object setDisableDataSync(boolean disableDataSync); + + /** + * If true, then the contents of data files are not synced + * to stable storage. Their contents remain in the OS buffers till the + * OS decides to flush them. This option is good for bulk-loading + * of data. Once the bulk-loading is complete, please issue a + * sync to the OS to flush all dirty buffers to stable storage. + * + * @return if true, then data-sync is disabled. + */ + boolean disableDataSync(); + + /** + *

If true, then every store to stable storage will issue a fsync.
 + *
 + * If false, then every store to stable storage will issue a fdatasync.
 + * This parameter should be set to true while storing data to
 + * filesystem like ext3 that can lose files after a reboot.
 + *
 + * Default: false
        + * + * @param useFsync a boolean flag to specify whether to use fsync + * @return the instance of the current Object. + */ + Object setUseFsync(boolean useFsync); + + /** + *
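The two durability knobs above are independent, so here is a hedged configuration sketch showing both; the values simply echo the recommendations in the javadoc (buffered writes for bulk loading, fsync on filesystems such as ext3).

import org.rocksdb.Options;

public class DurabilitySketch {
  public static void main(String[] args) {
    Options options = new Options();
    // Bulk loading: leave data-file contents in OS buffers and issue one sync
    // to the OS once loading has finished, as the javadoc above recommends.
    options.setDisableDataSync(true);
    // On filesystems such as ext3 that can lose files after a reboot, prefer
    // a full fsync over fdatasync for every store to stable storage.
    options.setUseFsync(true);
    options.dispose();
  }
}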

If true, then every store to stable storage will issue a fsync.
 + *
 + * If false, then every store to stable storage will issue a fdatasync.
 + * This parameter should be set to true while storing data to
 + * filesystem like ext3 that can lose files after a reboot.
        + * + * @return boolean value indicating if fsync is used. + */ + boolean useFsync(); + + /** + * This specifies the info LOG dir. + * If it is empty, the log files will be in the same dir as data. + * If it is non empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @param dbLogDir the path to the info log directory + * @return the instance of the current Object. + */ + Object setDbLogDir(String dbLogDir); + + /** + * Returns the directory of info log. + * + * If it is empty, the log files will be in the same dir as data. + * If it is non empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @return the path to the info log directory + */ + String dbLogDir(); + + /** + * This specifies the absolute dir path for write-ahead logs (WAL). + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default + * If it is non empty, the log files will be in kept the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself is deleted + * + * @param walDir the path to the write-ahead-log directory. + * @return the instance of the current Object. + */ + Object setWalDir(String walDir); + + /** + * Returns the path to the write-ahead-logs (WAL) directory. + * + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default + * If it is non empty, the log files will be in kept the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself is deleted + * + * @return the path to the write-ahead-logs (WAL) directory. + */ + String walDir(); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @param micros the time interval in micros + * @return the instance of the current Object. + */ + Object setDeleteObsoleteFilesPeriodMicros(long micros); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @return the time interval in micros when obsolete files will be deleted. + */ + long deleteObsoleteFilesPeriodMicros(); + + /** + * Specifies the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * If you're increasing this, also consider increasing number of threads in + * LOW priority thread pool. For more information, see + * Default: 1 + * + * @param maxBackgroundCompactions the maximum number of background + * compaction jobs. + * @return the instance of the current Object. + * + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundFlushes() + */ + Object setMaxBackgroundCompactions(int maxBackgroundCompactions); + + /** + * Returns the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * When increasing this number, we may also want to consider increasing + * number of threads in LOW priority thread pool. 
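To illustrate the advice above about sizing the LOW priority pool together with the compaction limit, here is a hedged sketch; it assumes the single-argument RocksEnv#setBackgroundThreads overload targets the LOW priority pool, which is how the surrounding javadoc pairs the two calls.

import org.rocksdb.Options;
import org.rocksdb.RocksEnv;

public class BackgroundJobsSketch {
  public static void main(String[] args) {
    Options options = new Options();
    // Allow up to four concurrent background compactions...
    options.setMaxBackgroundCompactions(4);
    // ...and grow the shared environment's LOW priority thread pool to match.
    RocksEnv.getDefault().setBackgroundThreads(4);
    options.dispose();
  }
}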
+ * Default: 1 + * + * @return the maximum number of concurrent background compaction jobs. + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + */ + int maxBackgroundCompactions(); + + /** + * Specifies the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. For more information, see + * Default: 1 + * + * @param maxBackgroundFlushes number of max concurrent flush jobs + * @return the instance of the current Object. + * + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundCompactions() + */ + Object setMaxBackgroundFlushes(int maxBackgroundFlushes); + + /** + * Returns the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. For more information, see + * Default: 1 + * + * @return the maximum number of concurrent background flush jobs. + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + */ + int maxBackgroundFlushes(); + + /** + * Specifies the maximum size of a info log file. If the current log file + * is larger than `max_log_file_size`, a new info log file will + * be created. + * If 0, all logs will be written to one log file. + * + * @param maxLogFileSize the maximum size of a info log file. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException; + + /** + * Returns the maximum size of a info log file. If the current log file + * is larger than this size, a new info log file will be created. + * If 0, all logs will be written to one log file. + * + * @return the maximum size of the info log file. + */ + long maxLogFileSize(); + + /** + * Specifies the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @param logFileTimeToRoll the time interval in seconds. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException; + + /** + * Returns the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @return the time interval in seconds. + */ + long logFileTimeToRoll(); + + /** + * Specifies the maximum number of info log files to be kept. + * Default: 1000 + * + * @param keepLogFileNum the maximum number of info log files to be kept. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException; + + /** + * Returns the maximum number of info log files to be kept. + * Default: 1000 + * + * @return the maximum number of info log files to be kept. + */ + long keepLogFileNum(); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @param maxManifestFileSize the size limit of a manifest file. + * @return the instance of the current Object. 
+ */ + Object setMaxManifestFileSize(long maxManifestFileSize); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @return the size limit of a manifest file. + */ + long maxManifestFileSize(); + + /** + * Number of shards used for table cache. + * + * @param tableCacheNumshardbits the number of chards + * @return the instance of the current Object. + */ + Object setTableCacheNumshardbits(int tableCacheNumshardbits); + + /** + * Number of shards used for table cache. + * + * @return the number of shards used for table cache. + */ + int tableCacheNumshardbits(); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space releases after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order. + * + * @param limit scan count limit + * @return the instance of the current Object. + */ + Object setTableCacheRemoveScanCountLimit(int limit); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space releases after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order. + * + * @return scan count limit + */ + int tableCacheRemoveScanCountLimit(); + + /** + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs + * will be deleted. + *
          + *
<ol>
 + *   <li>If both set to 0, logs will be deleted asap and will not get into
 + *   the archive.</li>
 + *   <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
 + *   WAL files will be checked every 10 min and if total size is greater
 + *   than WAL_size_limit_MB, they will be deleted starting with the
 + *   earliest until size_limit is met. All empty files will be deleted.</li>
 + *   <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
 + *   WAL files will be checked every WAL_ttl_seconds / 2 and those that
 + *   are older than WAL_ttl_seconds will be deleted.</li>
 + *   <li>If both are not 0, WAL files will be checked every 10 min and both
 + *   checks will be performed with ttl being first.</li>
 + * </ol>
 + *
 + * @param walTtlSeconds the ttl seconds
 + * @return the instance of the current Object.
 + * @see #setWalSizeLimitMB(long)
 + */
 + Object setWalTtlSeconds(long walTtlSeconds);
 +
 + /**
 + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs
 + * will be deleted.
 + *
            + *
<ol>
 + *   <li>If both set to 0, logs will be deleted asap and will not get into
 + *   the archive.</li>
 + *   <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
 + *   WAL files will be checked every 10 min and if total size is greater
 + *   than WAL_size_limit_MB, they will be deleted starting with the
 + *   earliest until size_limit is met. All empty files will be deleted.</li>
 + *   <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
 + *   WAL files will be checked every WAL_ttl_seconds / 2 and those that
 + *   are older than WAL_ttl_seconds will be deleted.</li>
 + *   <li>If both are not 0, WAL files will be checked every 10 min and both
 + *   checks will be performed with ttl being first.</li>
 + * </ol>
          + * + * @return the wal-ttl seconds + * @see #walSizeLimitMB() + */ + long walTtlSeconds(); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + *
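As a usage illustration of the TTL/size interaction listed above, here is a hedged sketch; it assumes Options exposes these DBOptionsInterface setters, as the implements clause later in this patch indicates, and the one-hour/64MB values are arbitrary.

import org.rocksdb.Options;

public class WalArchivalSketch {
  public static void main(String[] args) {
    Options options = new Options();
    // Both limits non-zero: archived WAL files are checked every 10 minutes,
    // with the TTL check applied first (case 4 in the list above).
    options.setWalTtlSeconds(60 * 60); // keep archived logs for one hour
    options.setWalSizeLimitMB(64);     // and cap the archive at 64 MB
    options.dispose();
  }
}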
            + *
<ol>
 + *   <li>If both set to 0, logs will be deleted asap and will not get into
 + *   the archive.</li>
 + *   <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
 + *   WAL files will be checked every 10 min and if total size is greater
 + *   than WAL_size_limit_MB, they will be deleted starting with the
 + *   earliest until size_limit is met. All empty files will be deleted.</li>
 + *   <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
 + *   WAL files will be checked every WAL_ttl_seconds / 2 and those that
 + *   are older than WAL_ttl_seconds will be deleted.</li>
 + *   <li>If both are not 0, WAL files will be checked every 10 min and both
 + *   checks will be performed with ttl being first.</li>
 + * </ol>
 + *
 + * @param sizeLimitMB size limit in mega-bytes.
 + * @return the instance of the current Object.
 + * @see #setWalTtlSeconds(long)
 + */
 + Object setWalSizeLimitMB(long sizeLimitMB);
 +
 + /**
 + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs
 + * will be deleted.
 + *
              + *
<ol>
 + *   <li>If both set to 0, logs will be deleted asap and will not get into
 + *   the archive.</li>
 + *   <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
 + *   WAL files will be checked every 10 min and if total size is greater
 + *   than WAL_size_limit_MB, they will be deleted starting with the
 + *   earliest until size_limit is met. All empty files will be deleted.</li>
 + *   <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
 + *   WAL files will be checked every WAL_ttl_seconds / 2 and those that
 + *   are older than WAL_ttl_seconds will be deleted.</li>
 + *   <li>If both are not 0, WAL files will be checked every 10 min and both
 + *   checks will be performed with ttl being first.</li>
 + * </ol>
 + *
            + * @return size limit in mega-bytes. + * @see #walSizeLimitMB() + */ + long walSizeLimitMB(); + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @param size the size in byte + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException + */ + Object setManifestPreallocationSize(long size) + throws RocksDBException; + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @return size in bytes. + */ + long manifestPreallocationSize(); + + /** + * Data being read from file storage may be buffered in the OS + * Default: true + * + * @param allowOsBuffer if true, then OS buffering is allowed. + * @return the instance of the current Object. + */ + Object setAllowOsBuffer(boolean allowOsBuffer); + + /** + * Data being read from file storage may be buffered in the OS + * Default: true + * + * @return if true, then OS buffering is allowed. + */ + boolean allowOsBuffer(); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @param allowMmapReads true if mmap reads are allowed. + * @return the instance of the current Object. + */ + Object setAllowMmapReads(boolean allowMmapReads); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @return true if mmap reads are allowed. + */ + boolean allowMmapReads(); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @param allowMmapWrites true if mmap writes are allowd. + * @return the instance of the current Object. + */ + Object setAllowMmapWrites(boolean allowMmapWrites); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @return true if mmap writes are allowed. + */ + boolean allowMmapWrites(); + + /** + * Disable child process inherit open files. Default: true + * + * @param isFdCloseOnExec true if child process inheriting open + * files is disabled. + * @return the instance of the current Object. + */ + Object setIsFdCloseOnExec(boolean isFdCloseOnExec); + + /** + * Disable child process inherit open files. Default: true + * + * @return true if child process inheriting open files is disabled. + */ + boolean isFdCloseOnExec(); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @param skip true if log corruption errors are skipped during recovery. + * @return the instance of the current Object. + */ + Object setSkipLogErrorOnRecovery(boolean skip); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @return true if log corruption errors are skipped during recovery. + */ + boolean skipLogErrorOnRecovery(); + + /** + * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 3600 (1 hour) + * + * @param statsDumpPeriodSec time interval in seconds. + * @return the instance of the current Object. 
+ */ + Object setStatsDumpPeriodSec(int statsDumpPeriodSec); + + /** + * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 3600 (1 hour) + * + * @return time interval in seconds. + */ + int statsDumpPeriodSec(); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when a sst file is opened. + * Default: true + * + * @param adviseRandomOnOpen true if hinting random access is on. + * @return the instance of the current Object. + */ + Object setAdviseRandomOnOpen(boolean adviseRandomOnOpen); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when a sst file is opened. + * Default: true + * + * @return true if hinting random access is on. + */ + boolean adviseRandomOnOpen(); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the instance of the current Object. + */ + Object setUseAdaptiveMutex(boolean useAdaptiveMutex); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + boolean useAdaptiveMutex(); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @param bytesPerSync size in bytes + * @return the instance of the current Object. + */ + Object setBytesPerSync(long bytesPerSync); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @return size in bytes + */ + long bytesPerSync(); +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index a3ea38f7c..16db5e166 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -12,20 +12,11 @@ package org.rocksdb; * If {@link #dispose()} function is not called, then it will be GC'd automatically * and native resources will be released as part of the process. */ -public class Options extends RocksObject { +public class Options extends RocksObject + implements DBOptionsInterface, ColumnFamilyOptionsInterface { static { RocksDB.loadLibrary(); } - static final long DEFAULT_CACHE_SIZE = 8 << 20; - static final int DEFAULT_NUM_SHARD_BITS = -1; - - /** - * Builtin RocksDB comparators - */ - public enum BuiltinComparator { - BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR; - } - /** * Construct options for opening a RocksDB. * @@ -34,38 +25,18 @@ public class Options extends RocksObject { */ public Options() { super(); - cacheSize_ = DEFAULT_CACHE_SIZE; - numShardBits_ = DEFAULT_NUM_SHARD_BITS; newOptions(); env_ = RocksEnv.getDefault(); } - /** - * If this value is set to true, then the database will be created - * if it is missing during {@code RocksDB.open()}. 
- * Default: false - * - * @param flag a flag indicating whether to create a database the - * specified database in {@link org.rocksdb.RocksDB#open(Options, String)} operation - * is missing. - * @return the instance of the current Options - * @see org.rocksdb.RocksDB#open(Options, String) - */ + @Override public Options setCreateIfMissing(boolean flag) { assert(isInitialized()); setCreateIfMissing(nativeHandle_, flag); return this; } - /** - *

If true, missing column families will be automatically created
 - *
 - * Default: false
            - * - * @param flag - * @return true if missing column families shall be created automatically - * on open. - */ + @Override public Options setCreateMissingColumnFamilies(boolean flag) { assert(isInitialized()); setCreateMissingColumnFamilies(nativeHandle_, flag); @@ -83,71 +54,105 @@ public class Options extends RocksObject { env_ = env; return this; } - private native void setEnv(long optHandle, long envHandle); public RocksEnv getEnv() { return env_; } - private native long getEnvHandle(long handle); /** - * Return true if the create_if_missing flag is set to true. - * If true, the database will be created if it is missing. + * Set appropriate parameters for bulk loading. + * The reason that this is a function that returns "this" instead of a + * constructor is to enable chaining of multiple similar calls in the future. * - * @return true if the createIfMissing option is set to true. - * @see #setCreateIfMissing(boolean) + * All data will be in level 0 without any automatic compaction. + * It's recommended to manually call CompactRange(NULL, NULL) before reading + * from the database, because otherwise the read can be very slow. + * + * @return the instance of the current Options. */ + public Options prepareForBulkLoad() { + prepareForBulkLoad(nativeHandle_); + return this; + } + + @Override public boolean createIfMissing() { assert(isInitialized()); return createIfMissing(nativeHandle_); } - /** - * Return true if the create_missing_column_families flag is set - * to true. If true column families be created if missing. - * - * @return true if the createMissingColumnFamilies is set to - * true. - * @see #setCreateMissingColumnFamilies(boolean) - */ + @Override public boolean createMissingColumnFamilies() { assert(isInitialized()); return createMissingColumnFamilies(nativeHandle_); } - /** - * Set {@link org.rocksdb.Options.BuiltinComparator} to be used with RocksDB. - * - * Note: Comparator can be set once upon database creation. - * - * Default: BytewiseComparator. - * @param builtinComparator a {@link org.rocksdb.Options.BuiltinComparator} type. 
- */ - public void setBuiltinComparator(BuiltinComparator builtinComparator) { + @Override + public Options optimizeForPointLookup( + long blockCacheSizeMb) { + optimizeForPointLookup(nativeHandle_, + blockCacheSizeMb); + return this; + } + + @Override + public Options optimizeLevelStyleCompaction() { + optimizeLevelStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public Options optimizeLevelStyleCompaction( + long memtableMemoryBudget) { + optimizeLevelStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public Options optimizeUniversalStyleCompaction() { + optimizeUniversalStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public Options optimizeUniversalStyleCompaction( + long memtableMemoryBudget) { + optimizeUniversalStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public Options setComparator(BuiltinComparator builtinComparator) { assert(isInitialized()); - setBuiltinComparator(nativeHandle_, builtinComparator.ordinal()); + setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); + return this; } - private native void setBuiltinComparator(long handle, int builtinComparator); + @Override + public Options setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } - /** - * Amount of data to build up in memory (backed by an unsorted log - * on disk) before converting to a sorted on-disk file. - * - * Larger values increase performance, especially during bulk loads. - * Up to {@code max_write_buffer_number} write buffers may be held in memory - * at the same time, so you may wish to adjust this parameter - * to control memory usage. - * - * Also, a larger write buffer will result in a longer recovery time - * the next time the database is opened. - * - * Default: 4MB - * @param writeBufferSize the size of write buffer. - * @return the instance of the current Options. - * @see org.rocksdb.RocksDB#open(Options, String) - * @throws RocksDBException - */ + @Override + public Options setMergeOperatorName(String name) { + setMergeOperatorName(nativeHandle_, name); + return this; + } + + @Override + public Options setMergeOperator(MergeOperator mergeOperator) { + setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); + return this; + } + + @Override public Options setWriteBufferSize(long writeBufferSize) throws RocksDBException { assert(isInitialized()); @@ -155,372 +160,156 @@ public class Options extends RocksObject { return this; } - /** - * Return size of write buffer size. - * - * @return size of write buffer. - * @see #setWriteBufferSize(long) - */ + @Override public long writeBufferSize() { assert(isInitialized()); return writeBufferSize(nativeHandle_); } - /** - * The maximum number of write buffers that are built up in memory. - * The default is 2, so that when 1 write buffer is being flushed to - * storage, new writes can continue to the other write buffer. - * Default: 2 - * - * @param maxWriteBufferNumber maximum number of write buffers. - * @return the instance of the current Options. 
- * @see org.rocksdb.RocksDB#open(Options, String) - */ + @Override public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) { assert(isInitialized()); setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); return this; } - /** - * Returns maximum number of write buffers. - * - * @return maximum number of write buffers. - * @see #setMaxWriteBufferNumber(int) - */ + @Override public int maxWriteBufferNumber() { assert(isInitialized()); return maxWriteBufferNumber(nativeHandle_); } - /** - * Use the specified comparator for key ordering. - * - * Comparator should not be disposed before options instances using this comparator is - * disposed. If dispose() function is not called, then comparator object will be - * GC'd automatically. - * - * Comparator instance can be re-used in multiple options instances. - * - * @param comparator java instance. - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setComparator(AbstractComparator comparator) { - assert (isInitialized()); - setComparatorHandle(nativeHandle_, comparator.nativeHandle_); - comparator_ = comparator; - return this; - } - private native void setComparatorHandle(long optHandle, long comparatorHandle); - - /** - * If true, an error will be thrown during RocksDB.open() if the - * database already exists. - * - * @return if true, an error is raised when the specified database - * already exists before open. - */ + @Override public boolean errorIfExists() { assert(isInitialized()); return errorIfExists(nativeHandle_); } - private native boolean errorIfExists(long handle); - /** - * If true, an error will be thrown during RocksDB.open() if the - * database already exists. - * Default: false - * - * @param errorIfExists if true, an exception will be thrown - * during {@code RocksDB.open()} if the database already exists. - * @return the reference to the current option. - * @see org.rocksdb.RocksDB#open(Options, String) - */ + @Override public Options setErrorIfExists(boolean errorIfExists) { assert(isInitialized()); setErrorIfExists(nativeHandle_, errorIfExists); return this; } - private native void setErrorIfExists(long handle, boolean errorIfExists); - /** - * If true, the implementation will do aggressive checking of the - * data it is processing and will stop early if it detects any - * errors. This may have unforeseen ramifications: for example, a - * corruption of one DB entry may cause a large number of entries to - * become unreadable or for the entire DB to become unopenable. - * If any of the writes to the database fails (Put, Delete, Merge, Write), - * the database will switch to read-only mode and fail all other - * Write operations. - * - * @return a boolean indicating whether paranoid-check is on. - */ + @Override public boolean paranoidChecks() { assert(isInitialized()); return paranoidChecks(nativeHandle_); } - private native boolean paranoidChecks(long handle); - /** - * If true, the implementation will do aggressive checking of the - * data it is processing and will stop early if it detects any - * errors. This may have unforeseen ramifications: for example, a - * corruption of one DB entry may cause a large number of entries to - * become unreadable or for the entire DB to become unopenable. - * If any of the writes to the database fails (Put, Delete, Merge, Write), - * the database will switch to read-only mode and fail all other - * Write operations. - * Default: true - * - * @param paranoidChecks a flag to indicate whether paranoid-check - * is on. 
- * @return the reference to the current option. - */ + @Override public Options setParanoidChecks(boolean paranoidChecks) { assert(isInitialized()); setParanoidChecks(nativeHandle_, paranoidChecks); return this; } - private native void setParanoidChecks( - long handle, boolean paranoidChecks); - /** - * Number of open files that can be used by the DB. You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on {@code target_file_size_base} and {@code target_file_size_multiplier} - * for level-based compaction. For universal-style compaction, you can usually - * set it to -1. - * - * @return the maximum number of open files. - */ + @Override public int maxOpenFiles() { assert(isInitialized()); return maxOpenFiles(nativeHandle_); } - private native int maxOpenFiles(long handle); - /** - * Number of open files that can be used by the DB. You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on {@code target_file_size_base} and {@code target_file_size_multiplier} - * for level-based compaction. For universal-style compaction, you can usually - * set it to -1. - * Default: 5000 - * - * @param maxOpenFiles the maximum number of open files. - * @return the reference to the current option. - */ + @Override + public Options setMaxTotalWalSize(long maxTotalWalSize) { + assert(isInitialized()); + setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); + return this; + } + + @Override + public long maxTotalWalSize() { + assert(isInitialized()); + return maxTotalWalSize(nativeHandle_); + } + + @Override public Options setMaxOpenFiles(int maxOpenFiles) { assert(isInitialized()); setMaxOpenFiles(nativeHandle_, maxOpenFiles); return this; } - private native void setMaxOpenFiles(long handle, int maxOpenFiles); - /** - * If true, then the contents of data files are not synced - * to stable storage. Their contents remain in the OS buffers till the - * OS decides to flush them. This option is good for bulk-loading - * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffers to stable storage. - * - * @return if true, then data-sync is disabled. - */ + @Override public boolean disableDataSync() { assert(isInitialized()); return disableDataSync(nativeHandle_); } - private native boolean disableDataSync(long handle); - /** - * If true, then the contents of data files are not synced - * to stable storage. Their contents remain in the OS buffers till the - * OS decides to flush them. This option is good for bulk-loading - * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffers to stable storage. - * Default: false - * - * @param disableDataSync a boolean flag to specify whether to - * disable data sync. - * @return the reference to the current option. - */ + @Override public Options setDisableDataSync(boolean disableDataSync) { assert(isInitialized()); setDisableDataSync(nativeHandle_, disableDataSync); return this; } - private native void setDisableDataSync(long handle, boolean disableDataSync); - /** - * If true, then every store to stable storage will issue a fsync. - * If false, then every store to stable storage will issue a fdatasync. - * This parameter should be set to true while storing data to - * filesystem like ext3 that can lose files after a reboot. 
- * - * @return boolean value indicating if fsync is used. - */ + @Override public boolean useFsync() { assert(isInitialized()); return useFsync(nativeHandle_); } - private native boolean useFsync(long handle); - /** - * If true, then every store to stable storage will issue a fsync. - * If false, then every store to stable storage will issue a fdatasync. - * This parameter should be set to true while storing data to - * filesystem like ext3 that can lose files after a reboot. - * Default: false - * - * @param useFsync a boolean flag to specify whether to use fsync - * @return the reference to the current option. - */ + @Override public Options setUseFsync(boolean useFsync) { assert(isInitialized()); setUseFsync(nativeHandle_, useFsync); return this; } - private native void setUseFsync(long handle, boolean useFsync); - /** - * Returns the directory of info log. - * - * If it is empty, the log files will be in the same dir as data. - * If it is non empty, the log files will be in the specified dir, - * and the db data dir's absolute path will be used as the log file - * name's prefix. - * - * @return the path to the info log directory - */ + @Override public String dbLogDir() { assert(isInitialized()); return dbLogDir(nativeHandle_); } - private native String dbLogDir(long handle); - /** - * This specifies the info LOG dir. - * If it is empty, the log files will be in the same dir as data. - * If it is non empty, the log files will be in the specified dir, - * and the db data dir's absolute path will be used as the log file - * name's prefix. - * - * @param dbLogDir the path to the info log directory - * @return the reference to the current option. - */ + @Override public Options setDbLogDir(String dbLogDir) { assert(isInitialized()); setDbLogDir(nativeHandle_, dbLogDir); return this; } - private native void setDbLogDir(long handle, String dbLogDir); - /** - * Returns the path to the write-ahead-logs (WAL) directory. - * - * If it is empty, the log files will be in the same dir as data, - * dbname is used as the data dir by default - * If it is non empty, the log files will be in kept the specified dir. - * When destroying the db, - * all log files in wal_dir and the dir itself is deleted - * - * @return the path to the write-ahead-logs (WAL) directory. - */ + @Override public String walDir() { assert(isInitialized()); return walDir(nativeHandle_); } - private native String walDir(long handle); - /** - * This specifies the absolute dir path for write-ahead logs (WAL). - * If it is empty, the log files will be in the same dir as data, - * dbname is used as the data dir by default - * If it is non empty, the log files will be in kept the specified dir. - * When destroying the db, - * all log files in wal_dir and the dir itself is deleted - * - * @param walDir the path to the write-ahead-log directory. - * @return the reference to the current option. - */ + @Override public Options setWalDir(String walDir) { assert(isInitialized()); setWalDir(nativeHandle_, walDir); return this; } - private native void setWalDir(long handle, String walDir); - /** - * The periodicity when obsolete files get deleted. The default - * value is 6 hours. The files that get out of scope by compaction - * process will still get automatically delete on every compaction, - * regardless of this setting - * - * @return the time interval in micros when obsolete files will be deleted. 
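The durability and log-placement options above chain the same way; a short sketch, assuming import org.rocksdb.Options, with hypothetical directory paths:

    Options options = new Options();
    options.setErrorIfExists(false)
           .setParanoidChecks(true)                   // default: true, per the javadoc above
           .setUseFsync(false)                        // fdatasync unless the filesystem needs fsync
           .setDbLogDir("/var/log/rocksdb-example")   // hypothetical directory for info LOG files
           .setWalDir("/mnt/wal/rocksdb-example");    // hypothetical dedicated WAL directory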
- */ + @Override public long deleteObsoleteFilesPeriodMicros() { assert(isInitialized()); return deleteObsoleteFilesPeriodMicros(nativeHandle_); } - private native long deleteObsoleteFilesPeriodMicros(long handle); - /** - * The periodicity when obsolete files get deleted. The default - * value is 6 hours. The files that get out of scope by compaction - * process will still get automatically delete on every compaction, - * regardless of this setting - * - * @param micros the time interval in micros - * @return the reference to the current option. - */ + @Override public Options setDeleteObsoleteFilesPeriodMicros(long micros) { assert(isInitialized()); setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); return this; } - private native void setDeleteObsoleteFilesPeriodMicros( - long handle, long micros); - /** - * Returns the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * When increasing this number, we may also want to consider increasing - * number of threads in LOW priority thread pool. - * Default: 1 - * - * @return the maximum number of concurrent background compaction jobs. - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) - */ + @Override public int maxBackgroundCompactions() { assert(isInitialized()); return maxBackgroundCompactions(nativeHandle_); } - /** - * Creates statistics object which collects metrics about database operations. - Statistics objects should not be shared between DB instances as - it does not use any locks to prevent concurrent updates. - * - * @return the instance of the current Options. - * @see org.rocksdb.RocksDB#open(Options, String) - */ + @Override public Options createStatistics() { assert(isInitialized()); createStatistics(nativeHandle_); return this; } - /** - * Returns statistics object. Calls {@link #createStatistics()} if - * C++ returns {@code nullptr} for statistics. - * - * @return the instance of the statistics object. - * @see #createStatistics() - */ + @Override public Statistics statisticsPtr() { assert(isInitialized()); @@ -533,646 +322,263 @@ public class Options extends RocksObject { return new Statistics(statsPtr); } - /** - * Specifies the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * If you're increasing this, also consider increasing number of threads in - * LOW priority thread pool. For more information, see - * Default: 1 - * - * @param maxBackgroundCompactions the maximum number of background - * compaction jobs. - * @return the reference to the current option. - * - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) - * @see #maxBackgroundFlushes() - */ + @Override public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) { assert(isInitialized()); setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); return this; } - /** - * Returns the maximum number of concurrent background flush jobs. - * If you're increasing this, also consider increasing number of threads in - * HIGH priority thread pool. For more information, see - * Default: 1 - * - * @return the maximum number of concurrent background flush jobs. 
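createStatistics() and statisticsPtr() pair naturally with the background-job settings when profiling compaction behaviour. A sketch, assuming import org.rocksdb.*, a surrounding method that declares throws RocksDBException, and a hypothetical database path:

    Options options = new Options();
    options.createStatistics()                 // collect metrics about database operations
           .setMaxBackgroundCompactions(2)
           .setMaxBackgroundFlushes(1);
    Statistics stats = options.statisticsPtr();  // backed by the native statistics object
    RocksDB db = RocksDB.open(options, "/tmp/stats-example");
    db.close();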
- * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) - */ + @Override public int maxBackgroundFlushes() { assert(isInitialized()); return maxBackgroundFlushes(nativeHandle_); } - private native int maxBackgroundFlushes(long handle); - /** - * Specifies the maximum number of concurrent background flush jobs. - * If you're increasing this, also consider increasing number of threads in - * HIGH priority thread pool. For more information, see - * Default: 1 - * - * @param maxBackgroundFlushes number of max concurrent flush jobs - * @return the reference to the current option. - * - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int) - * @see org.rocksdb.RocksEnv#setBackgroundThreads(int, int) - * @see #maxBackgroundCompactions() - */ + @Override public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) { assert(isInitialized()); setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); return this; } - private native void setMaxBackgroundFlushes( - long handle, int maxBackgroundFlushes); - /** - * Returns the maximum size of a info log file. If the current log file - * is larger than this size, a new info log file will be created. - * If 0, all logs will be written to one log file. - * - * @return the maximum size of the info log file. - */ + @Override public long maxLogFileSize() { assert(isInitialized()); return maxLogFileSize(nativeHandle_); } - private native long maxLogFileSize(long handle); - /** - * Specifies the maximum size of a info log file. If the current log file - * is larger than `max_log_file_size`, a new info log file will - * be created. - * If 0, all logs will be written to one log file. - * - * @param maxLogFileSize the maximum size of a info log file. - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setMaxLogFileSize(long maxLogFileSize) throws RocksDBException { assert(isInitialized()); setMaxLogFileSize(nativeHandle_, maxLogFileSize); return this; } - private native void setMaxLogFileSize(long handle, long maxLogFileSize) - throws RocksDBException; - /** - * Returns the time interval for the info log file to roll (in seconds). - * If specified with non-zero value, log file will be rolled - * if it has been active longer than `log_file_time_to_roll`. - * Default: 0 (disabled) - * - * @return the time interval in seconds. - */ + @Override public long logFileTimeToRoll() { assert(isInitialized()); return logFileTimeToRoll(nativeHandle_); } - private native long logFileTimeToRoll(long handle); - /** - * Specifies the time interval for the info log file to roll (in seconds). - * If specified with non-zero value, log file will be rolled - * if it has been active longer than `log_file_time_to_roll`. - * Default: 0 (disabled) - * - * @param logFileTimeToRoll the time interval in seconds. - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setLogFileTimeToRoll(long logFileTimeToRoll) throws RocksDBException{ assert(isInitialized()); setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); return this; } - private native void setLogFileTimeToRoll( - long handle, long logFileTimeToRoll) throws RocksDBException; - /** - * Returns the maximum number of info log files to be kept. - * Default: 1000 - * - * @return the maximum number of info log files to be kept. 
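Note that the info-LOG setters above declare throws RocksDBException, so callers have to handle or propagate it. A sketch with illustrative values:

    Options options = new Options();
    options.setMaxLogFileSize(64L * 1024 * 1024)  // roll the info LOG at ~64 MB
           .setLogFileTimeToRoll(24L * 60 * 60)   // ...or once a file has been active for a day
           .setKeepLogFileNum(10);                // keep only the 10 most recent info LOG files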
- */ + @Override public long keepLogFileNum() { assert(isInitialized()); return keepLogFileNum(nativeHandle_); } - private native long keepLogFileNum(long handle); - /** - * Specifies the maximum number of info log files to be kept. - * Default: 1000 - * - * @param keepLogFileNum the maximum number of info log files to be kept. - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setKeepLogFileNum(long keepLogFileNum) throws RocksDBException{ assert(isInitialized()); setKeepLogFileNum(nativeHandle_, keepLogFileNum); return this; } - private native void setKeepLogFileNum(long handle, long keepLogFileNum) - throws RocksDBException; - /** - * Manifest file is rolled over on reaching this limit. - * The older manifest file be deleted. - * The default value is MAX_INT so that roll-over does not take place. - * - * @return the size limit of a manifest file. - */ + @Override public long maxManifestFileSize() { assert(isInitialized()); return maxManifestFileSize(nativeHandle_); } - private native long maxManifestFileSize(long handle); - /** - * Manifest file is rolled over on reaching this limit. - * The older manifest file be deleted. - * The default value is MAX_INT so that roll-over does not take place. - * - * @param maxManifestFileSize the size limit of a manifest file. - * @return the reference to the current option. - */ + @Override public Options setMaxManifestFileSize(long maxManifestFileSize) { assert(isInitialized()); setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); return this; } - private native void setMaxManifestFileSize( - long handle, long maxManifestFileSize); - /** - * Number of shards used for table cache. - * - * @return the number of shards used for table cache. - */ + @Override public int tableCacheNumshardbits() { assert(isInitialized()); return tableCacheNumshardbits(nativeHandle_); } - private native int tableCacheNumshardbits(long handle); - /** - * Number of shards used for table cache. - * - * @param tableCacheNumshardbits the number of chards - * @return the reference to the current option. - */ + @Override public Options setTableCacheNumshardbits(int tableCacheNumshardbits) { assert(isInitialized()); setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); return this; } - private native void setTableCacheNumshardbits( - long handle, int tableCacheNumshardbits); - /** - * During data eviction of table's LRU cache, it would be inefficient - * to strictly follow LRU because this piece of memory will not really - * be released unless its refcount falls to zero. Instead, make two - * passes: the first pass will release items with refcount = 1, - * and if not enough space releases after scanning the number of - * elements specified by this parameter, we will remove items in LRU - * order. - * - * @return scan count limit - */ + @Override public int tableCacheRemoveScanCountLimit() { assert(isInitialized()); return tableCacheRemoveScanCountLimit(nativeHandle_); } - private native int tableCacheRemoveScanCountLimit(long handle); - /** - * During data eviction of table's LRU cache, it would be inefficient - * to strictly follow LRU because this piece of memory will not really - * be released unless its refcount falls to zero. Instead, make two - * passes: the first pass will release items with refcount = 1, - * and if not enough space releases after scanning the number of - * elements specified by this parameter, we will remove items in LRU - * order. 
- * - * @param limit scan count limit - * @return the reference to the current option. - */ + @Override public Options setTableCacheRemoveScanCountLimit(int limit) { assert(isInitialized()); setTableCacheRemoveScanCountLimit(nativeHandle_, limit); return this; } - private native void setTableCacheRemoveScanCountLimit( - long handle, int limit); - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - *
              - *
- * 1. If both set to 0, logs will be deleted asap and will not get into
- *    the archive.
- * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- *    WAL files will be checked every 10 min and if total size is greater
- *    than WAL_size_limit_MB, they will be deleted starting with the
- *    earliest until size_limit is met. All empty files will be deleted.
- * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
- *    are older than WAL_ttl_seconds will be deleted.
- * 4. If both are not 0, WAL files will be checked every 10 min and both
- *    checks will be performed with ttl being first.
            - * - * @return the wal-ttl seconds - * @see #walSizeLimitMB() - */ + @Override public long walTtlSeconds() { assert(isInitialized()); return walTtlSeconds(nativeHandle_); } - private native long walTtlSeconds(long handle); - /** - * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs - * will be deleted. - *
              - *
- * 1. If both set to 0, logs will be deleted asap and will not get into
- *    the archive.
- * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- *    WAL files will be checked every 10 min and if total size is greater
- *    than WAL_size_limit_MB, they will be deleted starting with the
- *    earliest until size_limit is met. All empty files will be deleted.
- * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
- *    are older than WAL_ttl_seconds will be deleted.
- * 4. If both are not 0, WAL files will be checked every 10 min and both
- *    checks will be performed with ttl being first.
- * - * @param walTtlSeconds the ttl seconds - * @return the reference to the current option. - * @see #setWalSizeLimitMB(long) - */ + @Override public Options setWalTtlSeconds(long walTtlSeconds) { assert(isInitialized()); setWalTtlSeconds(nativeHandle_, walTtlSeconds); return this; } - private native void setWalTtlSeconds(long handle, long walTtlSeconds); - /** - * {@link #walTtlSeconds()} and {@code #walSizeLimitMB()} affect how archived logs - * will be deleted. - *
                - *
- * 1. If both set to 0, logs will be deleted asap and will not get into
- *    the archive.
- * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- *    WAL files will be checked every 10 min and if total size is greater
- *    than WAL_size_limit_MB, they will be deleted starting with the
- *    earliest until size_limit is met. All empty files will be deleted.
- * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
- *    are older than WAL_ttl_seconds will be deleted.
- * 4. If both are not 0, WAL files will be checked every 10 min and both
- *    checks will be performed with ttl being first.
              - * @return size limit in mega-bytes. - * @see #walSizeLimitMB() - */ + @Override public long walSizeLimitMB() { assert(isInitialized()); return walSizeLimitMB(nativeHandle_); } - private native long walSizeLimitMB(long handle); - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - *
                - *
- * 1. If both set to 0, logs will be deleted asap and will not get into
- *    the archive.
- * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- *    WAL files will be checked every 10 min and if total size is greater
- *    than WAL_size_limit_MB, they will be deleted starting with the
- *    earliest until size_limit is met. All empty files will be deleted.
- * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
- *    are older than WAL_ttl_seconds will be deleted.
- * 4. If both are not 0, WAL files will be checked every 10 min and both
- *    checks will be performed with ttl being first.
              8. - * - * @param sizeLimitMB size limit in mega-bytes. - * @return the reference to the current option. - * @see #setWalSizeLimitMB(long) - */ + @Override public Options setWalSizeLimitMB(long sizeLimitMB) { assert(isInitialized()); setWalSizeLimitMB(nativeHandle_, sizeLimitMB); return this; } - private native void setWalSizeLimitMB(long handle, long sizeLimitMB); - /** - * Number of bytes to preallocate (via fallocate) the manifest - * files. Default is 4mb, which is reasonable to reduce random IO - * as well as prevent overallocation for mounts that preallocate - * large amounts of data (such as xfs's allocsize option). - * - * @return size in bytes. - */ + @Override public long manifestPreallocationSize() { assert(isInitialized()); return manifestPreallocationSize(nativeHandle_); } - private native long manifestPreallocationSize(long handle); - /** - * Number of bytes to preallocate (via fallocate) the manifest - * files. Default is 4mb, which is reasonable to reduce random IO - * as well as prevent overallocation for mounts that preallocate - * large amounts of data (such as xfs's allocsize option). - * - * @param size the size in byte - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setManifestPreallocationSize(long size) throws RocksDBException { assert(isInitialized()); setManifestPreallocationSize(nativeHandle_, size); return this; } - private native void setManifestPreallocationSize( - long handle, long size) throws RocksDBException; - /** - * Data being read from file storage may be buffered in the OS - * Default: true - * - * @return if true, then OS buffering is allowed. - */ + @Override public boolean allowOsBuffer() { assert(isInitialized()); return allowOsBuffer(nativeHandle_); } - private native boolean allowOsBuffer(long handle); - /** - * Data being read from file storage may be buffered in the OS - * Default: true - * - * @param allowOsBuffer if true, then OS buffering is allowed. - * @return the reference to the current option. - */ + @Override public Options setAllowOsBuffer(boolean allowOsBuffer) { assert(isInitialized()); setAllowOsBuffer(nativeHandle_, allowOsBuffer); return this; } - private native void setAllowOsBuffer( - long handle, boolean allowOsBuffer); - /** - * Allow the OS to mmap file for reading sst tables. - * Default: false - * - * @return true if mmap reads are allowed. - */ + @Override public boolean allowMmapReads() { assert(isInitialized()); return allowMmapReads(nativeHandle_); } - private native boolean allowMmapReads(long handle); - /** - * Allow the OS to mmap file for reading sst tables. - * Default: false - * - * @param allowMmapReads true if mmap reads are allowed. - * @return the reference to the current option. - */ + @Override public Options setAllowMmapReads(boolean allowMmapReads) { assert(isInitialized()); setAllowMmapReads(nativeHandle_, allowMmapReads); return this; } - private native void setAllowMmapReads( - long handle, boolean allowMmapReads); - /** - * Allow the OS to mmap file for writing. Default: false - * - * @return true if mmap writes are allowed. - */ + @Override public boolean allowMmapWrites() { assert(isInitialized()); return allowMmapWrites(nativeHandle_); } - private native boolean allowMmapWrites(long handle); - /** - * Allow the OS to mmap file for writing. Default: false - * - * @param allowMmapWrites true if mmap writes are allowd. - * @return the reference to the current option. 
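Putting the WAL-archive rules above into practice, a sketch with illustrative values (assuming import org.rocksdb.Options):

    Options options = new Options();
    options.setWalTtlSeconds(60L * 60)   // archived WAL files older than one hour become deletable
           .setWalSizeLimitMB(1024)      // ...and the archive is also trimmed beyond ~1 GB
           .setAllowMmapReads(true)      // mmap SST files for reads (default: false)
           .setAllowOsBuffer(true);      // default: true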
- */ + @Override public Options setAllowMmapWrites(boolean allowMmapWrites) { assert(isInitialized()); setAllowMmapWrites(nativeHandle_, allowMmapWrites); return this; } - private native void setAllowMmapWrites( - long handle, boolean allowMmapWrites); - /** - * Disable child process inherit open files. Default: true - * - * @return true if child process inheriting open files is disabled. - */ + @Override public boolean isFdCloseOnExec() { assert(isInitialized()); return isFdCloseOnExec(nativeHandle_); } - private native boolean isFdCloseOnExec(long handle); - /** - * Disable child process inherit open files. Default: true - * - * @param isFdCloseOnExec true if child process inheriting open - * files is disabled. - * @return the reference to the current option. - */ + @Override public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) { assert(isInitialized()); setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); return this; } - private native void setIsFdCloseOnExec( - long handle, boolean isFdCloseOnExec); - /** - * Skip log corruption error on recovery (If client is ok with - * losing most recent changes) - * Default: false - * - * @return true if log corruption errors are skipped during recovery. - */ + @Override public boolean skipLogErrorOnRecovery() { assert(isInitialized()); return skipLogErrorOnRecovery(nativeHandle_); } - private native boolean skipLogErrorOnRecovery(long handle); - /** - * Skip log corruption error on recovery (If client is ok with - * losing most recent changes) - * Default: false - * - * @param skip true if log corruption errors are skipped during recovery. - * @return the reference to the current option. - */ + @Override public Options setSkipLogErrorOnRecovery(boolean skip) { assert(isInitialized()); setSkipLogErrorOnRecovery(nativeHandle_, skip); return this; } - private native void setSkipLogErrorOnRecovery( - long handle, boolean skip); - /** - * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 3600 (1 hour) - * - * @return time interval in seconds. - */ + @Override public int statsDumpPeriodSec() { assert(isInitialized()); return statsDumpPeriodSec(nativeHandle_); } - private native int statsDumpPeriodSec(long handle); - /** - * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 3600 (1 hour) - * - * @param statsDumpPeriodSec time interval in seconds. - * @return the reference to the current option. - */ + @Override public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) { assert(isInitialized()); setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); return this; } - private native void setStatsDumpPeriodSec( - long handle, int statsDumpPeriodSec); - /** - * If set true, will hint the underlying file system that the file - * access pattern is random, when a sst file is opened. - * Default: true - * - * @return true if hinting random access is on. - */ + @Override public boolean adviseRandomOnOpen() { return adviseRandomOnOpen(nativeHandle_); } - private native boolean adviseRandomOnOpen(long handle); - /** - * If set true, will hint the underlying file system that the file - * access pattern is random, when a sst file is opened. - * Default: true - * - * @param adviseRandomOnOpen true if hinting random access is on. - * @return the reference to the current option. 
- */ + @Override public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { assert(isInitialized()); setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); return this; } - private native void setAdviseRandomOnOpen( - long handle, boolean adviseRandomOnOpen); - /** - * Use adaptive mutex, which spins in the user space before resorting - * to kernel. This could reduce context switch when the mutex is not - * heavily contended. However, if the mutex is hot, we could end up - * wasting spin time. - * Default: false - * - * @return true if adaptive mutex is used. - */ + @Override public boolean useAdaptiveMutex() { assert(isInitialized()); return useAdaptiveMutex(nativeHandle_); } - private native boolean useAdaptiveMutex(long handle); - /** - * Use adaptive mutex, which spins in the user space before resorting - * to kernel. This could reduce context switch when the mutex is not - * heavily contended. However, if the mutex is hot, we could end up - * wasting spin time. - * Default: false - * - * @param useAdaptiveMutex true if adaptive mutex is used. - * @return the reference to the current option. - */ + @Override public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) { assert(isInitialized()); setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); return this; } - private native void setUseAdaptiveMutex( - long handle, boolean useAdaptiveMutex); - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. - * Default: 0 - * - * @return size in bytes - */ + @Override public long bytesPerSync() { return bytesPerSync(nativeHandle_); } - private native long bytesPerSync(long handle); - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. - * Default: 0 - * - * @param bytesPerSync size in bytes - * @return the reference to the current option. - */ + @Override public Options setBytesPerSync(long bytesPerSync) { assert(isInitialized()); setBytesPerSync(nativeHandle_, bytesPerSync); return this; } - private native void setBytesPerSync( - long handle, long bytesPerSync); - /** - * Set the config for mem-table. - * - * @param config the mem-table config. - * @return the instance of the current Options. - * @throws RocksDBException - */ + @Override public Options setMemTableConfig(MemTableConfig config) throws RocksDBException { memTableConfig_ = config; @@ -1180,1119 +586,392 @@ public class Options extends RocksObject { return this; } - /** - * Use to control write rate of flush and compaction. Flush has higher - * priority than compaction. Rate limiting is disabled if nullptr. - * Default: nullptr - * - * @param config rate limiter config. - * @return the instance of the current Options. - * @throws RocksDBException - */ + @Override public Options setRateLimiterConfig(RateLimiterConfig config) { rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; } - /** - * Returns the name of the current mem table representation. - * Memtable format can be set using setTableFormatConfig. - * - * @return the name of the currently-used memtable factory. - * @see #setTableFormatConfig(TableFormatConfig) - */ + @Override public String memTableFactoryName() { assert(isInitialized()); return memTableFactoryName(nativeHandle_); } - /** - * Set the config for table format. 
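A sketch combining the I/O hint options discussed above (values are illustrative, assuming import org.rocksdb.Options):

    Options options = new Options();
    options.setAdviseRandomOnOpen(true)     // default: true
           .setUseAdaptiveMutex(false)      // default: false
           .setBytesPerSync(1024L * 1024);  // ask the OS to sync roughly every 1 MB written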
- * - * @param config the table format config. - * @return the reference of the current Options. - */ + @Override public Options setTableFormatConfig(TableFormatConfig config) { tableFormatConfig_ = config; setTableFactory(nativeHandle_, config.newTableFactoryHandle()); return this; } - /** - * @return the name of the currently used table factory. - */ + @Override public String tableFactoryName() { assert(isInitialized()); return tableFactoryName(nativeHandle_); } - /** - * This prefix-extractor uses the first n bytes of a key as its prefix. - * - * In some hash-based memtable representation such as HashLinkedList - * and HashSkipList, prefixes are used to partition the keys into - * several buckets. Prefix extractor is used to specify how to - * extract the prefix given a key. - * - * @param n use the first n bytes of a key as its prefix. - */ + @Override public Options useFixedLengthPrefixExtractor(int n) { assert(isInitialized()); useFixedLengthPrefixExtractor(nativeHandle_, n); return this; } -/////////////////////////////////////////////////////////////////////// - /** - * Number of keys between restart points for delta encoding of keys. - * This parameter can be changed dynamically. Most clients should - * leave this parameter alone. - * Default: 16 - * - * @return the number of keys between restart points. - */ - public int blockRestartInterval() { - return blockRestartInterval(nativeHandle_); - } - private native int blockRestartInterval(long handle); - - /** - * Number of keys between restart points for delta encoding of keys. - * This parameter can be changed dynamically. Most clients should - * leave this parameter alone. - * Default: 16 - * - * @param blockRestartInterval the number of keys between restart points. - * @return the reference to the current option. - */ - public Options setBlockRestartInterval(int blockRestartInterval) { - setBlockRestartInterval(nativeHandle_, blockRestartInterval); - return this; - } - private native void setBlockRestartInterval( - long handle, int blockRestartInterval); - - /** - * Compress blocks using the specified compression algorithm. This - parameter can be changed dynamically. - * - * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * - * @return Compression type. - */ + @Override public CompressionType compressionType() { return CompressionType.values()[compressionType(nativeHandle_)]; } - private native byte compressionType(long handle); - /** - * Compress blocks using the specified compression algorithm. This - parameter can be changed dynamically. - * - * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * - * @param compressionType Compression Type. - * @return the reference to the current option. - */ + @Override public Options setCompressionType(CompressionType compressionType) { setCompressionType(nativeHandle_, compressionType.getValue()); return this; } - private native void setCompressionType(long handle, byte compressionType); - /** - * Compaction style for DB. - * - * @return Compaction style. - */ + @Override public CompactionStyle compactionStyle() { return CompactionStyle.values()[compactionStyle(nativeHandle_)]; } - private native byte compactionStyle(long handle); - /** - * Set compaction style for DB. - * - * Default: LEVEL. - * - * @param compactionStyle Compaction style. - * @return the reference to the current option. 
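The javadoc above names SNAPPY_COMPRESSION and LEVEL as the defaults, so the corresponding enum constants are assumed to exist in this sketch:

    Options options = new Options();
    options.setCompressionType(CompressionType.SNAPPY_COMPRESSION)
           .setCompactionStyle(CompactionStyle.LEVEL)
           .setNumLevels(7);  // illustrative level count for level-style compaction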
- */ + @Override public Options setCompactionStyle(CompactionStyle compactionStyle) { setCompactionStyle(nativeHandle_, compactionStyle.getValue()); return this; } - private native void setCompactionStyle(long handle, byte compactionStyle); - /** - * If level-styled compaction is used, then this number determines - * the total number of levels. - * - * @return the number of levels. - */ + @Override public int numLevels() { return numLevels(nativeHandle_); } - private native int numLevels(long handle); - /** - * Set the number of levels for this database - * If level-styled compaction is used, then this number determines - * the total number of levels. - * - * @param numLevels the number of levels. - * @return the reference to the current option. - */ + @Override public Options setNumLevels(int numLevels) { setNumLevels(nativeHandle_, numLevels); return this; } - private native void setNumLevels( - long handle, int numLevels); - /** - * The number of files in level 0 to trigger compaction from level-0 to - * level-1. A value < 0 means that level-0 compaction will not be - * triggered by number of files at all. - * Default: 4 - * - * @return the number of files in level 0 to trigger compaction. - */ + @Override public int levelZeroFileNumCompactionTrigger() { return levelZeroFileNumCompactionTrigger(nativeHandle_); } - private native int levelZeroFileNumCompactionTrigger(long handle); - /** - * Number of files to trigger level-0 compaction. A value <0 means that - * level-0 compaction will not be triggered by number of files at all. - * Default: 4 - * - * @param numFiles the number of files in level-0 to trigger compaction. - * @return the reference to the current option. - */ + @Override public Options setLevelZeroFileNumCompactionTrigger( int numFiles) { setLevelZeroFileNumCompactionTrigger( nativeHandle_, numFiles); return this; } - private native void setLevelZeroFileNumCompactionTrigger( - long handle, int numFiles); - /** - * Soft limit on the number of level-0 files. We start slowing down writes - * at this point. A value < 0 means that no writing slow down will be - * triggered by number of files in level-0. - * - * @return the soft limit on the number of level-0 files. - */ + @Override public int levelZeroSlowdownWritesTrigger() { return levelZeroSlowdownWritesTrigger(nativeHandle_); } - private native int levelZeroSlowdownWritesTrigger(long handle); - /** - * Soft limit on number of level-0 files. We start slowing down writes at this - * point. A value <0 means that no writing slow down will be triggered by - * number of files in level-0. - * - * @param numFiles soft limit on number of level-0 files. - * @return the reference to the current option. - */ + @Override public Options setLevelZeroSlowdownWritesTrigger( int numFiles) { setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); return this; } - private native void setLevelZeroSlowdownWritesTrigger( - long handle, int numFiles); - /** - * Maximum number of level-0 files. We stop writes at this point. - * - * @return the hard limit of the number of level-0 file. - */ + @Override public int levelZeroStopWritesTrigger() { return levelZeroStopWritesTrigger(nativeHandle_); } - private native int levelZeroStopWritesTrigger(long handle); - /** - * Maximum number of level-0 files. We stop writes at this point. - * - * @param numFiles the hard limit of the number of level-0 files. - * @return the reference to the current option. 
- */ + @Override public Options setLevelZeroStopWritesTrigger(int numFiles) { setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); return this; } - private native void setLevelZeroStopWritesTrigger( - long handle, int numFiles); - /** - * The highest level to which a new compacted memtable is pushed if it - * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some - * expensive manifest file operations. We do not push all the way to - * the largest level since that can generate a lot of wasted disk - * space if the same key space is being repeatedly overwritten. - * - * @return the highest level where a new compacted memtable will be pushed. - */ + @Override public int maxMemCompactionLevel() { return maxMemCompactionLevel(nativeHandle_); } - private native int maxMemCompactionLevel(long handle); - /** - * The highest level to which a new compacted memtable is pushed if it - * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some - * expensive manifest file operations. We do not push all the way to - * the largest level since that can generate a lot of wasted disk - * space if the same key space is being repeatedly overwritten. - * - * @param maxMemCompactionLevel the highest level to which a new compacted - * mem-table will be pushed. - * @return the reference to the current option. - */ + @Override public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) { setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); return this; } - private native void setMaxMemCompactionLevel( - long handle, int maxMemCompactionLevel); - /** - * The target file size for compaction. - * This targetFileSizeBase determines a level-1 file size. - * Target file size for level L can be calculated by - * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) - * For example, if targetFileSizeBase is 2MB and - * target_file_size_multiplier is 10, then each file on level-1 will - * be 2MB, and each file on level 2 will be 20MB, - * and each file on level-3 will be 200MB. - * by default targetFileSizeBase is 2MB. - * - * @return the target size of a level-0 file. - * - * @see #targetFileSizeMultiplier() - */ - public int targetFileSizeBase() { + @Override + public long targetFileSizeBase() { return targetFileSizeBase(nativeHandle_); } - private native int targetFileSizeBase(long handle); - /** - * The target file size for compaction. - * This targetFileSizeBase determines a level-1 file size. - * Target file size for level L can be calculated by - * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) - * For example, if targetFileSizeBase is 2MB and - * target_file_size_multiplier is 10, then each file on level-1 will - * be 2MB, and each file on level 2 will be 20MB, - * and each file on level-3 will be 200MB. - * by default targetFileSizeBase is 2MB. - * - * @param targetFileSizeBase the target size of a level-0 file. - * @return the reference to the current option. - * - * @see #setTargetFileSizeMultiplier(int) - */ - public Options setTargetFileSizeBase(int targetFileSizeBase) { + @Override + public Options setTargetFileSizeBase(long targetFileSizeBase) { setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); return this; } - private native void setTargetFileSizeBase( - long handle, int targetFileSizeBase); - /** - * targetFileSizeMultiplier defines the size ratio between a - * level-(L+1) file and level-L file. 
- * By default targetFileSizeMultiplier is 1, meaning - * files in different levels have the same target. - * - * @return the size ratio between a level-(L+1) file and level-L file. - */ + @Override public int targetFileSizeMultiplier() { return targetFileSizeMultiplier(nativeHandle_); } - private native int targetFileSizeMultiplier(long handle); - /** - * targetFileSizeMultiplier defines the size ratio between a - * level-L file and level-(L+1) file. - * By default target_file_size_multiplier is 1, meaning - * files in different levels have the same target. - * - * @param multiplier the size ratio between a level-(L+1) file - * and level-L file. - * @return the reference to the current option. - */ + @Override public Options setTargetFileSizeMultiplier(int multiplier) { setTargetFileSizeMultiplier(nativeHandle_, multiplier); return this; } - private native void setTargetFileSizeMultiplier( - long handle, int multiplier); - /** - * The upper-bound of the total size of level-1 files in bytes. - * Maximum number of bytes for level L can be calculated as - * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) - * For example, if maxBytesForLevelBase is 20MB, and if - * max_bytes_for_level_multiplier is 10, total data size for level-1 - * will be 20MB, total file size for level-2 will be 200MB, - * and total file size for level-3 will be 2GB. - * by default 'maxBytesForLevelBase' is 10MB. - * - * @return the upper-bound of the total size of leve-1 files in bytes. - * @see #maxBytesForLevelMultiplier() - */ + @Override public long maxBytesForLevelBase() { return maxBytesForLevelBase(nativeHandle_); } - private native long maxBytesForLevelBase(long handle); - /** - * The upper-bound of the total size of level-1 files in bytes. - * Maximum number of bytes for level L can be calculated as - * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) - * For example, if maxBytesForLevelBase is 20MB, and if - * max_bytes_for_level_multiplier is 10, total data size for level-1 - * will be 20MB, total file size for level-2 will be 200MB, - * and total file size for level-3 will be 2GB. - * by default 'maxBytesForLevelBase' is 10MB. - * - * @return the reference to the current option. - * @see #setMaxBytesForLevelMultiplier(int) - */ + @Override public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) { setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); return this; } - private native void setMaxBytesForLevelBase( - long handle, long maxBytesForLevelBase); - /** - * The ratio between the total size of level-(L+1) files and the total - * size of level-L files for all L. - * DEFAULT: 10 - * - * @return the ratio between the total size of level-(L+1) files and - * the total size of level-L files for all L. - * @see #maxBytesForLevelBase() - */ + @Override public int maxBytesForLevelMultiplier() { return maxBytesForLevelMultiplier(nativeHandle_); } - private native int maxBytesForLevelMultiplier(long handle); - /** - * The ratio between the total size of level-(L+1) files and the total - * size of level-L files for all L. - * DEFAULT: 10 - * - * @param multiplier the ratio between the total size of level-(L+1) - * files and the total size of level-L files for all L. - * @return the reference to the current option. 
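The sizing arithmetic spelled out above (a 2 MB targetFileSizeBase with a 10x multiplier gives ~2 MB level-1, ~20 MB level-2 and ~200 MB level-3 files) can be reproduced directly; note that setTargetFileSizeBase takes a long after this patch:

    Options options = new Options();
    options.setLevelZeroFileNumCompactionTrigger(4)    // default quoted above
           .setTargetFileSizeBase(2L * 1024 * 1024)    // ~2 MB level-1 files
           .setTargetFileSizeMultiplier(10)            // ~20 MB level-2, ~200 MB level-3 files
           .setMaxBytesForLevelBase(10L * 1024 * 1024) // ~10 MB of level-1 data in total
           .setMaxBytesForLevelMultiplier(10);         // ~100 MB level-2, ~1 GB level-3 in total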
- * @see #setMaxBytesForLevelBase(long) - */ + @Override public Options setMaxBytesForLevelMultiplier(int multiplier) { setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); return this; } - private native void setMaxBytesForLevelMultiplier( - long handle, int multiplier); - /** - * Maximum number of bytes in all compacted files. We avoid expanding - * the lower level file set of a compaction if it would make the - * total compaction cover more than - * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. - * - * @return the maximum number of bytes in all compacted files. - * @see #sourceCompactionFactor() - */ + @Override public int expandedCompactionFactor() { return expandedCompactionFactor(nativeHandle_); } - private native int expandedCompactionFactor(long handle); - /** - * Maximum number of bytes in all compacted files. We avoid expanding - * the lower level file set of a compaction if it would make the - * total compaction cover more than - * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. - * - * @param expandedCompactionFactor the maximum number of bytes in all - * compacted files. - * @return the reference to the current option. - * @see #setSourceCompactionFactor(int) - */ + @Override public Options setExpandedCompactionFactor(int expandedCompactionFactor) { setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); return this; } - private native void setExpandedCompactionFactor( - long handle, int expandedCompactionFactor); - /** - * Maximum number of bytes in all source files to be compacted in a - * single compaction run. We avoid picking too many files in the - * source level so that we do not exceed the total source bytes - * for compaction to exceed - * (source_compaction_factor * targetFileSizeLevel()) many bytes. - * Default:1, i.e. pick maxfilesize amount of data as the source of - * a compaction. - * - * @return the maximum number of bytes in all source files to be compactedo. - * @see #expandedCompactionFactor() - */ + @Override public int sourceCompactionFactor() { return sourceCompactionFactor(nativeHandle_); } - private native int sourceCompactionFactor(long handle); - /** - * Maximum number of bytes in all source files to be compacted in a - * single compaction run. We avoid picking too many files in the - * source level so that we do not exceed the total source bytes - * for compaction to exceed - * (source_compaction_factor * targetFileSizeLevel()) many bytes. - * Default:1, i.e. pick maxfilesize amount of data as the source of - * a compaction. - * - * @param sourceCompactionFactor the maximum number of bytes in all - * source files to be compacted in a single compaction run. - * @return the reference to the current option. - * @see #setExpandedCompactionFactor(int) - */ + @Override public Options setSourceCompactionFactor(int sourceCompactionFactor) { setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); return this; } - private native void setSourceCompactionFactor( - long handle, int sourceCompactionFactor); - /** - * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. - * - * @return maximum bytes of overlaps in "grandparent" level. 
- */ + @Override public int maxGrandparentOverlapFactor() { return maxGrandparentOverlapFactor(nativeHandle_); } - private native int maxGrandparentOverlapFactor(long handle); - /** - * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. - * - * @param maxGrandparentOverlapFactor maximum bytes of overlaps in - * "grandparent" level. - * @return the reference to the current option. - */ + @Override public Options setMaxGrandparentOverlapFactor( int maxGrandparentOverlapFactor) { setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); return this; } - private native void setMaxGrandparentOverlapFactor( - long handle, int maxGrandparentOverlapFactor); - /** - * Puts are delayed 0-1 ms when any level has a compaction score that exceeds - * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not - * hold, RocksDB will set soft_rate_limit = hard_rate_limit - * Default: 0 (disabled) - * - * @return soft-rate-limit for put delay. - */ + @Override public double softRateLimit() { return softRateLimit(nativeHandle_); } - private native double softRateLimit(long handle); - /** - * Puts are delayed 0-1 ms when any level has a compaction score that exceeds - * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not - * hold, RocksDB will set soft_rate_limit = hard_rate_limit - * Default: 0 (disabled) - * - * @param softRateLimit the soft-rate-limit of a compaction score - * for put delay. - * @return the reference to the current option. - */ + @Override public Options setSoftRateLimit(double softRateLimit) { setSoftRateLimit(nativeHandle_, softRateLimit); return this; } - private native void setSoftRateLimit( - long handle, double softRateLimit); - /** - * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. - * Default: 0 (disabled) - * - * @return the hard-rate-limit of a compaction score for put delay. - */ + @Override public double hardRateLimit() { return hardRateLimit(nativeHandle_); } - private native double hardRateLimit(long handle); - /** - * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. - * Default: 0 (disabled) - * - * @param hardRateLimit the hard-rate-limit of a compaction score for put - * delay. - * @return the reference to the current option. - */ + @Override public Options setHardRateLimit(double hardRateLimit) { setHardRateLimit(nativeHandle_, hardRateLimit); return this; } - private native void setHardRateLimit( - long handle, double hardRateLimit); - /** - * The maximum time interval a put will be stalled when hard_rate_limit - * is enforced. If 0, then there is no limit. - * Default: 1000 - * - * @return the maximum time interval a put will be stalled when - * hard_rate_limit is enforced. - */ + @Override public int rateLimitDelayMaxMilliseconds() { return rateLimitDelayMaxMilliseconds(nativeHandle_); } - private native int rateLimitDelayMaxMilliseconds(long handle); - /** - * The maximum time interval a put will be stalled when hard_rate_limit - * is enforced. If 0, then there is no limit. - * Default: 1000 - * - * @param rateLimitDelayMaxMilliseconds the maximum time interval a put - * will be stalled. - * @return the reference to the current option. 
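A sketch of the write-stall knobs above, keeping the documented constraint softRateLimit <= hardRateLimit (values are illustrative):

    Options options = new Options();
    options.setHardRateLimit(2.0)                   // ignored when <= 1.0, per the javadoc above
           .setSoftRateLimit(1.5)                   // must stay <= hardRateLimit
           .setRateLimitDelayMaxMilliseconds(1000)  // default quoted above
           .setDisableAutoCompactions(false);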
- */ + @Override public Options setRateLimitDelayMaxMilliseconds( int rateLimitDelayMaxMilliseconds) { setRateLimitDelayMaxMilliseconds( nativeHandle_, rateLimitDelayMaxMilliseconds); return this; } - private native void setRateLimitDelayMaxMilliseconds( - long handle, int rateLimitDelayMaxMilliseconds); - /** - * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of - * writer_buffer_size). - * - * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and - * (2) be the multiple of the CPU word (which helps with the memory - * alignment). - * - * We'll automatically check and adjust the size number to make sure it - * conforms to the restrictions. - * Default: 0 - * - * @return the size of an arena block - */ + @Override public long arenaBlockSize() { return arenaBlockSize(nativeHandle_); } - private native long arenaBlockSize(long handle); - /** - * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of - * writer_buffer_size). - * - * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and - * (2) be the multiple of the CPU word (which helps with the memory - * alignment). - * - * We'll automatically check and adjust the size number to make sure it - * conforms to the restrictions. - * Default: 0 - * - * @param arenaBlockSize the size of an arena block - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setArenaBlockSize(long arenaBlockSize) throws RocksDBException { setArenaBlockSize(nativeHandle_, arenaBlockSize); return this; } - private native void setArenaBlockSize( - long handle, long arenaBlockSize) throws RocksDBException; - /** - * Disable automatic compactions. Manual compactions can still - * be issued on this column family - * - * @return true if auto-compactions are disabled. - */ + @Override public boolean disableAutoCompactions() { return disableAutoCompactions(nativeHandle_); } - private native boolean disableAutoCompactions(long handle); - /** - * Disable automatic compactions. Manual compactions can still - * be issued on this column family - * - * @param disableAutoCompactions true if auto-compactions are disabled. - * @return the reference to the current option. - */ + @Override public Options setDisableAutoCompactions(boolean disableAutoCompactions) { setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); return this; } - private native void setDisableAutoCompactions( - long handle, boolean disableAutoCompactions); - /** - * Purge duplicate/deleted keys when a memtable is flushed to storage. - * Default: true - * - * @return true if purging keys is disabled. - */ + @Override public boolean purgeRedundantKvsWhileFlush() { return purgeRedundantKvsWhileFlush(nativeHandle_); } - private native boolean purgeRedundantKvsWhileFlush(long handle); - /** - * Purge duplicate/deleted keys when a memtable is flushed to storage. - * Default: true - * - * @param purgeRedundantKvsWhileFlush true if purging keys is disabled. - * @return the reference to the current option. 
- */ + @Override public Options setPurgeRedundantKvsWhileFlush( boolean purgeRedundantKvsWhileFlush) { setPurgeRedundantKvsWhileFlush( nativeHandle_, purgeRedundantKvsWhileFlush); return this; } - private native void setPurgeRedundantKvsWhileFlush( - long handle, boolean purgeRedundantKvsWhileFlush); - /** - * If true, compaction will verify checksum on every read that happens - * as part of compaction - * Default: true - * - * @return true if compaction verifies checksum on every read. - */ + @Override public boolean verifyChecksumsInCompaction() { return verifyChecksumsInCompaction(nativeHandle_); } - private native boolean verifyChecksumsInCompaction(long handle); - /** - * If true, compaction will verify checksum on every read that happens - * as part of compaction - * Default: true - * - * @param verifyChecksumsInCompaction true if compaction verifies - * checksum on every read. - * @return the reference to the current option. - */ + @Override public Options setVerifyChecksumsInCompaction( boolean verifyChecksumsInCompaction) { setVerifyChecksumsInCompaction( nativeHandle_, verifyChecksumsInCompaction); return this; } - private native void setVerifyChecksumsInCompaction( - long handle, boolean verifyChecksumsInCompaction); - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. - * Default: false - * - * @return true if filter-deletes behavior is on. - */ + @Override public boolean filterDeletes() { return filterDeletes(nativeHandle_); } - private native boolean filterDeletes(long handle); - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. - * Default: false - * - * @param filterDeletes true if filter-deletes behavior is on. - * @return the reference to the current option. - */ + @Override public Options setFilterDeletes(boolean filterDeletes) { setFilterDeletes(nativeHandle_, filterDeletes); return this; } - private native void setFilterDeletes( - long handle, boolean filterDeletes); - /** - * An iteration->Next() sequentially skips over keys with the same - * user-key unless this option is set. This number specifies the number - * of keys (with the same userkey) that will be sequentially - * skipped before a reseek is issued. - * Default: 8 - * - * @return the number of keys could be skipped in a iteration. - */ + @Override public long maxSequentialSkipInIterations() { return maxSequentialSkipInIterations(nativeHandle_); } - private native long maxSequentialSkipInIterations(long handle); - /** - * An iteration->Next() sequentially skips over keys with the same - * user-key unless this option is set. This number specifies the number - * of keys (with the same userkey) that will be sequentially - * skipped before a reseek is issued. - * Default: 8 - * - * @param maxSequentialSkipInIterations the number of keys could - * be skipped in a iteration. - * @return the reference to the current option. 
- */ + @Override public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); return this; } - private native void setMaxSequentialSkipInIterations( - long handle, long maxSequentialSkipInIterations); - /** - * Allows thread-safe inplace updates. - * If inplace_callback function is not set, - * Put(key, new_value) will update inplace the existing_value iff - * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) - * * existing_value for that key is a put i.e. kTypeValue - * If inplace_callback function is set, check doc for inplace_callback. - * Default: false. - * - * @return true if thread-safe inplace updates are allowed. - */ + @Override public boolean inplaceUpdateSupport() { return inplaceUpdateSupport(nativeHandle_); } - private native boolean inplaceUpdateSupport(long handle); - /** - * Allows thread-safe inplace updates. - * If inplace_callback function is not set, - * Put(key, new_value) will update inplace the existing_value iff - * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) - * * existing_value for that key is a put i.e. kTypeValue - * If inplace_callback function is set, check doc for inplace_callback. - * Default: false. - * - * @param inplaceUpdateSupport true if thread-safe inplace updates - * are allowed. - * @return the reference to the current option. - */ + @Override public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) { setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); return this; } - private native void setInplaceUpdateSupport( - long handle, boolean inplaceUpdateSupport); - /** - * Number of locks used for inplace update - * Default: 10000, if inplace_update_support = true, else 0. - * - * @return the number of locks used for inplace update. - */ + @Override public long inplaceUpdateNumLocks() { return inplaceUpdateNumLocks(nativeHandle_); } - private native long inplaceUpdateNumLocks(long handle); - /** - * Number of locks used for inplace update - * Default: 10000, if inplace_update_support = true, else 0. - * - * @param inplaceUpdateNumLocks the number of locks used for - * inplace updates. - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) throws RocksDBException { setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); return this; } - private native void setInplaceUpdateNumLocks( - long handle, long inplaceUpdateNumLocks) throws RocksDBException; - /** - * Returns the number of bits used in the prefix bloom filter. - * - * This value will be used only when a prefix-extractor is specified. - * - * @return the number of bloom-bits. - * @see #useFixedLengthPrefixExtractor(int) - */ + @Override public int memtablePrefixBloomBits() { return memtablePrefixBloomBits(nativeHandle_); } - private native int memtablePrefixBloomBits(long handle); - /** - * Sets the number of bits used in the prefix bloom filter. - * - * This value will be used only when a prefix-extractor is specified. - * - * @param memtablePrefixBloomBits the number of bits used in the - * prefix bloom filter. - * @return the reference to the current option. 
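The memtable prefix bloom settings above only take effect when a prefix extractor is configured; a sketch with an illustrative 8-byte fixed-length prefix:

    Options options = new Options();
    options.useFixedLengthPrefixExtractor(8)    // use the first 8 bytes of each key as its prefix
           .setMemtablePrefixBloomBits(100000)  // illustrative sizing
           .setMemtablePrefixBloomProbes(6)
           .setBloomLocality(0);                // 0 leaves the locality optimization off (default)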
- */ + @Override public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); return this; } - private native void setMemtablePrefixBloomBits( - long handle, int memtablePrefixBloomBits); - /** - * The number of hash probes per key used in the mem-table. - * - * @return the number of hash probes per key. - */ + @Override public int memtablePrefixBloomProbes() { return memtablePrefixBloomProbes(nativeHandle_); } - private native int memtablePrefixBloomProbes(long handle); - /** - * The number of hash probes per key used in the mem-table. - * - * @param memtablePrefixBloomProbes the number of hash probes per key. - * @return the reference to the current option. - */ + @Override public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); return this; } - private native void setMemtablePrefixBloomProbes( - long handle, int memtablePrefixBloomProbes); - /** - * Control locality of bloom filter probes to improve cache miss rate. - * This option only applies to memtable prefix bloom and plaintable - * prefix bloom. It essentially limits the max number of cache lines each - * bloom filter check can touch. - * This optimization is turned off when set to 0. The number should never - * be greater than number of probes. This option can boost performance - * for in-memory workload but should use with care since it can cause - * higher false positive rate. - * Default: 0 - * - * @return the level of locality of bloom-filter probes. - * @see #setMemtablePrefixBloomProbes(int) - */ + @Override public int bloomLocality() { return bloomLocality(nativeHandle_); } - private native int bloomLocality(long handle); - /** - * Control locality of bloom filter probes to improve cache miss rate. - * This option only applies to memtable prefix bloom and plaintable - * prefix bloom. It essentially limits the max number of cache lines each - * bloom filter check can touch. - * This optimization is turned off when set to 0. The number should never - * be greater than number of probes. This option can boost performance - * for in-memory workload but should use with care since it can cause - * higher false positive rate. - * Default: 0 - * - * @param bloomLocality the level of locality of bloom-filter probes. - * @return the reference to the current option. - */ + @Override public Options setBloomLocality(int bloomLocality) { setBloomLocality(nativeHandle_, bloomLocality); return this; } - private native void setBloomLocality( - long handle, int bloomLocality); - /** - * Maximum number of successive merge operations on a key in the memtable. - * - * When a merge operation is added to the memtable and the maximum number of - * successive merges is reached, the value of the key will be calculated and - * inserted into the memtable instead of the merge operation. This will - * ensure that there are never more than max_successive_merges merge - * operations in the memtable. - * - * Default: 0 (disabled) - * - * @return the maximum number of successive merges. - */ + @Override public long maxSuccessiveMerges() { return maxSuccessiveMerges(nativeHandle_); } - private native long maxSuccessiveMerges(long handle); - /** - * Maximum number of successive merge operations on a key in the memtable. 
- * - * When a merge operation is added to the memtable and the maximum number of - * successive merges is reached, the value of the key will be calculated and - * inserted into the memtable instead of the merge operation. This will - * ensure that there are never more than max_successive_merges merge - * operations in the memtable. - * - * Default: 0 (disabled) - * - * @param maxSuccessiveMerges the maximum number of successive merges. - * @return the reference to the current option. - * @throws RocksDBException - */ + @Override public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) throws RocksDBException { setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); return this; } - private native void setMaxSuccessiveMerges( - long handle, long maxSuccessiveMerges) throws RocksDBException; - /** - * The minimum number of write buffers that will be merged together - * before writing to storage. If set to 1, then - * all write buffers are fushed to L0 as individual files and this increases - * read amplification because a get request has to check in all of these - * files. Also, an in-memory merge may result in writing lesser - * data to storage if there are duplicate records in each of these - * individual write buffers. Default: 1 - * - * @return the minimum number of write buffers that will be merged together. - */ + @Override public int minWriteBufferNumberToMerge() { return minWriteBufferNumberToMerge(nativeHandle_); } - private native int minWriteBufferNumberToMerge(long handle); - /** - * The minimum number of write buffers that will be merged together - * before writing to storage. If set to 1, then - * all write buffers are fushed to L0 as individual files and this increases - * read amplification because a get request has to check in all of these - * files. Also, an in-memory merge may result in writing lesser - * data to storage if there are duplicate records in each of these - * individual write buffers. Default: 1 - * - * @param minWriteBufferNumberToMerge the minimum number of write buffers - * that will be merged together. - * @return the reference to the current option. - */ + @Override public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) { setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); return this; } - private native void setMinWriteBufferNumberToMerge( - long handle, int minWriteBufferNumberToMerge); - /** - * The number of partial merge operands to accumulate before partial - * merge will be performed. Partial merge will not be called - * if the list of values to merge is less than min_partial_merge_operands. - * - * If min_partial_merge_operands < 2, then it will be treated as 2. - * - * Default: 2 - * - * @return min partial merge operands - */ + @Override public int minPartialMergeOperands() { return minPartialMergeOperands(nativeHandle_); } - private native int minPartialMergeOperands(long handle); - /** - * The number of partial merge operands to accumulate before partial - * merge will be performed. Partial merge will not be called - * if the list of values to merge is less than min_partial_merge_operands. - * - * If min_partial_merge_operands < 2, then it will be treated as 2. - * - * Default: 2 - * - * @param minPartialMergeOperands - * @return the reference to the current option. 
- */ + @Override public Options setMinPartialMergeOperands(int minPartialMergeOperands) { setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); return this; } - private native void setMinPartialMergeOperands( - long handle, int minPartialMergeOperands); - - /** - * Set the merge operator to be used for merging two merge operands - * of the same key. The merge function is invoked during - * compaction and at lookup time, if multiple key/value pairs belonging - * to the same key are found in the database. - * - * @param name the name of the merge function, as defined by - * the MergeOperators factory (see utilities/MergeOperators.h) - * The merge function is specified by name and must be one of the - * standard merge operators provided by RocksDB. The available - * operators are "put", "uint64add", "stringappend" and "stringappendtest". - * @return the reference to the current option. - */ - public Options setMergeOperatorName(String name) { - setMergeOperatorName(nativeHandle_, name); - return this; - } - private native void setMergeOperatorName( - long handle, String name); - - /** - * Set the merge operator to be used for merging two different key/value - * pairs that share the same key. The merge function is invoked during - * compaction and at lookup time, if multiple key/value pairs belonging - * to the same key are found in the database. - * - * @param a {@link MergeOperator} object - * @return the reference to the current option. - */ - public Options setMergeOperator(MergeOperator mergeOperator) { - setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); - return this; - } - private native void setMergeOperator( - long handle, long mergeOperatorHandle); /** * Release the memory allocated for the current instance @@ -2303,45 +982,223 @@ public class Options extends RocksObject { disposeInternal(nativeHandle_); } - static final int DEFAULT_PLAIN_TABLE_BLOOM_BITS_PER_KEY = 10; - static final double DEFAULT_PLAIN_TABLE_HASH_TABLE_RATIO = 0.75; - static final int DEFAULT_PLAIN_TABLE_INDEX_SPARSENESS = 16; - private native void newOptions(); + private native void newOptions(long dbOptHandle, + long cfOptHandle); private native void disposeInternal(long handle); + private native void setEnv(long optHandle, long envHandle); + private native long getEnvHandle(long handle); + private native void prepareForBulkLoad(long handle); + + // DB native handles private native void setCreateIfMissing(long handle, boolean flag); private native boolean createIfMissing(long handle); - private native void setWriteBufferSize(long handle, long writeBufferSize) - throws RocksDBException; private native void setCreateMissingColumnFamilies( long handle, boolean flag); private native boolean createMissingColumnFamilies(long handle); + private native void setErrorIfExists(long handle, boolean errorIfExists); + private native boolean errorIfExists(long handle); + private native void setParanoidChecks( + long handle, boolean paranoidChecks); + private native boolean paranoidChecks(long handle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); + private native void setMaxOpenFiles(long handle, int maxOpenFiles); + private native int maxOpenFiles(long handle); + private native void setMaxTotalWalSize(long handle, + long maxTotalWalSize); + private native long maxTotalWalSize(long handle); + private native void createStatistics(long optHandle); + private native long statisticsPtr(long optHandle); + private native void setDisableDataSync(long handle, boolean 
disableDataSync); + private native boolean disableDataSync(long handle); + private native boolean useFsync(long handle); + private native void setUseFsync(long handle, boolean useFsync); + private native void setDbLogDir(long handle, String dbLogDir); + private native String dbLogDir(long handle); + private native void setWalDir(long handle, String walDir); + private native String walDir(long handle); + private native void setDeleteObsoleteFilesPeriodMicros( + long handle, long micros); + private native long deleteObsoleteFilesPeriodMicros(long handle); + private native void setMaxBackgroundCompactions( + long handle, int maxBackgroundCompactions); + private native int maxBackgroundCompactions(long handle); + private native void setMaxBackgroundFlushes( + long handle, int maxBackgroundFlushes); + private native int maxBackgroundFlushes(long handle); + private native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws RocksDBException; + private native long maxLogFileSize(long handle); + private native void setLogFileTimeToRoll( + long handle, long logFileTimeToRoll) throws RocksDBException; + private native long logFileTimeToRoll(long handle); + private native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws RocksDBException; + private native long keepLogFileNum(long handle); + private native void setMaxManifestFileSize( + long handle, long maxManifestFileSize); + private native long maxManifestFileSize(long handle); + private native void setTableCacheNumshardbits( + long handle, int tableCacheNumshardbits); + private native int tableCacheNumshardbits(long handle); + private native void setTableCacheRemoveScanCountLimit( + long handle, int limit); + private native int tableCacheRemoveScanCountLimit(long handle); + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + private native long walTtlSeconds(long handle); + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private native long walSizeLimitMB(long handle); + private native void setManifestPreallocationSize( + long handle, long size) throws RocksDBException; + private native long manifestPreallocationSize(long handle); + private native void setAllowOsBuffer( + long handle, boolean allowOsBuffer); + private native boolean allowOsBuffer(long handle); + private native void setAllowMmapReads( + long handle, boolean allowMmapReads); + private native boolean allowMmapReads(long handle); + private native void setAllowMmapWrites( + long handle, boolean allowMmapWrites); + private native boolean allowMmapWrites(long handle); + private native void setIsFdCloseOnExec( + long handle, boolean isFdCloseOnExec); + private native boolean isFdCloseOnExec(long handle); + private native void setSkipLogErrorOnRecovery( + long handle, boolean skip); + private native boolean skipLogErrorOnRecovery(long handle); + private native void setStatsDumpPeriodSec( + long handle, int statsDumpPeriodSec); + private native int statsDumpPeriodSec(long handle); + private native void setAdviseRandomOnOpen( + long handle, boolean adviseRandomOnOpen); + private native boolean adviseRandomOnOpen(long handle); + private native void setUseAdaptiveMutex( + long handle, boolean useAdaptiveMutex); + private native boolean useAdaptiveMutex(long handle); + private native void setBytesPerSync( + long handle, long bytesPerSync); + private native long bytesPerSync(long handle); + // CF native handles + private native void optimizeForPointLookup(long handle, + long blockCacheSizeMb); + private native void 
optimizeLevelStyleCompaction(long handle, + long memtableMemoryBudget); + private native void optimizeUniversalStyleCompaction(long handle, + long memtableMemoryBudget); + private native void setComparatorHandle(long handle, int builtinComparator); + private native void setComparatorHandle(long optHandle, long comparatorHandle); + private native void setMergeOperatorName( + long handle, String name); + private native void setMergeOperator( + long handle, long mergeOperatorHandle); + private native void setWriteBufferSize(long handle, long writeBufferSize) + throws RocksDBException; private native long writeBufferSize(long handle); private native void setMaxWriteBufferNumber( long handle, int maxWriteBufferNumber); private native int maxWriteBufferNumber(long handle); - private native void setMaxBackgroundCompactions( - long handle, int maxBackgroundCompactions); - private native int maxBackgroundCompactions(long handle); - private native void createStatistics(long optHandle); - private native long statisticsPtr(long optHandle); - + private native void setMinWriteBufferNumberToMerge( + long handle, int minWriteBufferNumberToMerge); + private native int minWriteBufferNumberToMerge(long handle); + private native void setCompressionType(long handle, byte compressionType); + private native byte compressionType(long handle); + private native void useFixedLengthPrefixExtractor( + long handle, int prefixLength); + private native void setNumLevels( + long handle, int numLevels); + private native int numLevels(long handle); + private native void setLevelZeroFileNumCompactionTrigger( + long handle, int numFiles); + private native int levelZeroFileNumCompactionTrigger(long handle); + private native void setLevelZeroSlowdownWritesTrigger( + long handle, int numFiles); + private native int levelZeroSlowdownWritesTrigger(long handle); + private native void setLevelZeroStopWritesTrigger( + long handle, int numFiles); + private native int levelZeroStopWritesTrigger(long handle); + private native void setMaxMemCompactionLevel( + long handle, int maxMemCompactionLevel); + private native int maxMemCompactionLevel(long handle); + private native void setTargetFileSizeBase( + long handle, long targetFileSizeBase); + private native long targetFileSizeBase(long handle); + private native void setTargetFileSizeMultiplier( + long handle, int multiplier); + private native int targetFileSizeMultiplier(long handle); + private native void setMaxBytesForLevelBase( + long handle, long maxBytesForLevelBase); + private native long maxBytesForLevelBase(long handle); + private native void setMaxBytesForLevelMultiplier( + long handle, int multiplier); + private native int maxBytesForLevelMultiplier(long handle); + private native void setExpandedCompactionFactor( + long handle, int expandedCompactionFactor); + private native int expandedCompactionFactor(long handle); + private native void setSourceCompactionFactor( + long handle, int sourceCompactionFactor); + private native int sourceCompactionFactor(long handle); + private native void setMaxGrandparentOverlapFactor( + long handle, int maxGrandparentOverlapFactor); + private native int maxGrandparentOverlapFactor(long handle); + private native void setSoftRateLimit( + long handle, double softRateLimit); + private native double softRateLimit(long handle); + private native void setHardRateLimit( + long handle, double hardRateLimit); + private native double hardRateLimit(long handle); + private native void setRateLimitDelayMaxMilliseconds( + long handle, int 
rateLimitDelayMaxMilliseconds); + private native int rateLimitDelayMaxMilliseconds(long handle); + private native void setArenaBlockSize( + long handle, long arenaBlockSize) throws RocksDBException; + private native long arenaBlockSize(long handle); + private native void setDisableAutoCompactions( + long handle, boolean disableAutoCompactions); + private native boolean disableAutoCompactions(long handle); + private native void setCompactionStyle(long handle, byte compactionStyle); + private native byte compactionStyle(long handle); + private native void setPurgeRedundantKvsWhileFlush( + long handle, boolean purgeRedundantKvsWhileFlush); + private native boolean purgeRedundantKvsWhileFlush(long handle); + private native void setVerifyChecksumsInCompaction( + long handle, boolean verifyChecksumsInCompaction); + private native boolean verifyChecksumsInCompaction(long handle); + private native void setFilterDeletes( + long handle, boolean filterDeletes); + private native boolean filterDeletes(long handle); + private native void setMaxSequentialSkipInIterations( + long handle, long maxSequentialSkipInIterations); + private native long maxSequentialSkipInIterations(long handle); private native void setMemTableFactory(long handle, long factoryHandle); - private native void setRateLimiter(long handle, - long rateLimiterHandle); private native String memTableFactoryName(long handle); - private native void setTableFactory(long handle, long factoryHandle); private native String tableFactoryName(long handle); - - private native void useFixedLengthPrefixExtractor( - long handle, int prefixLength); - - long cacheSize_; - int numShardBits_; - AbstractComparator comparator_; + private native void setInplaceUpdateSupport( + long handle, boolean inplaceUpdateSupport); + private native boolean inplaceUpdateSupport(long handle); + private native void setInplaceUpdateNumLocks( + long handle, long inplaceUpdateNumLocks) throws RocksDBException; + private native long inplaceUpdateNumLocks(long handle); + private native void setMemtablePrefixBloomBits( + long handle, int memtablePrefixBloomBits); + private native int memtablePrefixBloomBits(long handle); + private native void setMemtablePrefixBloomProbes( + long handle, int memtablePrefixBloomProbes); + private native int memtablePrefixBloomProbes(long handle); + private native void setBloomLocality( + long handle, int bloomLocality); + private native int bloomLocality(long handle); + private native void setMaxSuccessiveMerges( + long handle, long maxSuccessiveMerges) throws RocksDBException; + private native long maxSuccessiveMerges(long handle); + private native void setMinPartialMergeOperands( + long handle, int minPartialMergeOperands); + private native int minPartialMergeOperands(long handle); + // instance variables RocksEnv env_; MemTableConfig memTableConfig_; TableFormatConfig tableFormatConfig_; RateLimiterConfig rateLimiterConfig_; + AbstractComparator comparator_; } diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java index 1b309e6c9..06f3990d0 100644 --- a/java/org/rocksdb/RateLimiterConfig.java +++ b/java/org/rocksdb/RateLimiterConfig.java @@ -11,11 +11,11 @@ package org.rocksdb; public abstract class RateLimiterConfig { /** * This function should only be called by - * {@link org.rocksdb.Options#setRateLimiter(long, long)}, which will + * {@link org.rocksdb.DBOptions#setRateLimiter(long, long)}, which will * create a c++ shared-pointer to the c++ {@code RateLimiter} that is associated * with a Java 
RateLimiterConfig. * - * @see org.rocksdb.Options#setRateLimiter(long, long) + * @see org.rocksdb.DBOptions#setRateLimiter(long, long) */ abstract protected long newRateLimiterHandle(); } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index 222d87b8d..ef88e3503 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -44,6 +44,13 @@ public class OptionsTest { assert(opt.paranoidChecks() == boolValue); } + { + // MaxTotalWalSize test + long longValue = rand.nextLong(); + opt.setMaxTotalWalSize(longValue); + assert(opt.maxTotalWalSize() == longValue); + } + { // MaxOpenFiles test int intValue = rand.nextInt(); opt.setMaxOpenFiles(intValue); @@ -264,9 +271,9 @@ public class OptionsTest { } { // TargetFileSizeBase test - int intValue = rand.nextInt(); - opt.setTargetFileSizeBase(intValue); - assert(opt.targetFileSizeBase() == intValue); + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assert(opt.targetFileSizeBase() == longValue); } { // TargetFileSizeMultiplier test diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 7d0834bb9..6cde26327 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -12,6 +12,8 @@ #include #include "include/org_rocksdb_Options.h" +#include "include/org_rocksdb_DBOptions.h" +#include "include/org_rocksdb_ColumnFamilyOptions.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" @@ -34,11 +36,25 @@ * Method: newOptions * Signature: ()V */ -void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) { +void Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jobject jobj) { rocksdb::Options* op = new rocksdb::Options(); rocksdb::OptionsJni::setHandle(env, jobj, op); } +/* + * Class: org_rocksdb_Options + * Method: newOptions + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jobject jobj, + jlong jdboptions, jlong jcfoptions) { + auto dbOpt = reinterpret_cast(jdboptions); + auto cfOpt = reinterpret_cast( + jcfoptions); + rocksdb::Options* op = new rocksdb::Options(*dbOpt, *cfOpt); + rocksdb::OptionsJni::setHandle(env, jobj, op); +} + /* * Class: org_rocksdb_Options * Method: disposeInternal @@ -93,10 +109,10 @@ jboolean Java_org_rocksdb_Options_createMissingColumnFamilies( /* * Class: org_rocksdb_Options - * Method: useReverseBytewiseComparator + * Method: setComparatorHandle * Signature: (JI)V */ -void Java_org_rocksdb_Options_setBuiltinComparator( +void Java_org_rocksdb_Options_setComparatorHandle__JI( JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { switch (builtinComparator) { case 1: @@ -110,6 +126,41 @@ void Java_org_rocksdb_Options_setBuiltinComparator( } } +/* + * Class: org_rocksdb_Options + * Method: setComparatorHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setComparatorHandle__JJ( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); +} + +/* + * Class: org_rocksdb_Options + * Method: setMergeOperatorName + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperatorName( + JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { + const char* op_name = env->GetStringUTFChars(name, 0); + reinterpret_cast(jhandle)->merge_operator = + rocksdb::MergeOperators::CreateFromStringId(op_name); +} + +/* + * Class: 
org_rocksdb_Options + * Method: setMergeOperator + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + reinterpret_cast(jhandle)->merge_operator = + *(reinterpret_cast*> + (mergeOperatorHandle)); +} + /* * Class: org_rocksdb_Options * Method: setWriteBufferSize @@ -169,17 +220,6 @@ jlong Java_org_rocksdb_Options_statisticsPtr( return reinterpret_cast(st); } -/* - * Class: org_rocksdb_Options - * Method: setComparatorHandle - * Signature: (JJ)V - */ -void Java_org_rocksdb_Options_setComparatorHandle( - JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { - reinterpret_cast(jopt_handle)->comparator = - reinterpret_cast(jcomparator_handle); -} - /* * Class: org_rocksdb_Options * Method: maxWriteBufferNumber @@ -232,6 +272,29 @@ void Java_org_rocksdb_Options_setParanoidChecks( static_cast(paranoid_checks); } +/* + * Class: org_rocksdb_Options + * Method: setMaxTotalWalSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_total_wal_size) { + reinterpret_cast(jhandle)->max_total_wal_size = + static_cast(jmax_total_wal_size); +} + +/* + * Class: org_rocksdb_Options + * Method: maxTotalWalSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_total_wal_size; +} + /* * Class: org_rocksdb_Options * Method: maxOpenFiles @@ -1073,9 +1136,9 @@ void Java_org_rocksdb_Options_setMaxMemCompactionLevel( /* * Class: org_rocksdb_Options * Method: targetFileSizeBase - * Signature: (J)I + * Signature: (J)J */ -jint Java_org_rocksdb_Options_targetFileSizeBase( +jlong Java_org_rocksdb_Options_targetFileSizeBase( JNIEnv* env, jobject jobj, jlong jhandle) { return reinterpret_cast(jhandle)->target_file_size_base; } @@ -1083,13 +1146,13 @@ jint Java_org_rocksdb_Options_targetFileSizeBase( /* * Class: org_rocksdb_Options * Method: setTargetFileSizeBase - * Signature: (JI)V + * Signature: (JJ)V */ void Java_org_rocksdb_Options_setTargetFileSizeBase( JNIEnv* env, jobject jobj, jlong jhandle, - jint jtarget_file_size_base) { + jlong jtarget_file_size_base) { reinterpret_cast(jhandle)->target_file_size_base = - static_cast(jtarget_file_size_base); + static_cast(jtarget_file_size_base); } /* @@ -1619,185 +1682,1927 @@ void Java_org_rocksdb_Options_setMinPartialMergeOperands( static_cast(jmin_partial_merge_operands); } +/* + * Method: optimizeForPointLookup + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeForPointLookup( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong block_cache_size_mb) { + reinterpret_cast(jhandle)-> + OptimizeForPointLookup(block_cache_size_mb); +} + +/* + * Method: optimizeLevelStyleCompaction + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeLevelStyleCompaction(memtable_memory_budget); +} + /* * Class: org_rocksdb_Options - * Method: setMergeOperatorName - * Signature: (JJjava/lang/String)V + * Method: optimizeUniversalStyleCompaction + * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMergeOperatorName( - JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { - const char* op_name = env->GetStringUTFChars(name, 0); - reinterpret_cast(jhandle)->merge_operator = - 
rocksdb::MergeOperators::CreateFromStringId(op_name); +void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeUniversalStyleCompaction(memtable_memory_budget); } /* * Class: org_rocksdb_Options - * Method: setMergeOperator - * Signature: (JJjava/lang/String)V + * Method: prepareForBulkLoad + * Signature: (J)V */ -void Java_org_rocksdb_Options_setMergeOperator( - JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { - reinterpret_cast(jhandle)->merge_operator = - *(reinterpret_cast*> - (mergeOperatorHandle)); +void Java_org_rocksdb_Options_prepareForBulkLoad( + JNIEnv* env, jobject jobj, jlong jhandle) { + reinterpret_cast(jhandle)-> + PrepareForBulkLoad(); } ////////////////////////////////////////////////////////////////////////////// -// WriteOptions +// rocksdb::ColumnFamilyOptions /* - * Class: org_rocksdb_WriteOptions - * Method: newWriteOptions + * Class: org_rocksdb_ColumnFamilyOptions + * Method: newColumnFamilyOptions * Signature: ()V */ -void Java_org_rocksdb_WriteOptions_newWriteOptions( - JNIEnv* env, jobject jwrite_options) { - rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( + JNIEnv* env, jobject jobj) { + //rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); + //rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); } /* - * Class: org_rocksdb_WriteOptions + * Class: org_rocksdb_ColumnFamilyOptions * Method: disposeInternal - * Signature: ()V + * Signature: (J)V */ -void Java_org_rocksdb_WriteOptions_disposeInternal( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - auto write_options = reinterpret_cast(jhandle); - delete write_options; - - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); +void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); } /* - * Class: org_rocksdb_WriteOptions - * Method: setSync - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeForPointLookup + * Signature: (JJ)V */ -void Java_org_rocksdb_WriteOptions_setSync( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { - reinterpret_cast(jhandle)->sync = jflag; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong block_cache_size_mb) { + reinterpret_cast(jhandle)-> + OptimizeForPointLookup(block_cache_size_mb); } /* - * Class: org_rocksdb_WriteOptions - * Method: sync - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeLevelStyleCompaction + * Signature: (JJ)V */ -jboolean Java_org_rocksdb_WriteOptions_sync( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - return reinterpret_cast(jhandle)->sync; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeLevelStyleCompaction(memtable_memory_budget); } /* - * Class: org_rocksdb_WriteOptions - * Method: setDisableWAL - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeUniversalStyleCompaction + * Signature: (JJ)V */ -void Java_org_rocksdb_WriteOptions_setDisableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) 
{ - reinterpret_cast(jhandle)->disableWAL = jflag; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeUniversalStyleCompaction(memtable_memory_budget); } /* - * Class: org_rocksdb_WriteOptions - * Method: disableWAL - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setComparatorHandle + * Signature: (JI)V */ -jboolean Java_org_rocksdb_WriteOptions_disableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - return reinterpret_cast(jhandle)->disableWAL; +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( + JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { + switch (builtinComparator) { + case 1: + reinterpret_cast(jhandle)->comparator = + rocksdb::ReverseBytewiseComparator(); + break; + default: + reinterpret_cast(jhandle)->comparator = + rocksdb::BytewiseComparator(); + break; + } } -///////////////////////////////////////////////////////////////////// -// rocksdb::ReadOptions - /* - * Class: org_rocksdb_ReadOptions - * Method: newReadOptions - * Signature: ()V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setComparatorHandle + * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_newReadOptions( - JNIEnv* env, jobject jobj) { - auto read_opt = new rocksdb::ReadOptions(); - rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt); +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); } /* - * Class: org_rocksdb_ReadOptions - * Method: disposeInternal - * Signature: (J)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMergeOperatorName + * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_ReadOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { - delete reinterpret_cast(jhandle); - rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr); +void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( + JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { + const char* op_name = env->GetStringUTFChars(name, 0); + reinterpret_cast(jhandle)->merge_operator = + rocksdb::MergeOperators::CreateFromStringId(op_name); } /* - * Class: org_rocksdb_ReadOptions - * Method: verifyChecksums - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMergeOperator + * Signature: (JJjava/lang/String)V */ -jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->verify_checksums; +void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + reinterpret_cast(jhandle)->merge_operator = + *(reinterpret_cast*> + (mergeOperatorHandle)); } /* - * Class: org_rocksdb_ReadOptions - * Method: setVerifyChecksums - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setWriteBufferSize + * Signature: (JJ)I */ -void Java_org_rocksdb_ReadOptions_setVerifyChecksums( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jverify_checksums) { - reinterpret_cast(jhandle)->verify_checksums = - static_cast(jverify_checksums); +void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { + rocksdb::Status s = 
rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + write_buffer_size = jwrite_buffer_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* - * Class: org_rocksdb_ReadOptions - * Method: fillCache - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: writeBufferSize + * Signature: (J)J */ -jboolean Java_org_rocksdb_ReadOptions_fillCache( +jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize( JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->fill_cache; + return reinterpret_cast(jhandle)-> + write_buffer_size; } /* - * Class: org_rocksdb_ReadOptions - * Method: setFillCache - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxWriteBufferNumber + * Signature: (JI)V */ -void Java_org_rocksdb_ReadOptions_setFillCache( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) { - reinterpret_cast(jhandle)->fill_cache = - static_cast(jfill_cache); +void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( + JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) { + reinterpret_cast(jhandle)-> + max_write_buffer_number = jmax_write_buffer_number; } /* - * Class: org_rocksdb_ReadOptions - * Method: tailing - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxWriteBufferNumber + * Signature: (J)I */ -jboolean Java_org_rocksdb_ReadOptions_tailing( +jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber( JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->tailing; + return reinterpret_cast(jhandle)-> + max_write_buffer_number; } /* - * Class: org_rocksdb_ReadOptions - * Method: setTailing - * Signature: (JZ)V + * Method: setMemTableFactory + * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setTailing( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) { - reinterpret_cast(jhandle)->tailing = - static_cast(jtailing); +void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast(jhandle)-> + memtable_factory.reset( + reinterpret_cast(jfactory_handle)); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memTableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get(); + + // Should never be nullptr. 
+ // Default memtable factory is SkipListFactory + assert(tf); + + // temporarly fix for the historical typo + if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) { + return env->NewStringUTF("HashLinkedListRepFactory"); + } + + return env->NewStringUTF(tf->Name()); +} + +/* + * Method: useFixedLengthPrefixExtractor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( + JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { + reinterpret_cast(jhandle)-> + prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + static_cast(jprefix_length))); +} + +/* + * Method: setTableFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast(jhandle)-> + table_factory.reset(reinterpret_cast( + jfactory_handle)); +} + +/* + * Method: tableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::TableFactory* tf = opt->table_factory.get(); + + // Should never be nullptr. + // Default memtable factory is SkipListFactory + assert(tf); + + return env->NewStringUTF(tf->Name()); +} + + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minWriteBufferNumberToMerge + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinWriteBufferNumberToMerge + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_write_buffer_number_to_merge) { + reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge = + static_cast(jmin_write_buffer_number_to_merge); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompressionType + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte compression) { + reinterpret_cast(jhandle)-> + compression = static_cast(compression); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: compressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + compression; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompactionStyle + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) { + reinterpret_cast(jhandle)->compaction_style = + static_cast(compaction_style); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: compactionStyle + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->compaction_style; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: numLevels + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->num_levels; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setNumLevels + * Signature: (JI)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setNumLevels( + JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) { + reinterpret_cast(jhandle)->num_levels = + static_cast(jnum_levels); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: levelZeroFileNumCompactionTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelZeroFileNumCompactionTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_file_num_compaction_trigger) { + reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger = + static_cast(jlevel0_file_num_compaction_trigger); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: levelZeroSlowdownWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelSlowdownWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger = + static_cast(jlevel0_slowdown_writes_trigger); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: levelZeroStopWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_stop_writes_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelStopWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_stop_writes_trigger) { + reinterpret_cast(jhandle)-> + level0_stop_writes_trigger = static_cast( + jlevel0_stop_writes_trigger); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxMemCompactionLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_maxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_mem_compaction_level; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxMemCompactionLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_mem_compaction_level) { + reinterpret_cast(jhandle)-> + max_mem_compaction_level = static_cast(jmax_mem_compaction_level); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: targetFileSizeBase + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + target_file_size_base; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setTargetFileSizeBase + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jtarget_file_size_base) { + reinterpret_cast(jhandle)-> + target_file_size_base 
= static_cast(jtarget_file_size_base); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: targetFileSizeMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->target_file_size_multiplier; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setTargetFileSizeMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jtarget_file_size_multiplier) { + reinterpret_cast( + jhandle)->target_file_size_multiplier = + static_cast(jtarget_file_size_multiplier); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxBytesForLevelBase + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_bytes_for_level_base; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxBytesForLevelBase + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_bytes_for_level_base) { + reinterpret_cast( + jhandle)->max_bytes_for_level_base = + static_cast(jmax_bytes_for_level_base); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxBytesForLevelMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxBytesForLevelMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_bytes_for_level_multiplier) { + reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier = + static_cast(jmax_bytes_for_level_multiplier); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: expandedCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_expandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->expanded_compaction_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setExpandedCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setExpandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jexpanded_compaction_factor) { + reinterpret_cast( + jhandle)->expanded_compaction_factor = + static_cast(jexpanded_compaction_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: sourceCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_sourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->source_compaction_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setSourceCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setSourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jsource_compaction_factor) { + reinterpret_cast( + jhandle)->source_compaction_factor = + static_cast(jsource_compaction_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxGrandparentOverlapFactor + * Signature: (J)I + */ +jint 
Java_org_rocksdb_ColumnFamilyOptions_maxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_grandparent_overlap_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxGrandparentOverlapFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_grandparent_overlap_factor) { + reinterpret_cast( + jhandle)->max_grandparent_overlap_factor = + static_cast(jmax_grandparent_overlap_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: softRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_softRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + soft_rate_limit; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setSoftRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setSoftRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) { + reinterpret_cast(jhandle)->soft_rate_limit = + static_cast(jsoft_rate_limit); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: hardRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_hardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + hard_rate_limit; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setHardRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setHardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) { + reinterpret_cast(jhandle)->hard_rate_limit = + static_cast(jhard_rate_limit); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: rateLimitDelayMaxMilliseconds + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_rateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setRateLimitDelayMaxMilliseconds + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setRateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jrate_limit_delay_max_milliseconds) { + reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds = + static_cast(jrate_limit_delay_max_milliseconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: arenaBlockSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + arena_block_size; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setArenaBlockSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + arena_block_size = jarena_block_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: disableAutoCompactions + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->disable_auto_compactions; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions 
+ * Method: setDisableAutoCompactions + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jdisable_auto_compactions) { + reinterpret_cast( + jhandle)->disable_auto_compactions = + static_cast(jdisable_auto_compactions); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: purgeRedundantKvsWhileFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_purgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setPurgeRedundantKvsWhileFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setPurgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jpurge_redundant_kvs_while_flush) { + reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush = + static_cast(jpurge_redundant_kvs_while_flush); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: verifyChecksumsInCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_verifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums_in_compaction; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setVerifyChecksumsInCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setVerifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums_in_compaction) { + reinterpret_cast( + jhandle)->verify_checksums_in_compaction = + static_cast(jverify_checksums_in_compaction); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: filterDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_filterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + filter_deletes; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setFilterDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setFilterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) { + reinterpret_cast(jhandle)->filter_deletes = + static_cast(jfilter_deletes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxSequentialSkipInIterations + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxSequentialSkipInIterations + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_sequential_skip_in_iterations) { + reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations = + static_cast(jmax_sequential_skip_in_iterations); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: inplaceUpdateSupport + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_support; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setInplaceUpdateSupport + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong 
jhandle, + jboolean jinplace_update_support) { + reinterpret_cast( + jhandle)->inplace_update_support = + static_cast(jinplace_update_support); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: inplaceUpdateNumLocks + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_num_locks; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setInplaceUpdateNumLocks + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jinplace_update_num_locks) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jinplace_update_num_locks); + if (s.ok()) { + reinterpret_cast(jhandle)-> + inplace_update_num_locks = jinplace_update_num_locks; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memtablePrefixBloomBits + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMemtablePrefixBloomBits + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_bits) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits = + static_cast(jmemtable_prefix_bloom_bits); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memtablePrefixBloomProbes + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMemtablePrefixBloomProbes + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_probes) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes = + static_cast(jmemtable_prefix_bloom_probes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: bloomLocality + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + bloom_locality; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBloomLocality + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + reinterpret_cast(jhandle)->bloom_locality = + static_cast(jbloom_locality); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxSuccessiveMerges + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_successive_merges; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxSuccessiveMerges + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_successive_merges) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jmax_successive_merges); + if (s.ok()) { + 
reinterpret_cast(jhandle)-> + max_successive_merges = jmax_successive_merges; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minPartialMergeOperands + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_minPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_partial_merge_operands; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinPartialMergeOperands + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_partial_merge_operands) { + reinterpret_cast( + jhandle)->min_partial_merge_operands = + static_cast(jmin_partial_merge_operands); +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::DBOptions + +/* + * Class: org_rocksdb_DBOptions + * Method: newDBOptions + * Signature: ()V + */ +void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, + jobject jobj) { + //rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); + //rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_DBOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setCreateIfMissing + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setCreateIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast(jhandle)-> + create_if_missing = flag; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createIfMissing + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_createIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->create_if_missing; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setCreateMissingColumnFamilies + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast + (jhandle)->create_missing_column_families = flag; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createMissingColumnFamilies + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->create_missing_column_families; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setErrorIfExists + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setErrorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) { + reinterpret_cast(jhandle)->error_if_exists = + static_cast(error_if_exists); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: errorIfExists + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_errorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->error_if_exists; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setParanoidChecks + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setParanoidChecks( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) { + reinterpret_cast(jhandle)->paranoid_checks = + static_cast(paranoid_checks); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: paranoidChecks + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_paranoidChecks( + JNIEnv* env, jobject jobj, 
jlong jhandle) { + return reinterpret_cast(jhandle)->paranoid_checks; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setRateLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setRateLimiter( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + reinterpret_cast(jhandle)->rate_limiter.reset( + reinterpret_cast(jrate_limiter_handle)); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxTotalWalSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_total_wal_size) { + reinterpret_cast(jhandle)->max_total_wal_size = + static_cast(jmax_total_wal_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxTotalWalSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_total_wal_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxOpenFiles + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) { + reinterpret_cast(jhandle)->max_open_files = + static_cast(max_open_files); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxOpenFiles + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_open_files; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createStatistics + * Signature: (J)V + */ +void Java_org_rocksdb_DBOptions_createStatistics( + JNIEnv* env, jobject jobj, jlong jOptHandle) { + reinterpret_cast(jOptHandle)->statistics = + rocksdb::CreateDBStatistics(); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: statisticsPtr + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_statisticsPtr( + JNIEnv* env, jobject jobj, jlong jOptHandle) { + auto st = reinterpret_cast(jOptHandle)-> + statistics.get(); + return reinterpret_cast(st); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDisableDataSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setDisableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) { + reinterpret_cast(jhandle)->disableDataSync = + static_cast(disableDataSync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: disableDataSync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_disableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->disableDataSync; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setUseFsync + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUseFsync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) { + reinterpret_cast(jhandle)->use_fsync = + static_cast(use_fsync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: useFsync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_useFsync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_fsync; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDbLogDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_DBOptions_setDbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) { + const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0); + reinterpret_cast(jhandle)->db_log_dir.assign(log_dir); + env->ReleaseStringUTFChars(jdb_log_dir, log_dir); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: dbLogDir + * Signature: 
(J)Ljava/lang/String + */ +jstring Java_org_rocksdb_DBOptions_dbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->db_log_dir.c_str()); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_DBOptions_setWalDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) { + const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0); + reinterpret_cast(jhandle)->wal_dir.assign(wal_dir); + env->ReleaseStringUTFChars(jwal_dir, wal_dir); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: walDir + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_DBOptions_walDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->wal_dir.c_str()); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDeleteObsoleteFilesPeriodMicros + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) { + reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros = + static_cast(micros); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: deleteObsoleteFilesPeriodMicros + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBackgroundCompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, jint max) { + reinterpret_cast(jhandle) + ->max_background_compactions = static_cast(max); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBackgroundCompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_background_compactions; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBackgroundFlushes + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) { + reinterpret_cast(jhandle)->max_background_flushes = + static_cast(max_background_flushes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBackgroundFlushes + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_background_flushes; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxLogFileSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + if (s.ok()) { + reinterpret_cast(jhandle)->max_log_file_size = + max_log_file_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxLogFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_log_file_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setLogFileTimeToRoll + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle, jlong 
log_file_time_to_roll) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + log_file_time_to_roll); + if (s.ok()) { + reinterpret_cast(jhandle)->log_file_time_to_roll = + log_file_time_to_roll; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: logFileTimeToRoll + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->log_file_time_to_roll; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setKeepLogFileNum + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setKeepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->keep_log_file_num = + keep_log_file_num; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: keepLogFileNum + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_keepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->keep_log_file_num; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxManifestFileSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) { + reinterpret_cast(jhandle)->max_manifest_file_size = + static_cast(max_manifest_file_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxManifestFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxManifestFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_manifest_file_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setTableCacheNumshardbits + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( + JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) { + reinterpret_cast(jhandle)->table_cache_numshardbits = + static_cast(table_cache_numshardbits); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: tableCacheNumshardbits + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + table_cache_numshardbits; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setTableCacheRemoveScanCountLimit + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setTableCacheRemoveScanCountLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jint limit) { + reinterpret_cast( + jhandle)->table_cache_remove_scan_count_limit = static_cast(limit); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: tableCacheRemoveScanCountLimit + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_tableCacheRemoveScanCountLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->table_cache_remove_scan_count_limit; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalTtlSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWalTtlSeconds( + JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) { + reinterpret_cast(jhandle)->WAL_ttl_seconds = + static_cast(WAL_ttl_seconds); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: walTtlSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_walTtlSeconds( + JNIEnv* env, jobject jobj, jlong jhandle) { + return 
reinterpret_cast(jhandle)->WAL_ttl_seconds; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalSizeLimitMB + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) { + reinterpret_cast(jhandle)->WAL_size_limit_MB = + static_cast(WAL_size_limit_MB); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: walTtlSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_walSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->WAL_size_limit_MB; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setManifestPreallocationSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + manifest_preallocation_size = preallocation_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: manifestPreallocationSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->manifest_preallocation_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowOsBuffer + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) { + reinterpret_cast(jhandle)->allow_os_buffer = + static_cast(allow_os_buffer); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowOsBuffer + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_os_buffer; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowMmapReads + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) { + reinterpret_cast(jhandle)->allow_mmap_reads = + static_cast(allow_mmap_reads); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowMmapReads + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_reads; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowMmapWrites + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) { + reinterpret_cast(jhandle)->allow_mmap_writes = + static_cast(allow_mmap_writes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowMmapWrites + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_writes; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setIsFdCloseOnExec + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) { + reinterpret_cast(jhandle)->is_fd_close_on_exec = + static_cast(is_fd_close_on_exec); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: isFdCloseOnExec + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return 
reinterpret_cast(jhandle)->is_fd_close_on_exec; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setSkipLogErrorOnRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setSkipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean skip) { + reinterpret_cast(jhandle)->skip_log_error_on_recovery = + static_cast(skip); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: skipLogErrorOnRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_skipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->skip_log_error_on_recovery; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setStatsDumpPeriodSec + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) { + reinterpret_cast(jhandle)->stats_dump_period_sec = + static_cast(stats_dump_period_sec); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: statsDumpPeriodSec + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->stats_dump_period_sec; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAdviseRandomOnOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) { + reinterpret_cast(jhandle)->advise_random_on_open = + static_cast(advise_random_on_open); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: adviseRandomOnOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->advise_random_on_open; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) { + reinterpret_cast(jhandle)->use_adaptive_mutex = + static_cast(use_adaptive_mutex); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBytesPerSync + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setBytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) { + reinterpret_cast(jhandle)->bytes_per_sync = + static_cast(bytes_per_sync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bytesPerSync + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_bytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->bytes_per_sync; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::WriteOptions + +/* + * Class: org_rocksdb_WriteOptions + * Method: newWriteOptions + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_newWriteOptions( + JNIEnv* env, jobject jwrite_options) { + rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disposeInternal + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_disposeInternal( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + auto write_options = 
reinterpret_cast(jhandle); + delete write_options; + + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setSync( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->sync = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_sync( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->sync; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setDisableWAL + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setDisableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->disableWAL = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disableWAL + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_disableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->disableWAL; +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::ReadOptions + +/* + * Class: org_rocksdb_ReadOptions + * Method: newReadOptions + * Signature: ()V + */ +void Java_org_rocksdb_ReadOptions_newReadOptions( + JNIEnv* env, jobject jobj) { + auto read_opt = new rocksdb::ReadOptions(); + rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ReadOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setVerifyChecksums + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setVerifyChecksums( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums) { + reinterpret_cast(jhandle)->verify_checksums = + static_cast(jverify_checksums); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: verifyChecksums + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setFillCache + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setFillCache( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) { + reinterpret_cast(jhandle)->fill_cache = + static_cast(jfill_cache); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: fillCache + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_fillCache( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->fill_cache; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setTailing + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setTailing( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) { + reinterpret_cast(jhandle)->tailing = + static_cast(jtailing); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: tailing + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_tailing( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->tailing; } /* From 171be0ed55b5d0664a4c95ec8a4b021526c8fa7a Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 29 Oct 2014 18:40:44 +0100 Subject: [PATCH 369/829] Merge with ColumnFamilies & 
Hardening CFHandle Summary: ColumnFamilyHandles face the same problem as RocksIterator previously so used methods were also applied for ColumnFamilyHandles. Another problem with CF was that Options passed to CFs were always filled with default values. To enable Merge, all parts of the database must share the same merge functionality which is not possible using default values. So from now on every CF will inherit from db options. Changes to RocksDB: - merge can now take also a cfhandle Changes to MergeTest: - Corrected formatting - Included also GC tests - Extended tests to cover CF related parts - Corrected paths to cleanup properly within the test process - Reduced verbosity of the test Test Plan: make rocksdbjava make jtest Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D27999 --- java/org/rocksdb/ColumnFamilyHandle.java | 22 +- java/org/rocksdb/RocksDB.java | 52 ++++- java/org/rocksdb/test/MergeTest.java | 243 ++++++++++++++++------- java/rocksjni/rocksjni.cc | 72 ++++++- 4 files changed, 292 insertions(+), 97 deletions(-) diff --git a/java/org/rocksdb/ColumnFamilyHandle.java b/java/org/rocksdb/ColumnFamilyHandle.java index 334abd96d..92a4d7cef 100644 --- a/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/org/rocksdb/ColumnFamilyHandle.java @@ -10,23 +10,33 @@ package org.rocksdb; * ColumnFamily Pointers. */ public class ColumnFamilyHandle extends RocksObject { - ColumnFamilyHandle(long nativeHandle) { + ColumnFamilyHandle(RocksDB rocksDB, long nativeHandle) { super(); nativeHandle_ = nativeHandle; + // rocksDB must point to a valid RocksDB instance; + assert(rocksDB != null); + // ColumnFamilyHandle must hold a reference to the related RocksDB instance + // to guarantee that while a GC cycle starts ColumnFamilyHandle instances + // are freed prior to RocksDB instances. + rocksDB_ = rocksDB; } /** - * Deletes underlying C++ filter pointer. + *

Deletes underlying C++ iterator pointer.
 * - * Note that this function should be called only after all - * RocksDB instances referencing the filter are closed. - * Otherwise an undefined behavior will occur. + * Note: the underlying handle can only be safely deleted if the RocksDB + * instance related to a certain ColumnFamilyHandle is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the RocksDB is initialized + * before freeing the native handle.
                */ @Override protected void disposeInternal() { assert(isInitialized()); - disposeInternal(nativeHandle_); + if (rocksDB_.isInitialized()) { + disposeInternal(nativeHandle_); + } } private native void disposeInternal(long handle); + private RocksDB rocksDB_; } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 291c505c7..2a90c7370 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -214,7 +214,7 @@ public class RocksDB extends RocksObject { List cfReferences = db.open(options.nativeHandle_, path, columnFamilyNames, columnFamilyNames.size()); for (int i=0; i cfReferences = db.openROnly(options.nativeHandle_, path, columnFamilyNames, columnFamilyNames.size()); for (int i=0; i cfNames = new ArrayList(); + List columnFamilyHandleList = + new ArrayList(); + cfNames.add("default"); + cfNames.add("new_cf"); + RocksDB db = RocksDB.open(opt, db_cf_path_string, cfNames, columnFamilyHandleList); + + // writing aa under key + db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), "bb".getBytes()); + + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + + for (ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.dispose(); + } + db.close(); + opt.dispose(); + assert(strValue.equals("aa,bb")); + } + + public static void testOperatorOption() + throws InterruptedException, RocksDBException { + Options opt = new Options(); + opt.setCreateIfMissing(true); + + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + + RocksDB db = RocksDB.open(opt, db_path_string); + // Writing aa under key + db.put("key".getBytes(), "aa".getBytes()); + + // Writing bb under key + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + + db.close(); + opt.dispose(); + assert(strValue.equals("aa,bb")); + } + + public static void testCFOperatorOption() + throws InterruptedException, RocksDBException { + Options opt = new Options(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + + List cfNames = new ArrayList(); + List columnFamilyHandleList = + new ArrayList(); + cfNames.add("default"); + cfNames.add("new_cf"); + RocksDB db = RocksDB.open(opt, db_path_operator, cfNames, columnFamilyHandleList); + + // writing aa under key + db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), "bb".getBytes()); + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + + // Test also with createColumnFamily + ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily("new_cf2"); + // writing xx under cfkey2 + db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); + // merge yy under cfkey2 + db.merge(columnFamilyHandle, "cfkey2".getBytes(), "yy".getBytes()); + value = db.get(columnFamilyHandle, "cfkey2".getBytes()); + String strValueTmpCf = new String(value); + + db.close(); + opt.dispose(); + assert(strValue.equals("aa,bb")); + assert(strValueTmpCf.equals("xx,yy")); + } + + public static void testOperatorGcBehaviour() + throws 
RocksDBException { + Options opt = new Options(); + opt.setCreateIfMissing(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + RocksDB db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test reuse + opt = new Options(); + opt.setMergeOperator(stringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test param init + opt = new Options(); + opt.setMergeOperator(new StringAppendOperator()); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test replace one with another merge operator instance + opt = new Options(); + opt.setMergeOperator(stringAppendOperator); + StringAppendOperator newStringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(newStringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + stringAppendOperator = null; + newStringAppendOperator = null; + System.gc(); + System.runFinalization(); + } + + public static void main(String[] args) + throws InterruptedException, RocksDBException { + testStringOption(); + testCFStringOption(); + testOperatorOption(); + testCFOperatorOption(); + testOperatorGcBehaviour(); + System.out.println("Passed MergeTest."); + } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 1e886e2e2..50cd8a359 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -95,7 +95,7 @@ jobject cfnames_to_free.push_back(cfname); jcfnames_for_free.push_back(jstr); column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, - rocksdb::ColumnFamilyOptions())); + *static_cast(opt))); } rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, @@ -167,7 +167,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( cfnames_to_free.push_back(cfname); jcfnames_for_free.push_back(jstr); column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, - rocksdb::ColumnFamilyOptions())); + *static_cast(opt))); } rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families, @@ -919,7 +919,7 @@ void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ( void rocksdb_merge_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - jbyteArray jkey, jint jkey_len, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, jbyteArray jvalue, jint jvalue_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); @@ -927,7 +927,12 @@ void rocksdb_merge_helper( rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - rocksdb::Status s = db->Merge(write_options, key_slice, value_slice); + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Merge(write_options, cf_handle, key_slice, value_slice); + } else { + s = db->Merge(write_options, key_slice, value_slice); + } // trigger java unref on key and value. 
// by passing JNI_ABORT, it will simply release the reference without @@ -955,8 +960,29 @@ void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI( rocksdb::WriteOptions(); rocksdb_merge_helper(env, db, default_write_options, - jkey, jkey_len, - jvalue, jvalue_len); + nullptr, jkey, jkey_len, jvalue, jvalue_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, default_write_options, + cf_handle, jkey, jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } /* @@ -974,8 +1000,30 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BI( jwrite_options_handle); rocksdb_merge_helper(env, db, *write_options, - jkey, jkey_len, - jvalue, jvalue_len); + nullptr, jkey, jkey_len, jvalue, jvalue_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, *write_options, + cf_handle, jkey, jkey_len, jvalue, jvalue_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } ////////////////////////////////////////////////////////////////////////////// @@ -1062,15 +1110,17 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( /* * Class: org_rocksdb_RocksDB * Method: createColumnFamily - * Signature: (JLjava/lang/String;)J; + * Signature: (JJLjava/lang/String;)J; */ jlong Java_org_rocksdb_RocksDB_createColumnFamily( - JNIEnv* env, jobject jdb, jlong jdb_handle, jstring jcfname) { + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jopt_handle, + jstring jcfname) { rocksdb::ColumnFamilyHandle* handle; const char* cfname = env->GetStringUTFChars(jcfname, 0); auto db_handle = reinterpret_cast(jdb_handle); + auto opt = reinterpret_cast(jopt_handle); rocksdb::Status s = db_handle->CreateColumnFamily( - rocksdb::ColumnFamilyOptions(), cfname, &handle); + *static_cast(opt), cfname, &handle); env->ReleaseStringUTFChars(jcfname, cfname); if (s.ok()) { From df7abb4e8de0cac06107fbb5f9a909c0bbc9f08b Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 31 Oct 2014 00:04:16 +0100 Subject: [PATCH 370/829] [RocksJava] Integrated code review comments - Added TODO comments for disabled methods --- java/rocksjni/options.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 6cde26327..0f4c19232 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1737,8 +1737,9 @@ void Java_org_rocksdb_Options_prepareForBulkLoad( */ void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( JNIEnv* env, jobject jobj) { - 
//rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); - //rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); + // TODO(fyrz) needs to be enabled back when ColumnFamilyOptions are available + // rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); + // rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); } /* @@ -2712,8 +2713,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinPartialMergeOperands( */ void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, jobject jobj) { - //rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); - //rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); + // TODO(fyrz) needs to be enabled back when DBOptions are available + // rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); + // rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); } /* From 85b04ca765be0e127aff74c36950c4099a67d93f Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 31 Oct 2014 00:41:04 +0100 Subject: [PATCH 371/829] [RocksJava] Review comments - reformatted MergeTest --- java/org/rocksdb/test/MergeTest.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index d102038d3..9435718f8 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -51,12 +51,15 @@ public class MergeTest { new ArrayList(); cfNames.add("default"); cfNames.add("new_cf"); - RocksDB db = RocksDB.open(opt, db_cf_path_string, cfNames, columnFamilyHandleList); + RocksDB db = RocksDB.open(opt, db_cf_path_string, + cfNames, columnFamilyHandleList); // writing aa under key - db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), "aa".getBytes()); + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); // merge bb under key - db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), "bb".getBytes()); + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); String strValue = new String(value); @@ -105,12 +108,15 @@ public class MergeTest { new ArrayList(); cfNames.add("default"); cfNames.add("new_cf"); - RocksDB db = RocksDB.open(opt, db_path_operator, cfNames, columnFamilyHandleList); + RocksDB db = RocksDB.open(opt, db_path_operator, + cfNames, columnFamilyHandleList); // writing aa under key - db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), "aa".getBytes()); + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); // merge bb under key - db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), "bb".getBytes()); + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); String strValue = new String(value); From 2b1f23dcae79216c1ff259afe176743ae25dce84 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 30 Oct 2014 16:54:34 -0700 Subject: [PATCH 372/829] Apply InfoLogLevel to the logs in db/db_iter.cc Summary: Apply InfoLogLevel to the logs in db/db_iter.cc Test Plan: make Reviewers: igor, ljin, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27861 --- db/db_iter.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/db_iter.cc b/db/db_iter.cc index 815562c9f..2fd4a9e2e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -162,7 +162,8 @@ class DBIter: public Iterator { inline bool 
DBIter::ParseKey(ParsedInternalKey* ikey) { if (!ParseInternalKey(iter_->key(), ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); - Log(logger_, "corrupted internal key in DBIter: %s", + Log(InfoLogLevel::ERROR_LEVEL, + logger_, "corrupted internal key in DBIter: %s", iter_->key().ToString(true).c_str()); return false; } else { @@ -278,7 +279,8 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { // iter_ points to the next entry (or invalid) void DBIter::MergeValuesNewToOld() { if (!user_merge_operator_) { - Log(logger_, "Options::merge_operator is null."); + Log(InfoLogLevel::ERROR_LEVEL, + logger_, "Options::merge_operator is null."); throw std::logic_error("DBIter::MergeValuesNewToOld() with" " Options::merge_operator null"); } From 4d2ba38b6508f464e0e8b295ffe7ed7ee4752386 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 31 Oct 2014 08:48:19 -0700 Subject: [PATCH 373/829] Make VersionBuilder unit testable Summary: Rename Version::Builder to VersionBuilder and expose its definition to a header. Make VerisonBuilder not reference Version or ColumnFamilyData, only working with VersionStorageInfo. Add version_builder_test which has a simple test. Test Plan: make all check Reviewers: rven, yhchiang, igor, ljin Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D27969 --- Makefile | 4 + db/column_family.cc | 6 +- db/compaction.cc | 4 +- db/compaction_picker_test.cc | 7 +- db/db_impl.cc | 23 +- db/db_impl_debug.cc | 7 +- db/flush_job.cc | 2 +- db/forward_iterator.cc | 6 +- db/internal_stats.cc | 6 +- db/version_builder.h | 40 +++ db/version_builder_test.cc | 123 +++++++ db/version_edit.h | 9 +- db/version_set.cc | 346 +++++++++++--------- db/version_set.h | 42 +-- util/ldb_cmd.cc | 2 +- utilities/compacted_db/compacted_db_impl.cc | 2 +- 16 files changed, 421 insertions(+), 208 deletions(-) create mode 100644 db/version_builder.h create mode 100644 db/version_builder_test.cc diff --git a/Makefile b/Makefile index 8642834b8..5ed8a5a67 100644 --- a/Makefile +++ b/Makefile @@ -133,6 +133,7 @@ TESTS = \ version_edit_test \ version_set_test \ compaction_picker_test \ + version_builder_test \ file_indexer_test \ write_batch_test \ write_controller_test\ @@ -464,6 +465,9 @@ version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/column_family.cc b/db/column_family.cc index e6298692a..b7497ecfe 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -324,7 +324,7 @@ ColumnFamilyData::~ColumnFamilyData() { void ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { - auto* vstorage = current_->GetStorageInfo(); + auto* vstorage = current_->storage_info(); const double score = vstorage->MaxCompactionScore(); const int max_level = vstorage->MaxCompactionScoreLevel(); @@ -405,7 +405,7 @@ void ColumnFamilyData::CreateNewMemtable( Compaction* 
ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, current_->GetStorageInfo(), log_buffer); + GetName(), mutable_options, current_->storage_info(), log_buffer); if (result != nullptr) { result->SetInputVersion(current_); } @@ -418,7 +418,7 @@ Compaction* ColumnFamilyData::CompactRange( const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { auto* result = compaction_picker_->CompactRange( - GetName(), mutable_cf_options, current_->GetStorageInfo(), input_level, + GetName(), mutable_cf_options, current_->storage_info(), input_level, output_level, output_path_id, begin, end, compaction_end); if (result != nullptr) { result->SetInputVersion(current_); diff --git a/db/compaction.cc b/db/compaction.cc index a739da29e..6c76012db 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -129,7 +129,7 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { const std::vector& files = - input_version_->GetStorageInfo()->LevelFiles(lvl); + input_version_->storage_info()->LevelFiles(lvl); for (; level_ptrs_[lvl] < files.size(); ) { FileMetaData* f = files[level_ptrs_[lvl]]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { @@ -228,7 +228,7 @@ void Compaction::ReleaseCompactionFiles(Status status) { void Compaction::ResetNextCompactionIndex() { assert(input_version_ != nullptr); - input_version_->GetStorageInfo()->ResetNextCompactionIndex(start_level_); + input_version_->storage_info()->ResetNextCompactionIndex(start_level_); } namespace { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index c302d2a2a..f094fbafb 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -50,9 +50,8 @@ class CompactionPickerTest { } ~CompactionPickerTest() { - auto* files = vstorage.GetFiles(); for (int i = 0; i < vstorage.NumberLevels(); i++) { - for (auto* f : files[i]) { + for (auto* f : vstorage.LevelFiles(i)) { delete f; } } @@ -63,13 +62,13 @@ class CompactionPickerTest { SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { assert(level < vstorage.NumberLevels()); - auto& files = vstorage.GetFiles()[level]; FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); f->compensated_file_size = file_size; - files.push_back(f); + f->refs = 0; + vstorage.MaybeAddFile(level, f); } void UpdateVersionStorageInfo() { diff --git a/db/db_impl.cc b/db/db_impl.cc index 78fb4ce13..231325cc3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1181,7 +1181,7 @@ Status DBImpl::FlushMemTableToOutputFile( } VersionStorageInfo::LevelSummaryStorage tmp; LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(), - cfd->current()->GetStorageInfo()->LevelSummary(&tmp)); + cfd->current()->storage_info()->LevelSummary(&tmp)); if (disable_delete_obsolete_files_ == 0) { // add to deletion state @@ -1227,7 +1227,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, MutexLock l(&mutex_); Version* base = cfd->current(); for (int level = 1; level < cfd->NumberLevels(); level++) { - if (base->GetStorageInfo()->OverlapInLevel(level, begin, end)) { + if (base->storage_info()->OverlapInLevel(level, 
begin, end)) { max_level_with_files = level; } } @@ -1305,7 +1305,7 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, int level) { mutex_.AssertHeld(); - auto* vstorage = cfd->current()->GetStorageInfo(); + const auto* vstorage = cfd->current()->storage_info(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty @@ -1364,7 +1364,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); - for (const auto& f : cfd->current()->GetStorageInfo()->files_[level]) { + for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, @@ -1580,7 +1580,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { bool is_compaction_needed = false; // no need to refcount since we're under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->current()->GetStorageInfo()->NeedsCompaction()) { + if (cfd->current()->storage_info()->NeedsCompaction()) { is_compaction_needed = true; break; } @@ -1956,7 +1956,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, " bytes %s: %s\n", c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), - c->input_version()->GetStorageInfo()->LevelSummary(&tmp)); + c->input_version()->storage_info()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; } else { @@ -2688,7 +2688,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch); - assert(cfd->current()->GetStorageInfo()->NumLevelFiles( + assert(cfd->current()->storage_info()->NumLevelFiles( compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2934,7 +2934,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", cfd->GetName().c_str(), - cfd->current()->GetStorageInfo()->LevelSummary(&tmp), + cfd->current()->storage_info()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1) / static_cast(stats.micros), stats.bytes_written / static_cast(stats.micros), @@ -4040,7 +4040,7 @@ Status DBImpl::DeleteFile(std::string name) { // Only the files in the last level can be deleted externally. // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. 
- auto* vstoreage = cfd->current()->GetStorageInfo(); + auto* vstoreage = cfd->current()->storage_info(); for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (vstoreage->NumLevelFiles(i) != 0) { Log(db_options_.info_log, @@ -4049,7 +4049,8 @@ Status DBImpl::DeleteFile(std::string name) { } } // if level == 0, it has to be the oldest file - if (level == 0 && vstoreage->files_[0].back()->fd.GetNumber() != number) { + if (level == 0 && + vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { return Status::InvalidArgument("File in level 0, but not oldest"); } edit.SetColumnFamily(cfd->GetID()); @@ -4302,7 +4303,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, for (auto cfd : *impl->versions_->GetColumnFamilySet()) { if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - auto* vstorage = cfd->current()->GetStorageInfo(); + auto* vstorage = cfd->current()->storage_info(); for (int i = 1; i < vstorage->NumberLevels(); ++i) { int num_files = vstorage->NumLevelFiles(i); if (num_files > 0) { diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 2d67167ba..283f9393f 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -15,8 +15,7 @@ namespace rocksdb { uint64_t DBImpl::TEST_GetLevel0TotalSize() { MutexLock l(&mutex_); - return default_cf_handle_->cfd()->current()->GetStorageInfo()->NumLevelBytes( - 0); + return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, @@ -46,7 +45,7 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( cfd = cfh->cfd(); } MutexLock l(&mutex_); - return cfd->current()->GetStorageInfo()->MaxNextLevelOverlappingBytes(); + return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes(); } void DBImpl::TEST_GetFilesMetaData( @@ -58,7 +57,7 @@ void DBImpl::TEST_GetFilesMetaData( metadata->resize(NumberLevels()); for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = - cfd->current()->GetStorageInfo()->LevelFiles(level); + cfd->current()->storage_info()->LevelFiles(level); (*metadata)[level].clear(); for (const auto& f : files) { diff --git a/db/flush_job.cc b/db/flush_job.cc index fda80cea8..c477a5e8d 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -202,7 +202,7 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, if (base != nullptr && db_options_.max_background_compactions <= 1 && db_options_.max_background_flushes == 0 && cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - level = base->GetStorageInfo()->PickLevelForMemTableOutput( + level = base->storage_info()->PickLevelForMemTableOutput( mutable_cf_options_, min_user_key, max_user_key); } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 88415e5b8..154af1147 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -220,7 +220,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!seek_to_first) { user_key = ExtractUserKey(internal_key); } - VersionStorageInfo* vstorage = sv_->current->GetStorageInfo(); + const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); for (uint32_t i = 0; i < l0.size(); ++i) { if (seek_to_first) { @@ -430,7 +430,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); 
sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); - auto* vstorage = sv_->current->GetStorageInfo(); + const auto* vstorage = sv_->current->storage_info(); const auto& l0_files = vstorage->LevelFiles(0); l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { @@ -454,7 +454,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { } void ForwardIterator::ResetIncompleteIterators() { - const auto& l0_files = sv_->current->GetStorageInfo()->LevelFiles(0); + const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); for (uint32_t i = 0; i < l0_iters_.size(); ++i) { assert(i < l0_files.size()); if (!l0_iters_[i]->status().IsIncomplete()) { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index ca0a8d62c..1440dbe42 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -170,7 +170,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, std::string* value) { assert(value != nullptr); auto* current = cfd_->current(); - auto* vstorage = current->GetStorageInfo(); + const auto* vstorage = current->storage_info(); Slice in = property; switch (property_type) { @@ -230,7 +230,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, bool InternalStats::GetIntProperty(DBPropertyType property_type, uint64_t* value, DBImpl* db) const { - auto* vstorage = cfd_->current()->GetStorageInfo(); + const auto* vstorage = cfd_->current()->storage_info(); switch (property_type) { case kNumImmutableMemTable: @@ -366,7 +366,7 @@ void InternalStats::DumpDBStats(std::string* value) { } void InternalStats::DumpCFStats(std::string* value) { - VersionStorageInfo* vstorage = cfd_->current()->GetStorageInfo(); + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); int num_levels_to_check = (cfd_->options()->compaction_style != kCompactionStyleUniversal && diff --git a/db/version_builder.h b/db/version_builder.h new file mode 100644 index 000000000..f8c91a88c --- /dev/null +++ b/db/version_builder.h @@ -0,0 +1,40 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +#pragma once +#include "rocksdb/env.h" + +namespace rocksdb { + +class TableCache; +class VersionStorageInfo; +class VersionEdit; +class FileMetaData; + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. 
+class VersionBuilder { + public: + VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, + VersionStorageInfo* base_vstorage); + ~VersionBuilder(); + void CheckConsistency(VersionStorageInfo* vstorage); + void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level); + void Apply(VersionEdit* edit); + void SaveTo(VersionStorageInfo* vstorage); + void LoadTableHandlers(); + void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); + + private: + class Rep; + Rep* rep_; +}; +} // namespace rocksdb diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc new file mode 100644 index 000000000..e11f78eb1 --- /dev/null +++ b/db/version_builder_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include "db/version_edit.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class VersionBuilderTest { + public: + const Comparator* ucmp; + InternalKeyComparator icmp; + Options options; + ImmutableCFOptions ioptions; + MutableCFOptions mutable_cf_options; + VersionStorageInfo vstorage; + uint32_t file_num; + CompactionOptionsFIFO fifo_options; + std::vector size_being_compacted; + + VersionBuilderTest() + : ucmp(BytewiseComparator()), + icmp(ucmp), + ioptions(options), + mutable_cf_options(options, ioptions), + vstorage(&icmp, ucmp, options.num_levels, kCompactionStyleLevel, + nullptr), + file_num(1) { + mutable_cf_options.RefreshDerivedOptions(ioptions); + size_being_compacted.resize(options.num_levels); + } + + ~VersionBuilderTest() { + for (int i = 0; i < vstorage.NumberLevels(); i++) { + for (auto* f : vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } + } + + InternalKey GetInternalKey(const char* ukey, + SequenceNumber smallest_seq = 100) { + return InternalKey(ukey, smallest_seq, kTypeValue); + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + assert(level < vstorage.NumberLevels()); + FileMetaData* f = new FileMetaData; + f->fd = FileDescriptor(file_number, path_id, file_size); + f->smallest = GetInternalKey(smallest, smallest_seq); + f->largest = GetInternalKey(largest, largest_seq); + f->compensated_file_size = file_size; + f->refs = 0; + vstorage.MaybeAddFile(level, f); + } + + void UpdateVersionStorageInfo() { + vstorage.ComputeCompactionScore(mutable_cf_options, fifo_options, + size_being_compacted); + vstorage.UpdateFilesBySize(); + vstorage.UpdateNumNonEmptyLevels(); + vstorage.GenerateFileIndexer(); + vstorage.GenerateLevelFilesBrief(); + vstorage.SetFinalized(); + } +}; + +TEST(VersionBuilderTest, ApplyAndSaveTo) { + Add(0, 1U, "150", "200", 100U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 100U); + Add(1, 88U, "201", "300", 100U); + // Level 2 score 1.8. File 7 is the largest. 
Should be picked + Add(2, 6U, "150", "179", 100U); + Add(2, 7U, "180", "220", 100U); + Add(2, 8U, "221", "300", 100U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 100U); + Add(3, 27U, "171", "179", 100U); + Add(3, 28U, "191", "220", 100U); + Add(3, 29U, "221", "300", 100U); + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.DeleteFile(3, 27U); + + EnvOptions env_options; + + VersionBuilder version_builder(env_options, nullptr, &vstorage); + + VersionStorageInfo new_vstorage(&icmp, ucmp, options.num_levels, + kCompactionStyleLevel, nullptr); + version_builder.Apply(&version_edit); + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); + + for (int i = 0; i < new_vstorage.NumberLevels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/version_edit.h b/db/version_edit.h index 3317b11c4..f8e71d2e9 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -217,14 +217,19 @@ class VersionEdit { bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); + typedef std::set> DeletedFileSet; + + const DeletedFileSet& GetDeletedFiles() { return deleted_files_; } + const std::vector>& GetNewFiles() { + return new_files_; + } + std::string DebugString(bool hex_key = false) const; private: friend class VersionSet; friend class Version; - typedef std::set< std::pair> DeletedFileSet; - bool GetLevel(Slice* input, int* level, const char** msg); int max_level_; diff --git a/db/version_set.cc b/db/version_set.cc index 0069ef6b0..a195a7168 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -29,6 +29,7 @@ #include "db/merge_context.h" #include "db/table_cache.h" #include "db/compaction.h" +#include "db/version_builder.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "table/table_reader.h" @@ -315,9 +316,9 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vstorage_.num_levels_; level++) { - for (size_t i = 0; i < vstorage_.files_[level].size(); i++) { - FileMetaData* f = vstorage_.files_[level][i]; + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (size_t i = 0; i < storage_info_.files_[level].size(); i++) { + FileMetaData* f = storage_info_.files_[level][i]; assert(f->refs > 0); f->refs--; if (f->refs <= 0) { @@ -512,6 +513,23 @@ class LevelFileIteratorState : public TwoLevelIteratorState { bool for_compaction_; }; +// A wrapper of version builder which references the current version in +// constructor and unref it in the destructor. 
+class BaseReferencedVersionBuilder { + public: + explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) + : version_builder_(cfd->current()->version_set()->GetEnvOptions(), + cfd->table_cache(), cfd->current()->storage_info()), + version_(cfd->current()) { + version_->Ref(); + } + ~BaseReferencedVersionBuilder() { version_->Unref(); } + VersionBuilder* GetVersionBuilder() { return &version_builder_; } + + private: + VersionBuilder version_builder_; + Version* version_; +}; } // anonymous namespace Status Version::GetTableProperties(std::shared_ptr* tp, @@ -565,8 +583,8 @@ Status Version::GetTableProperties(std::shared_ptr* tp, } Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { - for (int level = 0; level < vstorage_.num_levels_; level++) { - for (const auto& file_meta : vstorage_.files_[level]) { + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.files_[level]) { auto fname = TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); @@ -587,7 +605,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { size_t Version::GetMemoryUsageByTableReaders() { size_t total_usage = 0; - for (auto& file_level : vstorage_.level_files_brief_) { + for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( vset_->env_options_, cfd_->internal_comparator(), @@ -597,7 +615,7 @@ size_t Version::GetMemoryUsageByTableReaders() { return total_usage; } -uint64_t VersionStorageInfo::GetEstimatedActiveKeys() { +uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be not accurate when: // (1) there is merge keys // (2) keys are directly overwritten @@ -620,11 +638,11 @@ uint64_t VersionStorageInfo::GetEstimatedActiveKeys() { void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { - assert(vstorage_.finalized_); + assert(storage_info_.finalized_); // Merge all level zero files together since they may overlap - for (size_t i = 0; i < vstorage_.level_files_brief_[0].num_files; i++) { - const auto& file = vstorage_.level_files_brief_[0].files[i]; + for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { + const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, false, merge_iter_builder->GetArena())); @@ -633,15 +651,15 @@ void Version::AddIterators(const ReadOptions& read_options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. 
- for (int level = 1; level < vstorage_.num_levels_; level++) { - if (vstorage_.level_files_brief_[level].num_files != 0) { + for (int level = 1; level < storage_info_.NumberLevels(); level++) { + if (storage_info_.level_files_brief_[level].num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), - &vstorage_.level_files_brief_[level]), + &storage_info_.LevelFilesBrief(level)), merge_iter_builder->GetArena())); } } @@ -689,14 +707,14 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset, table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), merge_operator_((cfd == nullptr) ? nullptr : cfd->ioptions()->merge_operator), - vstorage_((cfd == nullptr) ? nullptr : &cfd->internal_comparator(), - (cfd == nullptr) ? nullptr : cfd->user_comparator(), - cfd == nullptr ? 0 : cfd->NumberLevels(), - cfd == nullptr ? kCompactionStyleLevel - : cfd->ioptions()->compaction_style, - (cfd == nullptr || cfd->current() == nullptr) - ? nullptr - : cfd->current()->GetStorageInfo()), + storage_info_((cfd == nullptr) ? nullptr : &cfd->internal_comparator(), + (cfd == nullptr) ? nullptr : cfd->user_comparator(), + cfd == nullptr ? 0 : cfd->NumberLevels(), + cfd == nullptr ? kCompactionStyleLevel + : cfd->ioptions()->compaction_style, + (cfd == nullptr || cfd->current() == nullptr) + ? nullptr + : cfd->current()->storage_info()), vset_(vset), next_(this), prev_(this), @@ -715,16 +733,17 @@ void Version::Get(const ReadOptions& read_options, assert(status->ok() || status->IsMergeInProgress()); GetContext get_context( - GetUserComparator(), merge_operator_, info_log_, db_statistics_, + user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? 
GetContext::kNotFound : GetContext::kMerge, user_key, value, value_found, merge_context); - FilePicker fp(vstorage_.files_, user_key, ikey, &vstorage_.level_files_brief_, - vstorage_.num_non_empty_levels_, &vstorage_.file_indexer_, - GetUserComparator(), GetInternalComparator()); + FilePicker fp( + storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_, + storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, + user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { - *status = table_cache_->Get(read_options, *GetInternalComparator(), f->fd, + *status = table_cache_->Get(read_options, *internal_comparator(), f->fd, ikey, &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { @@ -783,13 +802,13 @@ void VersionStorageInfo::GenerateLevelFilesBrief() { void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, std::vector& size_being_compacted) { UpdateAccumulatedStats(); - vstorage_.ComputeCompactionScore(mutable_cf_options, - cfd_->ioptions()->compaction_options_fifo, - size_being_compacted); - vstorage_.UpdateFilesBySize(); - vstorage_.UpdateNumNonEmptyLevels(); - vstorage_.GenerateFileIndexer(); - vstorage_.GenerateLevelFilesBrief(); + storage_info_.ComputeCompactionScore( + mutable_cf_options, cfd_->ioptions()->compaction_options_fifo, + size_being_compacted); + storage_info_.UpdateFilesBySize(); + storage_info_.UpdateNumNonEmptyLevels(); + storage_info_.GenerateFileIndexer(); + storage_info_.GenerateLevelFilesBrief(); } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { @@ -841,11 +860,12 @@ void Version::UpdateAccumulatedStats() { // will be triggered, which creates higher-level files whose num_deletions // will be updated here. for (int level = 0; - level < vstorage_.num_levels_ && init_count < kMaxInitCount; ++level) { - for (auto* file_meta : vstorage_.files_[level]) { + level < storage_info_.num_levels_ && init_count < kMaxInitCount; + ++level) { + for (auto* file_meta : storage_info_.files_[level]) { if (MaybeInitializeFileMetaData(file_meta)) { // each FileMeta will be initialized only once. - vstorage_.UpdateAccumulatedStats(file_meta); + storage_info_.UpdateAccumulatedStats(file_meta); if (++init_count >= kMaxInitCount) { break; } @@ -855,17 +875,17 @@ void Version::UpdateAccumulatedStats() { // In case all sampled-files contain only deletion entries, then we // load the table-property of a file in higher-level to initialize // that value. 
- for (int level = vstorage_.num_levels_ - 1; - vstorage_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { - for (int i = static_cast(vstorage_.files_[level].size()) - 1; - vstorage_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { - if (MaybeInitializeFileMetaData(vstorage_.files_[level][i])) { - vstorage_.UpdateAccumulatedStats(vstorage_.files_[level][i]); + for (int level = storage_info_.num_levels_ - 1; + storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { + for (int i = static_cast(storage_info_.files_[level].size()) - 1; + storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { + if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) { + storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); } } } - vstorage_.ComputeCompensatedSizes(); + storage_info_.ComputeCompensatedSizes(); } void VersionStorageInfo::ComputeCompensatedSizes() { @@ -987,6 +1007,18 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } // anonymous namespace +void VersionStorageInfo::MaybeAddFile(int level, FileMetaData* f) { + assert(level < NumberLevels()); + auto* level_files = &files_[level]; + // Must not overlap + assert(level <= 0 || level_files->empty() || + internal_comparator_->Compare( + (*level_files)[level_files->size() - 1]->largest, f->smallest) < + 0); + f->refs++; + level_files->push_back(f); +} + void VersionStorageInfo::UpdateNumNonEmptyLevels() { num_non_empty_levels_ = num_levels_; for (int i = num_levels_ - 1; i >= 0; i--) { @@ -1379,8 +1411,8 @@ int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { } void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < vstorage_.NumberLevels(); level++) { - const std::vector& files = vstorage_.files_[level]; + for (int level = 0; level < storage_info_.NumberLevels(); level++) { + const std::vector& files = storage_info_.files_[level]; for (const auto& file : files) { live->push_back(file->fd); } @@ -1389,7 +1421,7 @@ void Version::AddLiveFiles(std::vector* live) { std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vstorage_.num_levels_; level++) { + for (int level = 0; level < storage_info_.num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 'd'] @@ -1399,7 +1431,7 @@ std::string Version::DebugString(bool hex) const { r.append(" --- version# "); AppendNumberTo(&r, version_number_); r.append(" ---\n"); - const std::vector& files = vstorage_.files_[level]; + const std::vector& files = storage_info_.files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); AppendNumberTo(&r, files[i]->fd.GetNumber()); @@ -1428,10 +1460,7 @@ struct VersionSet::ManifestWriter { : done(false), cv(mu), cfd(cfd), edit(e) {} }; -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. 
-class VersionSet::Builder { +class VersionBuilder::Rep { private: // Helper to sort files_ in v // kLevel0 -- NewestFirstBySeqNo @@ -1461,30 +1490,33 @@ class VersionSet::Builder { FileSet* added_files; }; - ColumnFamilyData* cfd_; - Version* base_; + const EnvOptions& env_options_; + TableCache* table_cache_; + VersionStorageInfo* base_vstorage_; LevelState* levels_; FileComparator level_zero_cmp_; FileComparator level_nonzero_cmp_; public: - Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) { - base_->Ref(); - levels_ = new LevelState[base_->GetStorageInfo()->NumberLevels()]; + Rep(const EnvOptions& env_options, TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : env_options_(env_options), + table_cache_(table_cache), + base_vstorage_(base_vstorage) { + levels_ = new LevelState[base_vstorage_->NumberLevels()]; level_zero_cmp_.sort_method = FileComparator::kLevel0; level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator(); + level_nonzero_cmp_.internal_comparator = + base_vstorage_->InternalComparator(); levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_->GetStorageInfo()->NumberLevels(); - level++) { + for (int level = 1; level < base_vstorage_->NumberLevels(); level++) { levels_[level].added_files = new FileSet(level_nonzero_cmp_); } } - ~Builder() { - for (int level = 0; level < base_->GetStorageInfo()->NumberLevels(); - level++) { + ~Rep() { + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -1498,7 +1530,8 @@ class VersionSet::Builder { f->refs--; if (f->refs <= 0) { if (f->table_reader_handle) { - cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); + assert(table_cache_ != nullptr); + table_cache_->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } delete f; @@ -1507,17 +1540,16 @@ class VersionSet::Builder { } delete[] levels_; - base_->Unref(); } - void CheckConsistency(Version* v) { + void CheckConsistency(VersionStorageInfo* vstorage) { #ifndef NDEBUG // make sure the files are sorted correctly - auto* files = v->GetFiles(); - for (int level = 0; level < v->GetStorageInfo()->NumberLevels(); level++) { - for (size_t i = 1; i < files[level].size(); i++) { - auto f1 = files[level][i - 1]; - auto f2 = files[level][i]; + for (int level = 0; level < vstorage->NumberLevels(); level++) { + auto& level_files = vstorage->LevelFiles(level); + for (size_t i = 1; i < level_files.size(); i++) { + auto f1 = level_files[i - 1]; + auto f2 = level_files[i]; if (level == 0) { assert(level_zero_cmp_(f1, f2)); assert(f1->largest_seqno > f2->largest_seqno); @@ -1525,8 +1557,8 @@ class VersionSet::Builder { assert(level_nonzero_cmp_(f1, f2)); // Make sure there is no overlap in levels > 0 - if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >= - 0) { + if (vstorage->InternalComparator()->Compare(f1->largest, + f2->smallest) >= 0) { fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", (f1->largest).DebugString().c_str(), (f2->smallest).DebugString().c_str()); @@ -1543,10 +1575,9 @@ class VersionSet::Builder { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - auto* files = base_->GetFiles(); - for (int l = 0; !found && l < base_->GetStorageInfo()->NumberLevels(); - l++) { - const std::vector& base_files = files[l]; + for (int l = 0; !found && l < base_vstorage_->NumberLevels(); l++) { + const std::vector& base_files = + base_vstorage_->LevelFiles(l); for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; if (f->fd.GetNumber() == number) { @@ -1558,8 +1589,8 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level + 1; - !found && l < base_->GetStorageInfo()->NumberLevels(); l++) { + for (int l = level + 1; !found && l < base_vstorage_->NumberLevels(); + l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1592,10 +1623,10 @@ class VersionSet::Builder { // Apply all of the edits in *edit to the current state. void Apply(VersionEdit* edit) { - CheckConsistency(base_); + CheckConsistency(base_vstorage_); // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; + const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); for (const auto& del_file : del) { const auto level = del_file.first; const auto number = del_file.second; @@ -1604,7 +1635,7 @@ class VersionSet::Builder { } // Add new files - for (const auto& new_file : edit->new_files_) { + for (const auto& new_file : edit->GetNewFiles()) { const int level = new_file.first; FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; @@ -1615,77 +1646,88 @@ class VersionSet::Builder { } // Save the current state in *v. - void SaveTo(Version* v) { - CheckConsistency(base_); - CheckConsistency(v); + void SaveTo(VersionStorageInfo* vstorage) { + CheckConsistency(base_vstorage_); + CheckConsistency(vstorage); - auto* out_files = v->GetFiles(); - for (int level = 0; level < base_->GetStorageInfo()->NumberLevels(); - level++) { + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
- const auto& base_files = base_->GetStorageInfo()->LevelFiles(level); + const auto& base_files = base_vstorage_->LevelFiles(level); auto base_iter = base_files.begin(); auto base_end = base_files.end(); const auto& added_files = *levels_[level].added_files; - out_files[level].reserve(base_files.size() + added_files.size()); + vstorage->Reserve(level, base_files.size() + added_files.size()); for (const auto& added : added_files) { // Add all smaller files listed in base_ for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); base_iter != bpos; ++base_iter) { - MaybeAddFile(v, level, *base_iter); + MaybeAddFile(vstorage, level, *base_iter); } - MaybeAddFile(v, level, added); + MaybeAddFile(vstorage, level, added); } // Add remaining base files for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(v, level, *base_iter); + MaybeAddFile(vstorage, level, *base_iter); } } - CheckConsistency(v); + CheckConsistency(vstorage); } void LoadTableHandlers() { - for (int level = 0; level < cfd_->NumberLevels(); level++) { + assert(table_cache_ != nullptr); + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { for (auto& file_meta : *(levels_[level].added_files)) { - assert (!file_meta->table_reader_handle); - cfd_->table_cache()->FindTable( - base_->GetVersionSet()->env_options_, cfd_->internal_comparator(), + assert(!file_meta->table_reader_handle); + table_cache_->FindTable( + env_options_, *(base_vstorage_->InternalComparator()), file_meta->fd, &file_meta->table_reader_handle, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader - file_meta->fd.table_reader = - cfd_->table_cache()->GetTableReaderFromHandle( - file_meta->table_reader_handle); - } + file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( + file_meta->table_reader_handle); } } } + } - void MaybeAddFile(Version* v, int level, FileMetaData* f) { + void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { // File is deleted: do nothing } else { - auto* files = v->GetFiles(); - auto* level_files = &files[level]; - if (level > 0 && !level_files->empty()) { - // Must not overlap - assert(cfd_->internal_comparator().Compare( - (*level_files)[level_files->size() - 1]->largest, - f->smallest) < 0); - } - f->refs++; - level_files->push_back(f); + vstorage->MaybeAddFile(level, f); } } }; +VersionBuilder::VersionBuilder(const EnvOptions& env_options, + TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : rep_(new Rep(env_options, table_cache, base_vstorage)) {} +VersionBuilder::~VersionBuilder() { delete rep_; } +void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { + rep_->CheckConsistency(vstorage); +} +void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, + uint64_t number, int level) { + rep_->CheckConsistencyForDeletes(edit, number, level); +} +void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } +void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { + rep_->SaveTo(vstorage); +} +void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); } +void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, + FileMetaData* f) { + rep_->MaybeAddFile(vstorage, level, f); +} + VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteController* write_controller) @@ -1717,7 +1759,7 @@ VersionSet::~VersionSet() { void 
VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { // Mark v finalized - v->vstorage_.SetFinalized(); + v->storage_info_.SetFinalized(); // Make "v" current assert(v->refs_ == 0); @@ -1773,7 +1815,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, std::vector batch_edits; Version* v = nullptr; - std::unique_ptr builder(nullptr); + std::unique_ptr builder_guard(nullptr); // process all requests in the queue ManifestWriter* last_writer = &w; @@ -1785,7 +1827,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, batch_edits.push_back(edit); } else { v = new Version(column_family_data, this, current_version_number_++); - builder.reset(new Builder(column_family_data)); + builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data)); + auto* builder = builder_guard->GetVersionBuilder(); for (const auto& writer : manifest_writers_) { if (writer->edit->IsColumnFamilyManipulation() || writer->cfd->GetID() != column_family_data->GetID()) { @@ -1794,11 +1837,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } last_writer = writer; - LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit, - mu); + LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu); batch_edits.push_back(last_writer->edit); } - builder->SaveTo(v); + builder->SaveTo(v->storage_info()); } // Initialize new descriptor log file if necessary by creating @@ -1828,7 +1870,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, { std::vector size_being_compacted; if (!edit->IsColumnFamilyManipulation()) { - size_being_compacted.resize(v->GetStorageInfo()->NumberLevels() - 1); + size_being_compacted.resize(v->storage_info()->NumberLevels() - 1); // calculate the amount of data being compacted at every level column_family_data->compaction_picker()->SizeBeingCompacted( size_being_compacted); @@ -1840,7 +1882,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
- builder->LoadTableHandlers(); + builder_guard->GetVersionBuilder()->LoadTableHandlers(); } // This is fine because everything inside of this block is serialized -- @@ -2019,9 +2061,9 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } } -void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder, - Version* v, VersionEdit* edit, - port::Mutex* mu) { +void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, + VersionBuilder* builder, Version* v, + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); @@ -2097,7 +2139,7 @@ Status VersionSet::Recover( uint64_t log_number = 0; uint64_t prev_log_number = 0; uint32_t max_column_family = 0; - std::unordered_map builders; + std::unordered_map builders; // add default column family auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); @@ -2109,7 +2151,7 @@ Status VersionSet::Recover( default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - builders.insert({0, new Builder(default_cfd)}); + builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); { VersionSet::LogReporter reporter; @@ -2155,7 +2197,8 @@ Status VersionSet::Recover( {edit.column_family_, edit.column_family_name_}); } else { cfd = CreateColumnFamily(cf_options->second, &edit); - builders.insert({edit.column_family_, new Builder(cfd)}); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); } } else if (edit.is_column_family_drop_) { if (cf_in_builders) { @@ -2188,8 +2231,7 @@ Status VersionSet::Recover( cfd = column_family_set_->GetColumnFamily(edit.column_family_); // this should never happen since cf_in_builders is true assert(cfd != nullptr); - if (edit.max_level_ >= - cfd->current()->GetStorageInfo()->NumberLevels()) { + if (edit.max_level_ >= cfd->current()->storage_info()->NumberLevels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -2200,7 +2242,7 @@ Status VersionSet::Recover( // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->Apply(&edit); + builder->second->GetVersionBuilder()->Apply(&edit); } if (cfd != nullptr) { @@ -2280,7 +2322,7 @@ Status VersionSet::Recover( for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second; + auto builder = builders_iter->second->GetVersionBuilder(); if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. 
@@ -2289,11 +2331,11 @@ Status VersionSet::Recover( } Version* v = new Version(cfd, this, current_version_number_++); - builder->SaveTo(v); + builder->SaveTo(v->storage_info()); // Install recovered version std::vector size_being_compacted( - v->GetStorageInfo()->NumberLevels() - 1); + v->storage_info()->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); AppendVersion(cfd, v); @@ -2425,7 +2467,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Version* current_version = versions.GetColumnFamilySet()->GetDefault()->current(); - auto* vstorage = current_version->GetStorageInfo(); + auto* vstorage = current_version->storage_info(); int current_levels = vstorage->NumberLevels(); if (current_levels <= new_levels) { @@ -2454,18 +2496,17 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } } - std::vector* old_files_list = vstorage->GetFiles(); // we need to allocate an array with the old number of levels size to // avoid SIGSEGV in WriteSnapshot() // however, all levels bigger or equal to new_levels will be empty std::vector* new_files_list = new std::vector[current_levels]; for (int i = 0; i < new_levels - 1; i++) { - new_files_list[i] = old_files_list[i]; + new_files_list[i] = vstorage->LevelFiles(i); } if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; + new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); } delete[] vstorage -> files_; @@ -2498,7 +2539,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, uint64_t prev_log_number = 0; int count = 0; std::unordered_map comparators; - std::unordered_map builders; + std::unordered_map builders; // add default column family VersionEdit default_cf_edit; @@ -2506,7 +2547,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert({0, new Builder(default_cfd)}); + builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); { VersionSet::LogReporter reporter; @@ -2545,7 +2586,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, break; } cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - builders.insert({edit.column_family_, new Builder(cfd)}); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); } else if (edit.is_column_family_drop_) { if (!cf_in_builders) { s = Status::Corruption( @@ -2577,7 +2619,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->Apply(&edit); + builder->second->GetVersionBuilder()->Apply(&edit); } if (cfd != nullptr && edit.has_log_number_) { @@ -2624,12 +2666,12 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second; + auto builder = builders_iter->second->GetVersionBuilder(); Version* v = new Version(cfd, this, current_version_number_++); - builder->SaveTo(v); + builder->SaveTo(v->storage_info()); std::vector size_being_compacted( - v->GetStorageInfo()->NumberLevels() - 1); + 
v->storage_info()->NumberLevels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); delete builder; @@ -2706,8 +2748,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.SetColumnFamily(cfd->GetID()); for (int level = 0; level < cfd->NumberLevels(); level++) { - auto* files = cfd->current()->GetFiles(); - for (const auto& f : files[level]) { + for (const auto& f : + cfd->current()->storage_info()->LevelFiles(level)) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); @@ -2762,7 +2804,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - auto* vstorage = v->GetStorageInfo(); + const auto* vstorage = v->storage_info(); for (int level = 0; level < vstorage->NumberLevels(); level++) { const std::vector& files = vstorage->LevelFiles(level); for (size_t i = 0; i < files.size(); i++) { @@ -2803,7 +2845,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - auto* vstorage = v->GetStorageInfo(); + const auto* vstorage = v->storage_info(); for (int level = 0; level < vstorage->NumberLevels(); level++) { total_files += vstorage->LevelFiles(level).size(); } @@ -2817,7 +2859,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - auto* vstorage = v->GetStorageInfo(); + const auto* vstorage = v->storage_info(); for (int level = 0; level < vstorage->NumberLevels(); level++) { for (const auto& f : vstorage->LevelFiles(level)) { live_list->push_back(f->fd); @@ -2875,7 +2917,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); - VersionStorageInfo* vstorage = version->GetStorageInfo(); + const VersionStorageInfo* vstorage = version->storage_info(); if (c->input_version() != version) { Log(db_options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", @@ -2927,7 +2969,7 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, ColumnFamilyData** cfd) { for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); - auto* vstorage = version->GetStorageInfo(); + const auto* vstorage = version->storage_info(); for (int level = 0; level < vstorage->NumberLevels(); level++) { for (const auto& file : vstorage->LevelFiles(level)) { if (file->fd.GetNumber() == number) { @@ -2944,9 +2986,9 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (auto cfd : *column_family_set_) { - auto* files = cfd->current()->GetFiles(); for (int level = 0; level < cfd->NumberLevels(); level++) { - for (const auto& file : files[level]) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); diff --git a/db/version_set.h b/db/version_set.h index 98ce172e3..44e6f94b2 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -26,6 
+26,7 @@ #include #include #include "db/dbformat.h" +#include "db/version_builder.h" #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" @@ -91,6 +92,10 @@ class VersionStorageInfo { VersionStorageInfo* src_vstorage); ~VersionStorageInfo(); + void Reserve(int level, size_t size) { files_[level].reserve(size); } + + void MaybeAddFile(int level, FileMetaData* f); + void SetFinalized() { finalized_ = true; } // Update num_non_empty_levels_. @@ -197,7 +202,6 @@ class VersionStorageInfo { // REQUIRES: This version has been saved (see VersionSet::SaveTo) const std::vector& LevelFiles(int level) const { - assert(finalized_); return files_[level]; } @@ -249,8 +253,6 @@ class VersionStorageInfo { // in a specified level. Uses *scratch as backing store. const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; - std::vector* GetFiles() { return files_; } - // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. int64_t MaxNextLevelOverlappingBytes(); @@ -269,7 +271,7 @@ class VersionStorageInfo { (accumulated_raw_key_size_ + accumulated_raw_value_size_); } - uint64_t GetEstimatedActiveKeys(); + uint64_t GetEstimatedActiveKeys() const; // re-initializes the index that is used to offset into files_by_size_ // to find the next compaction candidate file. @@ -277,6 +279,10 @@ class VersionStorageInfo { next_file_to_compact_by_size_[level] = 0; } + const InternalKeyComparator* InternalComparator() { + return internal_comparator_; + } + private: const InternalKeyComparator* internal_comparator_; const Comparator* user_comparator_; @@ -374,8 +380,6 @@ class Version { // and return true. Otherwise, return false. bool Unref(); - std::vector* GetFiles() { return vstorage_.GetFiles(); } - // Add all files listed in the current version to *live. void AddLiveFiles(std::vector* live); @@ -385,10 +389,6 @@ class Version { // Returns the version nuber of this version uint64_t GetVersionNumber() const { return version_number_; } - uint64_t GetAverageValueSize() const { - return vstorage_.GetAverageValueSize(); - } - // REQUIRES: lock is held // On success, "tp" will contains the table properties of the file // specified in "file_meta". If the file name of "file_meta" is @@ -405,7 +405,7 @@ class Version { Status GetPropertiesOfAllTables(TablePropertiesCollection* props); uint64_t GetEstimatedActiveKeys() { - return vstorage_.GetEstimatedActiveKeys(); + return storage_info_.GetEstimatedActiveKeys(); } size_t GetMemoryUsageByTableReaders(); @@ -418,16 +418,18 @@ class Version { return next_; } - VersionStorageInfo* GetStorageInfo() { return &vstorage_; } + VersionStorageInfo* storage_info() { return &storage_info_; } + + VersionSet* version_set() { return vset_; } private: friend class VersionSet; - const InternalKeyComparator* GetInternalComparator() const { - return vstorage_.internal_comparator_; + const InternalKeyComparator* internal_comparator() const { + return storage_info_.internal_comparator_; } - const Comparator* GetUserComparator() const { - return vstorage_.user_comparator_; + const Comparator* user_comparator() const { + return storage_info_.user_comparator_; } bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, @@ -446,15 +448,13 @@ class Version { // record results in files_by_size_. The largest files are listed first. 
void UpdateFilesBySize(); - VersionSet* GetVersionSet() { return vset_; } - ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs Logger* info_log_; Statistics* db_statistics_; TableCache* table_cache_; const MergeOperator* merge_operator_; - VersionStorageInfo vstorage_; + VersionStorageInfo storage_info_; VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list @@ -602,9 +602,9 @@ class VersionSet { void GetObsoleteFiles(std::vector* files); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + const EnvOptions& GetEnvOptions() { return env_options_; } private: - class Builder; struct ManifestWriter; friend class Version; @@ -664,7 +664,7 @@ class VersionSet { void operator=(const VersionSet&); void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v, + void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, VersionEdit* edit, port::Mutex* mu); }; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 3ff31359b..618c10a35 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1125,7 +1125,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int max = -1; auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); for (int i = 0; i < default_cfd->NumberLevels(); i++) { - if (default_cfd->current()->GetStorageInfo()->NumLevelFiles(i)) { + if (default_cfd->current()->storage_info()->NumLevelFiles(i)) { max = i; } } diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 455b312fa..3a417de2b 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -104,7 +104,7 @@ Status CompactedDBImpl::Init(const Options& options) { } version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); - auto* vstorage = version_->GetStorageInfo(); + auto* vstorage = version_->storage_info(); const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); // L0 should not have files if (l0.num_files > 1) { From 98849a35fa2701f44acef70e1a4cd6c508306591 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 31 Oct 2014 11:41:15 -0700 Subject: [PATCH 374/829] Apply InfoLogLevel to the logs in table/block_based_table_reader.cc Summary: Apply InfoLogLevel to the logs in table/block_based_table_reader.cc Also, add missing checks for the returned status in BlockBasedTable::Open Test Plan: make Reviewers: sdong, ljin, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28005 --- table/block_based_table_reader.cc | 35 ++++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index c973b755e..1fe1a7d02 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -438,7 +438,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, Footer footer(kBlockBasedTableMagicNumber); auto s = ReadFooterFromFile(file.get(), file_size, &footer); - if (!s.ok()) return s; + if (!s.ok()) { + return s; + } // We've successfully read the footer and the index block: we're // ready to serve requests. 
@@ -455,12 +457,19 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, std::unique_ptr meta; std::unique_ptr meta_iter; s = ReadMetaBlock(rep, &meta, &meta_iter); + if (!s.ok()) { + return s; + } // Read the properties bool found_properties_block = true; s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); - if (found_properties_block) { + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Cannot seek to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { s = meta_iter->status(); TableProperties* table_properties = nullptr; if (s.ok()) { @@ -470,15 +479,14 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } if (!s.ok()) { - auto err_msg = - "[Warning] Encountered error while reading data from properties " - "block " + s.ToString(); - Log(rep->ioptions.info_log, "%s", err_msg.c_str()); + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", s.ToString().c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->ioptions.info_log, + Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, "Cannot find Properties block from file."); } @@ -573,13 +581,10 @@ Status BlockBasedTable::ReadMetaBlock( &meta, rep->ioptions.env); - if (!s.ok()) { - auto err_msg = - "[Warning] Encountered error while reading data from properties" - "block " + s.ToString(); - Log(rep->ioptions.info_log, "%s", err_msg.c_str()); - } if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, + "Encountered error while reading data from properties" + " block %s", s.ToString().c_str()); delete meta; return s; } @@ -1219,7 +1224,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (index_type_on_file == BlockBasedTableOptions::kHashSearch && rep_->ioptions.prefix_extractor == nullptr) { - Log(rep_->ioptions.info_log, + Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1240,7 +1245,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->ioptions.info_log, + Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, "Unable to read the metaindex block." " Fall back to binary seach index."); return BinarySearchIndexReader::Create( From 9f7fc3ac45fad83fa49a93a67ef9651ac4401d02 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 31 Oct 2014 11:59:54 -0700 Subject: [PATCH 375/829] Turn on -Wshadow Summary: ...and fix all the errors :) Jim suggested turning on -Wshadow because it helped him fix number of critical bugs in fbcode. I think it's a good idea to be -Wshadow clean. 
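As a quick illustration (a hypothetical sketch, not code from this diff), the kind of bug -Wshadow surfaces is an inner declaration silently hiding an outer one. That is exactly the pattern fixed below in db_iter_test.cc (inner loop indexes renamed to k) and db_impl.cc (constructor parameters renamed with a leading underscore):

  // Minimal self-contained example; compile with: g++ -Wall -Wshadow shadow.cc
  #include <cstdio>
  #include <vector>

  int main() {
    std::vector<int> files = {1, 2, 3};
    for (std::size_t i = 0; i < files.size(); ++i) {
      for (std::size_t i = 0; i < 2; ++i) {  // warning: declaration shadows a previous local 'i'
        std::printf("inner %zu\n", i);       // only the inner 'i' is visible here
      }
    }
    return 0;
  }

Renaming the inner variable (for example to k) keeps both indexes distinct and silences the warning, which is the approach taken throughout this change.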
Test Plan: compiles Reviewers: yhchiang, rven, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27711 --- Makefile | 2 +- db/builder.cc | 12 +++--- db/c_test.c | 4 -- db/column_family.cc | 3 +- db/compaction_picker.cc | 36 +++++++++-------- db/db_impl.cc | 14 +++---- db/db_iter_test.cc | 23 +++++------ db/db_test.cc | 33 +++++++-------- db/file_indexer.cc | 2 +- db/internal_stats.h | 4 +- db/merge_test.cc | 25 ++++++------ db/prefix_test.cc | 3 +- db/skiplist_test.cc | 2 +- db/version_builder.h | 2 +- db/version_edit.cc | 8 ++-- db/version_edit.h | 12 ++---- db/version_set.cc | 4 +- db/wal_manager.cc | 10 ++--- port/port_posix.h | 8 ++-- table/block_based_table_builder.cc | 25 ++++++------ table/block_based_table_reader.cc | 23 +++++------ table/block_hash_index.h | 4 +- table/block_prefix_index.cc | 4 +- table/cuckoo_table_builder.cc | 11 +++-- table/cuckoo_table_reader_test.cc | 4 +- table/plain_table_reader.cc | 6 +-- table/table_test.cc | 39 +++++++++--------- table/two_level_iterator.h | 4 +- util/blob_store.h | 4 +- util/cache.cc | 2 +- util/rate_limiter.cc | 4 +- util/rate_limiter_test.cc | 18 ++++----- util/thread_local_test.cc | 40 +++++++++---------- utilities/geodb/geodb_impl.cc | 4 +- .../write_batch_with_index.cc | 4 +- 35 files changed, 198 insertions(+), 205 deletions(-) diff --git a/Makefile b/Makefile index 5ed8a5a67..0beadda85 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual diff --git a/db/builder.cc b/db/builder.cc index 2c5094370..5d3273e78 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -72,11 +72,13 @@ Status BuildTable(const std::string& dbname, Env* env, ioptions, internal_comparator, file.get(), compression, compression_opts); - // the first key is the smallest key - Slice key = iter->key(); - meta->smallest.DecodeFrom(key); - meta->smallest_seqno = GetInternalKeySeqno(key); - meta->largest_seqno = meta->smallest_seqno; + { + // the first key is the smallest key + Slice key = iter->key(); + meta->smallest.DecodeFrom(key); + meta->smallest_seqno = GetInternalKeySeqno(key); + meta->largest_seqno = meta->smallest_seqno; + } MergeHelper merge(internal_comparator.user_comparator(), ioptions.merge_operator, ioptions.info_log, diff --git a/db/c_test.c b/db/c_test.c index 171fd6d5c..d693f52ca 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -576,7 +576,6 @@ int main(int argc, char** argv) { StartPhase("compaction_filter"); { - rocksdb_options_t* options = rocksdb_options_create(); rocksdb_options_set_create_if_missing(options, 1); rocksdb_compactionfilter_t* cfilter; cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy, @@ -589,12 +588,10 @@ int main(int argc, char** argv) { rocksdb_options_set_compaction_filter(options, NULL); rocksdb_compactionfilter_destroy(cfilter); - rocksdb_options_destroy(options); } StartPhase("compaction_filter_factory"); { - rocksdb_options_t* options = rocksdb_options_create(); rocksdb_options_set_create_if_missing(options, 1); rocksdb_compactionfilterfactory_t* factory; factory = rocksdb_compactionfilterfactory_create( @@ -606,7 +603,6 @@ int main(int argc, char** argv) { db = CheckCompaction(db, options, roptions, woptions); rocksdb_options_set_compaction_filter_factory(options, NULL); - rocksdb_options_destroy(options); } StartPhase("compaction_filter_v2"); diff --git a/db/column_family.cc b/db/column_family.cc index b7497ecfe..c5c4e35e5 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -263,8 +263,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, "Options for column family \"%s\":\n", name.c_str()); - const ColumnFamilyOptions* cf_options = &options_; - cf_options->Dump(ioptions_.info_log); + options_.Dump(ioptions_.info_log); } RecalculateWriteStallConditions(mutable_cf_options_); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 676f39b7d..096f0d77d 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -697,8 +697,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // Check if the suceeding files need compaction. for (unsigned int i = loop + 1; candidate_count < max_files_to_compact && i < files.size(); i++) { - FileMetaData* f = files[i]; - if (f->being_compacted) { + FileMetaData* suceeding_file = files[i]; + if (suceeding_file->being_compacted) { break; } // Pick files if the total/last candidate file size (increased by the @@ -708,14 +708,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. 
double sz = candidate_size * (100.0 + ratio) / 100.0; - if (sz < static_cast(f->fd.GetFileSize())) { + if (sz < static_cast(suceeding_file->fd.GetFileSize())) { break; } if (ioptions_.compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. - sz = (f->fd.GetFileSize() * (100.0 + ratio)) / 100.0; + sz = (suceeding_file->fd.GetFileSize() * (100.0 + ratio)) / 100.0; if (sz < static_cast(candidate_size)) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer @@ -723,9 +723,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // by the last-resort read amp strategy which disregards size ratios. break; } - candidate_size = f->compensated_file_size; - } else { // default kCompactionStopStyleTotalSize - candidate_size += f->compensated_file_size; + candidate_size = suceeding_file->compensated_file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += suceeding_file->compensated_file_size; } candidate_count++; } @@ -738,12 +738,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } else { for (unsigned int i = loop; i < loop + candidate_count && i < files.size(); i++) { - FileMetaData* f = files[i]; + FileMetaData* skipping_file = files[i]; LogToBuffer(log_buffer, "[%s] Universal: Skipping file %" PRIu64 "[%d] with size %" PRIu64 " (compensated size %" PRIu64 ") %d\n", - cf_name.c_str(), f->fd.GetNumber(), i, f->fd.GetFileSize(), - f->compensated_file_size, f->being_compacted); + cf_name.c_str(), f->fd.GetNumber(), i, + skipping_file->fd.GetFileSize(), + skipping_file->compensated_file_size, + skipping_file->being_compacted); } } } @@ -782,16 +784,17 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { - FileMetaData* f = files[i]; - c->inputs_[0].files.push_back(f); + FileMetaData* picking_file = files[i]; + c->inputs_[0].files.push_back(picking_file); char file_num_buf[kFormatFileNumberBufSize]; - FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, - sizeof(file_num_buf)); + FormatFileNumber(picking_file->fd.GetNumber(), picking_file->fd.GetPathId(), + file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: Picking file %s[%d] " "with size %" PRIu64 " (compensated size %" PRIu64 ")\n", - cf_name.c_str(), file_num_buf, i, f->fd.GetFileSize(), - f->compensated_file_size); + cf_name.c_str(), file_num_buf, i, + picking_file->fd.GetFileSize(), + picking_file->compensated_file_size); } return c; } @@ -850,7 +853,6 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( for (unsigned int loop = start_index; loop < files.size() - 1; loop++) { f = files[loop]; if (f->being_compacted) { - char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer( diff --git a/db/db_impl.cc b/db/db_impl.cc index 231325cc3..5b2635d1a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2953,8 +2953,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, namespace { struct IterState { - IterState(DBImpl* db, port::Mutex* mu, SuperVersion* super_version) - : db(db), mu(mu), super_version(super_version) {} + IterState(DBImpl* _db, port::Mutex* _mu, SuperVersion* _super_version) + : db(_db), 
mu(_mu), super_version(_super_version) {} DBImpl* db; port::Mutex* mu; @@ -3812,14 +3812,14 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, log_.reset(new_log); log_empty_ = true; alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { // all this is just optimization to delete logs that // are no longer needed -- if CF is empty, that means it // doesn't need that particular log to stay alive, so we just // advance the log number. no need to persist this in the manifest - if (cfd->mem()->GetFirstSequenceNumber() == 0 && - cfd->imm()->size() == 0) { - cfd->SetLogNumber(logfile_number_); + if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && + loop_cfd->imm()->size() == 0) { + loop_cfd->SetLogNumber(logfile_number_); } } } @@ -4398,8 +4398,6 @@ Status DestroyDB(const std::string& dbname, const Options& options) { for (auto& db_path : options.db_paths) { env->GetChildren(db_path.path, &filenames); - uint64_t number; - FileType type; for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { // Lock file will be deleted at end diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 2aa30e327..e6b96c410 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -217,7 +217,6 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddPut("a", "val_a"); internal_iter->AddPut("b", "val_b"); @@ -254,7 +253,6 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddPut("a", "val_a"); internal_iter->AddPut("a", "val_a"); @@ -364,8 +362,8 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("c", std::to_string(k)); } internal_iter->Finish(); @@ -400,7 +398,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { + for (size_t k = 0; k < 200; ++k) { internal_iter->AddDeletion("c"); } internal_iter->AddPut("c", "200"); @@ -463,7 +461,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { { for (size_t i = 0; i < 200; ++i) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - for (size_t i = 0; i < 200; ++i) { + for (size_t k = 0; k < 200; ++k) { internal_iter->AddDeletion("c"); } internal_iter->AddPut("c", "200"); @@ -511,12 +509,12 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("d", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("d", std::to_string(k)); } - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("c", std::to_string(k)); } internal_iter->Finish(); @@ -550,8 +548,8 @@ TEST(DBIteratorTest, 
DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "b"); internal_iter->AddMerge("a", "a"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddMerge("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddMerge("c", std::to_string(k)); } internal_iter->Finish(); @@ -1390,7 +1388,6 @@ TEST(DBIteratorTest, DBIterator) { } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddDeletion("a"); internal_iter->AddPut("a", "0"); diff --git a/db/db_test.cc b/db/db_test.cc index 59b611c65..62c5e483b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4544,16 +4544,18 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); - // Scan the entire database to ensure that nothing is left - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - count = 0; - while (iter->Valid()) { - count++; - iter->Next(); + { + // Scan the entire database to ensure that nothing is left + std::unique_ptr iter( + db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - ASSERT_EQ(count, 0); - delete iter; // The sequence number of the remaining record // is not zeroed out even though it is at the @@ -5014,7 +5016,7 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) { for (int i = 1; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "%08d%010d", i, i); - std::string newvalue = Get(key); + newvalue = Get(key); ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); } } @@ -5623,7 +5625,7 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ("0,0,1", FilesPerLevel(1)); if (iter == 0) { - Options options = CurrentOptions(); + options = CurrentOptions(); options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; @@ -7591,7 +7593,6 @@ void PrefixScanInit(DBTest *dbtest) { // GROUP 2 for (int i = 1; i <= big_range_sstfiles; i++) { - std::string keystr; snprintf(buf, sizeof(buf), "%02d______:start", 0); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); @@ -8877,7 +8878,7 @@ TEST(DBTest, PartialCompactionFailure) { DestroyAndReopen(options); - const int kNumKeys = + const int kNumInsertedKeys = options.level0_file_num_compaction_trigger * (options.max_write_buffer_number - 1) * kKeysPerBuffer; @@ -8885,7 +8886,7 @@ TEST(DBTest, PartialCompactionFailure) { Random rnd(301); std::vector keys; std::vector values; - for (int k = 0; k < kNumKeys; ++k) { + for (int k = 0; k < kNumInsertedKeys; ++k) { keys.emplace_back(RandomString(&rnd, kKeySize)); values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); @@ -8914,7 +8915,7 @@ TEST(DBTest, PartialCompactionFailure) { ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); // All key-values must exist after compaction fails. - for (int k = 0; k < kNumKeys; ++k) { + for (int k = 0; k < kNumInsertedKeys; ++k) { ASSERT_EQ(values[k], Get(keys[k])); } @@ -8924,7 +8925,7 @@ TEST(DBTest, PartialCompactionFailure) { Reopen(options); // Verify again after reopen. 
- for (int k = 0; k < kNumKeys; ++k) { + for (int k = 0; k < kNumInsertedKeys; ++k) { ASSERT_EQ(values[k], Get(keys[k])); } } diff --git a/db/file_indexer.cc b/db/file_indexer.cc index ca2ef9bc8..8c0ca043e 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -100,7 +100,7 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, } IndexLevel& index_level = next_level_index_[level]; index_level.num_index = upper_size; - char* mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); + mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); index_level.index_units = new (mem) IndexUnit[upper_size]; CalculateLB( diff --git a/db/internal_stats.h b/db/internal_stats.h index 5caa33415..0c3ee6db7 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -136,7 +136,7 @@ class InternalStats { // Number of compactions done int count; - explicit CompactionStats(int count = 0) + explicit CompactionStats(int _count = 0) : micros(0), bytes_readn(0), bytes_readnp1(0), @@ -146,7 +146,7 @@ class InternalStats { files_out_levelnp1(0), num_input_records(0), num_dropped_records(0), - count(count) {} + count(_count) {} explicit CompactionStats(const CompactionStats& c) : micros(c.micros), diff --git a/db/merge_test.cc b/db/merge_test.cc index 7e71ccf86..249e96ad7 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -407,14 +407,6 @@ void testSingleBatchSuccessiveMerge( } void runTest(int argc, const string& dbname, const bool use_ttl = false) { - auto db = OpenDb(dbname, use_ttl); - - { - cout << "Test read-modify-write counters... \n"; - Counters counters(db, 0); - testCounters(counters, db.get(), true); - } - bool compact = false; if (argc > 1) { compact = true; @@ -422,13 +414,22 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } { - cout << "Test merge-based counters... \n"; - MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), compact); + auto db = OpenDb(dbname, use_ttl); + + { + cout << "Test read-modify-write counters... \n"; + Counters counters(db, 0); + testCounters(counters, db.get(), true); + } + + { + cout << "Test merge-based counters... \n"; + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + } } DestroyDB(dbname, Options()); - db.reset(); { cout << "Test merge in memtable... 
\n"; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index a69dda2b4..c896ab8d8 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -52,7 +52,8 @@ struct TestKey { uint64_t prefix; uint64_t sorted; - TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {} + TestKey(uint64_t _prefix, uint64_t _sorted) + : prefix(_prefix), sorted(_sorted) {} }; // return a slice backed by test_key diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index 48323b244..010616cc0 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -357,7 +357,7 @@ static void RunConcurrent(int run) { TestState state(seed + 1); Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { + for (int k = 0; k < kSize; k++) { state.t_.WriteStep(&rnd); } state.quit_flag_.store(true, std::memory_order_release); diff --git a/db/version_builder.h b/db/version_builder.h index f8c91a88c..caeb34970 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -15,7 +15,7 @@ namespace rocksdb { class TableCache; class VersionStorageInfo; class VersionEdit; -class FileMetaData; +struct FileMetaData; // A helper class so we can efficiently apply a whole sequence // of edits to a particular state without creating intermediate diff --git a/db/version_edit.cc b/db/version_edit.cc index 4a6506c7d..f7b288870 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -168,7 +168,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) { // Temporary storage for parsing int level; - uint64_t number; FileMetaData f; Slice str; InternalKey key; @@ -237,9 +236,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; - case kDeletedFile: - if (GetLevel(&input, &level, &msg) && - GetVarint64(&input, &number)) { + case kDeletedFile: { + uint64_t number; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) { deleted_files_.insert(std::make_pair(level, number)); } else { if (!msg) { @@ -247,6 +246,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } } break; + } case kNewFile: { uint64_t number; diff --git a/db/version_edit.h b/db/version_edit.h index f8e71d2e9..0a8bbf257 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -39,10 +39,10 @@ struct FileDescriptor { FileDescriptor() : FileDescriptor(0, 0, 0) {} - FileDescriptor(uint64_t number, uint32_t path_id, uint64_t file_size) + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) : table_reader(nullptr), packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), - file_size(file_size) {} + file_size(_file_size) {} FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; @@ -110,12 +110,8 @@ struct FdWithKeyRange { largest_key() { } - FdWithKeyRange(FileDescriptor fd, - Slice smallest_key, Slice largest_key) - : fd(fd), - smallest_key(smallest_key), - largest_key(largest_key) { - } + FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key) + : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {} }; // Data structure to store an array of FdWithKeyRange in one level diff --git a/db/version_set.cc b/db/version_set.cc index a195a7168..fc37460b8 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1455,9 +1455,9 @@ struct VersionSet::ManifestWriter { ColumnFamilyData* cfd; VersionEdit* edit; - explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd, + explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* _cfd, VersionEdit* e) - : done(false), cv(mu), cfd(cfd), edit(e) 
{} + : done(false), cv(mu), cfd(_cfd), edit(e) {} }; class VersionBuilder::Rep { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index c08b3b220..9b86a0f97 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -166,14 +166,14 @@ void WalManager::PurgeObsoleteWALFiles() { std::string const file_path = archival_dir + "/" + f; if (ttl_enabled) { uint64_t file_m_time; - Status const s = env_->GetFileModificationTime(file_path, &file_m_time); + s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { Log(db_options_.info_log, "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { - Status const s = env_->DeleteFile(file_path); + s = env_->DeleteFile(file_path); if (!s.ok()) { Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -188,7 +188,7 @@ void WalManager::PurgeObsoleteWALFiles() { if (size_limit_enabled) { uint64_t file_size; - Status const s = env_->GetFileSize(file_path, &file_size); + s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { Log(db_options_.info_log, "Can't get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -198,7 +198,7 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - Status s = env_->DeleteFile(file_path); + s = env_->DeleteFile(file_path); if (!s.ok()) { Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -236,7 +236,7 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); - Status const s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); + s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); if (!s.ok()) { Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); diff --git a/port/port_posix.h b/port/port_posix.h index dae8f7219..ceb6d0aa1 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -212,7 +212,7 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, int old_sz =0, new_sz =0, new_sz_delta =0; bool done = false; while (!done) { - int st = deflate(&_stream, Z_FINISH); + st = deflate(&_stream, Z_FINISH); switch (st) { case Z_STREAM_END: done = true; @@ -274,7 +274,7 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, //while(_stream.next_in != nullptr && _stream.avail_in != 0) { while (!done) { - int st = inflate(&_stream, Z_SYNC_FLUSH); + st = inflate(&_stream, Z_SYNC_FLUSH); switch (st) { case Z_STREAM_END: done = true; @@ -337,7 +337,7 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, int old_sz =0, new_sz =0; while(_stream.next_in != nullptr && _stream.avail_in != 0) { - int st = BZ2_bzCompress(&_stream, BZ_FINISH); + st = BZ2_bzCompress(&_stream, BZ_FINISH); switch (st) { case BZ_STREAM_END: break; @@ -390,7 +390,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, char* tmp = nullptr; while(_stream.next_in != nullptr && _stream.avail_in != 0) { - int st = BZ2_bzDecompress(&_stream); + st = BZ2_bzDecompress(&_stream); switch (st) { case BZ_STREAM_END: break; diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index c053e7e4f..f158ca8c4 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -430,25 +430,26 @@ struct 
BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const ImmutableCFOptions& ioptions, + Rep(const ImmutableCFOptions& _ioptions, const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - WritableFile* f, const CompressionType compression_type, - const CompressionOptions& compression_opts) - : ioptions(ioptions), + const InternalKeyComparator& icomparator, WritableFile* f, + const CompressionType _compression_type, + const CompressionOptions& _compression_opts) + : ioptions(_ioptions), table_options(table_opt), internal_comparator(icomparator), file(f), data_block(table_options.block_restart_interval), - internal_prefix_transform(ioptions.prefix_extractor), - index_builder(CreateIndexBuilder( - table_options.index_type, &internal_comparator, - &this->internal_prefix_transform)), - compression_type(compression_type), - filter_block(CreateFilterBlockBuilder(ioptions, table_options)), + internal_prefix_transform(_ioptions.prefix_extractor), + index_builder(CreateIndexBuilder(table_options.index_type, + &internal_comparator, + &this->internal_prefix_transform)), + compression_type(_compression_type), + compression_opts(_compression_opts), + filter_block(CreateFilterBlockBuilder(_ioptions, table_options)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( - table_options, data_block)) { + table_options, data_block)) { for (auto& collector_factories : ioptions.table_properties_collector_factories) { table_properties_collectors.emplace_back( diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1fe1a7d02..5cb35834a 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -316,13 +316,14 @@ class HashIndexReader : public IndexReader { struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& internal_comparator) - : ioptions(ioptions), env_options(env_options), table_options(table_opt), - filter_policy(table_opt.filter_policy.get()), - internal_comparator(internal_comparator) {} + Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + const BlockBasedTableOptions& _table_opt, + const InternalKeyComparator& _internal_comparator) + : ioptions(_ioptions), + env_options(_env_options), + table_options(_table_opt), + filter_policy(_table_opt.filter_policy.get()), + internal_comparator(_internal_comparator) {} const ImmutableCFOptions& ioptions; const EnvOptions& env_options; @@ -364,11 +365,9 @@ BlockBasedTable::~BlockBasedTable() { // was not read from cache, `cache_handle` will be nullptr. template struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* value, Cache::Handle* cache_handle) - : value(value) - , cache_handle(cache_handle) { - } - CachableEntry(): CachableEntry(nullptr, nullptr) { } + CachableEntry(TValue* _value, Cache::Handle* _cache_handle) + : value(_value), cache_handle(_cache_handle) {} + CachableEntry() : CachableEntry(nullptr, nullptr) {} void Release(Cache* cache) { if (cache_handle) { cache->Release(cache_handle); diff --git a/table/block_hash_index.h b/table/block_hash_index.h index d5603d366..582910796 100644 --- a/table/block_hash_index.h +++ b/table/block_hash_index.h @@ -25,8 +25,8 @@ class BlockHashIndex { public: // Represents a restart index in the index block's restart array. 
struct RestartIndex { - explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1) - : first_index(first_index), num_blocks(num_blocks) {} + explicit RestartIndex(uint32_t _first_index, uint32_t _num_blocks = 1) + : first_index(_first_index), num_blocks(_num_blocks) {} // For a given prefix, what is the restart index for the first data block // that contains it. diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index d64b73b98..c1c9d520e 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -143,8 +143,8 @@ class BlockPrefixIndex::Builder { auto current = prefixes_per_bucket[i]; // populate block ids from largest to smallest while (current != nullptr) { - for (uint32_t i = 0; i < current->num_blocks; i++) { - *last_block = current->end_block - i; + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; last_block--; } current = current->next; diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 6ff1fa0cf..a11945cf7 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -436,8 +436,8 @@ bool CuckooTableBuilder::MakeSpaceForKey( uint64_t bucket_id; uint32_t depth; uint32_t parent_pos; - CuckooNode(uint64_t bucket_id, uint32_t depth, int parent_pos) - : bucket_id(bucket_id), depth(depth), parent_pos(parent_pos) {} + CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos) + : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {} }; // This is BFS search tree that is stored simply as a vector. // Each node stores the index of parent node in the vector. @@ -451,10 +451,9 @@ bool CuckooTableBuilder::MakeSpaceForKey( // It is unlikely for the increment operation to overflow because the maximum // no. of times this will be called is <= max_num_hash_func_ + num_entries_. 
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { - uint64_t bucket_id = hash_vals[hash_cnt]; - (*buckets)[bucket_id].make_space_for_key_call_id = - make_space_for_key_call_id; - tree.push_back(CuckooNode(bucket_id, 0, 0)); + uint64_t bid = hash_vals[hash_cnt]; + (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id; + tree.push_back(CuckooNode(bid, 0, 0)); } bool null_found = false; uint32_t curr_pos = 0; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 66d88fc71..7bd18f536 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -72,8 +72,8 @@ class CuckooReaderTest { env_options = EnvOptions(options); } - void SetUp(int num_items) { - this->num_items = num_items; + void SetUp(int num) { + num_items = num; hash_map.clear(); keys.clear(); keys.resize(num_items); diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index db37241a9..16120d32b 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -358,12 +358,12 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, std::vector prefix_hashes; if (!index_in_file) { - Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes); + s = PopulateIndexRecordList(&index_builder, &prefix_hashes); if (!s.ok()) { return s; } } else { - Status s = index_.InitFromRawData(*index_block); + s = index_.InitFromRawData(*index_block); if (!s.ok()) { return s; } @@ -566,7 +566,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, ioptions_.prefix_extractor); while (offset < data_end_offset_) { - Status s = Next(&decoder, &offset, &found_key, nullptr, &found_value); + s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { return s; } diff --git a/table/table_test.cc b/table/table_test.cc index 362905eea..5f34e92eb 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1416,31 +1416,32 @@ class BlockCachePropertiesSnapshot { filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); } - void AssertIndexBlockStat(int64_t index_block_cache_miss, - int64_t index_block_cache_hit) { - ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); - ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + void AssertIndexBlockStat(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit) { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); } - void AssertFilterBlockStat(int64_t filter_block_cache_miss, - int64_t filter_block_cache_hit) { - ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss); - ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit); + void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss, + int64_t expected_filter_block_cache_hit) { + ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss); + ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit); } // Check if the fetched props matches the expected ones. // TODO(kailiu) Use this only when you disabled filter policy! 
- void AssertEqual(int64_t index_block_cache_miss, - int64_t index_block_cache_hit, int64_t data_block_cache_miss, - int64_t data_block_cache_hit) const { - ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); - ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); - ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); - ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); - ASSERT_EQ(index_block_cache_miss + data_block_cache_miss, - this->block_cache_miss); - ASSERT_EQ(index_block_cache_hit + data_block_cache_hit, - this->block_cache_hit); + void AssertEqual(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit, + int64_t expected_data_block_cache_miss, + int64_t expected_data_block_cache_hit) const { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss); + ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit); + ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss, + block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit, + block_cache_hit); } private: diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index d955dd763..030193597 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -19,8 +19,8 @@ class InternalKeyComparator; class Arena; struct TwoLevelIteratorState { - explicit TwoLevelIteratorState(bool check_prefix_may_match) - : check_prefix_may_match(check_prefix_may_match) {} + explicit TwoLevelIteratorState(bool _check_prefix_may_match) + : check_prefix_may_match(_check_prefix_may_match) {} virtual ~TwoLevelIteratorState() {} virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0; diff --git a/util/blob_store.h b/util/blob_store.h index ce8633740..917fb947e 100644 --- a/util/blob_store.h +++ b/util/blob_store.h @@ -25,8 +25,8 @@ struct BlobChunk { uint32_t offset; // in blocks uint32_t size; // in blocks BlobChunk() {} - BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) : - bucket_id(bucket_id), offset(offset), size(size) {} + BlobChunk(uint32_t _bucket_id, uint32_t _offset, uint32_t _size) + : bucket_id(_bucket_id), offset(_offset), size(_size) {} // returns true if it's immediately before chunk bool ImmediatelyBefore(const BlobChunk& chunk) const; diff --git a/util/cache.cc b/util/cache.cc index f1c48a829..850fdb537 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -325,7 +325,7 @@ Cache::Handle* LRUCache::Insert( // Free the space following strict LRU policy until enough space // is freed. 
while (usage_ > capacity_ && lru_.next != &lru_) { - LRUHandle* old = lru_.next; + old = lru_.next; LRU_Remove(old); table_.Remove(old->key(), old->hash); if (Unref(old)) { diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 47f96de84..3da2627e7 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -15,8 +15,8 @@ namespace rocksdb { // Pending request struct GenericRateLimiter::Req { - explicit Req(int64_t bytes, port::Mutex* mu) : - bytes(bytes), cv(mu), granted(false) {} + explicit Req(int64_t _bytes, port::Mutex* _mu) + : bytes(_bytes), cv(_mu), granted(false) {} int64_t bytes; port::CondVar cv; bool granted; diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 9d6cfb7e6..cdeca578d 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -30,23 +30,23 @@ TEST(RateLimiterTest, StartStop) { TEST(RateLimiterTest, Rate) { auto* env = Env::Default(); struct Arg { - Arg(int64_t target_rate, int burst) - : limiter(new GenericRateLimiter(target_rate, 100 * 1000, 10)), - request_size(target_rate / 10), - burst(burst) {} + Arg(int64_t _target_rate, int _burst) + : limiter(new GenericRateLimiter(_target_rate, 100 * 1000, 10)), + request_size(_target_rate / 10), + burst(_burst) {} std::unique_ptr limiter; int64_t request_size; int burst; }; auto writer = [](void* p) { - auto* env = Env::Default(); + auto* thread_env = Env::Default(); auto* arg = static_cast(p); // Test for 2 seconds - auto until = env->NowMicros() + 2 * 1000000; - Random r((uint32_t)(env->NowNanos() % - std::numeric_limits::max())); - while (env->NowMicros() < until) { + auto until = thread_env->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_env->NowNanos() % + std::numeric_limits::max())); + while (thread_env->NowMicros() < until) { for (int i = 0; i < static_cast(r.Skewed(arg->burst) + 1); ++i) { arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_HIGH); diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 70dfa956e..155ef243c 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -24,11 +24,11 @@ class ThreadLocalTest { namespace { struct Params { - Params(port::Mutex* m, port::CondVar* c, int* unref, int n, + Params(port::Mutex* m, port::CondVar* c, int* u, int n, UnrefHandler handler = nullptr) : mu(m), cv(c), - unref(unref), + unref(u), total(n), started(0), completed(0), @@ -112,24 +112,24 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) { p.tls2 = &tls2; auto func = [](void* ptr) { - auto& p = *static_cast(ptr); - - ASSERT_TRUE(p.tls1.Get() == nullptr); - p.tls1.Reset(reinterpret_cast(1)); - ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(1)); - p.tls1.Reset(reinterpret_cast(2)); - ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(2)); - - ASSERT_TRUE(p.tls2->Get() == nullptr); - p.tls2->Reset(reinterpret_cast(1)); - ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(1)); - p.tls2->Reset(reinterpret_cast(2)); - ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(2)); - - p.mu->Lock(); - ++(p.completed); - p.cv->SignalAll(); - p.mu->Unlock(); + auto& params = *static_cast(ptr); + + ASSERT_TRUE(params.tls1.Get() == nullptr); + params.tls1.Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(1)); + params.tls1.Reset(reinterpret_cast(2)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(2)); + + ASSERT_TRUE(params.tls2->Get() == nullptr); + params.tls2->Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(1)); + params.tls2->Reset(reinterpret_cast(2)); + 
ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(2)); + + params.mu->Lock(); + ++(params.completed); + params.cv->SignalAll(); + params.mu->Unlock(); }; for (int iter = 0; iter < 1024; ++iter) { diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index 6c13fd691..fc387b4ca 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -192,8 +192,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos, // we are looking for. auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin()); if (res.first == qid.end()) { - GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str())); - GeoObject obj(pos, parts[4], iter->value().ToString()); + GeoPosition obj_pos(atof(parts[3].c_str()), atof(parts[4].c_str())); + GeoObject obj(obj_pos, parts[4], iter->value().ToString()); values->push_back(obj); number_of_values--; } else { diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 4ba063a06..ff9d89f2f 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -437,11 +437,11 @@ class WBWIIteratorImpl : public WBWIIterator { struct WriteBatchWithIndex::Rep { Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, - bool overwrite_key = false) + bool _overwrite_key = false) : write_batch(reserved_bytes), comparator(index_comparator, &write_batch), skip_list(comparator, &arena), - overwrite_key(overwrite_key), + overwrite_key(_overwrite_key), last_entry_offset(0) {} ReadableWriteBatch write_batch; WriteBatchEntryComparator comparator; From 7e01d120267da015fbed5073d9805be06cd74a51 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Fri, 31 Oct 2014 12:02:14 -0700 Subject: [PATCH 376/829] Add support for in place update for db_stress Summary: Added two flags which operate as follows: in_place_update: enable in_place_update for default column family set_in_place_one_in: toggles the value of the option inplace_update_support with a probability of 1/N Test Plan: Run db_stress with the two flags above set. Specifically tried in_place_update set to true and set_in_place_one_in set to 10,000. 
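As a sketch of how the new options are meant to be exercised (the two flag names are the ones added in this patch; the binary path and leaving every other option at its default are illustrative assumptions):

  ./db_stress --in_place_update=true --set_in_place_one_in=10000

With --set_in_place_one_in=N (N > 0), roughly one in N operations takes the inplace_update_support toggle path described above, while --in_place_update=true enables in-place updates for the default column family from the start.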
Reviewers: ljin, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28029 --- tools/db_stress.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 2cdf241bb..d2bdec7e0 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -194,6 +194,9 @@ DEFINE_int32(clear_column_family_one_in, 1000000, DEFINE_int32(set_options_one_in, 0, "With a chance of 1/N, change some random options"); +DEFINE_int32(set_in_place_one_in, 0, + "With a chance of 1/N, toggle in place support option"); + DEFINE_int64(cache_size, 2 * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); @@ -341,6 +344,8 @@ static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) = DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" " the delete if key not present"); +DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable"); + enum RepFactory { kSkipList, kHashSkipList, @@ -1362,6 +1367,11 @@ class StressTest { SetOptions(thread); } + if (FLAGS_set_in_place_one_in > 0 && + thread->rand.OneIn(FLAGS_set_in_place_one_in)) { + options_.inplace_update_support ^= options_.inplace_update_support; + } + if (!FLAGS_test_batches_snapshots && FLAGS_clear_column_family_one_in != 0 && FLAGS_column_families > 1) { if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) { @@ -1682,6 +1692,8 @@ class StressTest { FLAGS_purge_redundant_percent); fprintf(stdout, "Deletes use filter : %d\n", FLAGS_filter_deletes); + fprintf(stdout, "Do update in place : %d\n", + FLAGS_in_place_update); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -1765,6 +1777,7 @@ class StressTest { options_.create_if_missing = true; options_.max_manifest_file_size = 10 * 1024; options_.filter_deletes = FLAGS_filter_deletes; + options_.inplace_update_support = FLAGS_in_place_update; if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { fprintf(stderr, "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); From c1a924b9f0e4386bd0cdcd19b99cd4671bcbd6c0 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Fri, 31 Oct 2014 12:08:43 -0700 Subject: [PATCH 377/829] Move convenience.h to /include Summary: Move header file so it can be referenced externally. Test Plan: Rebuild. Reviewers: ljin Reviewed By: ljin Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28095 --- {utilities/options => include/utilities}/convenience.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {utilities/options => include/utilities}/convenience.h (100%) diff --git a/utilities/options/convenience.h b/include/utilities/convenience.h similarity index 100% rename from utilities/options/convenience.h rename to include/utilities/convenience.h From 29d83cc33a30e5e65a9ff6165697709f334d1ad7 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 12:20:39 -0700 Subject: [PATCH 378/829] temporarily remove -Wshadow --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0beadda85..5ed8a5a67 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow +WARNING_FLAGS = -Wall -Werror -Wsign-compare CFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual From b452dede5cab2930a88a1633f80a023dc5362501 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 12:23:00 -0700 Subject: [PATCH 379/829] fix Summary: Test Plan: Reviewers: CC: Task ID: # Blame Rev: --- util/options_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options_test.cc b/util/options_test.cc index f1258b8ed..e0b8fc908 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -17,7 +17,7 @@ #include "rocksdb/options.h" #include "util/testharness.h" -#include "utilities/options/convenience.h" +#include "rocksdb/utilities/options/convenience.h" using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); From c76dcb44d4cca2ad756214837af0b3016aa8d936 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 13:28:07 -0700 Subject: [PATCH 380/829] fix Summary: Test Plan: Reviewers: CC: Task ID: # Blame Rev: --- util/options_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options_test.cc b/util/options_test.cc index e0b8fc908..341240130 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -17,7 +17,7 @@ #include "rocksdb/options.h" #include "util/testharness.h" -#include "rocksdb/utilities/options/convenience.h" +#include "utilities/convenience.h" using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); From f7e6c856ab3fa3c37c5cd3f8ba2a7cf59f037385 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 31 Oct 2014 12:16:35 -0700 Subject: [PATCH 381/829] Fix BaseReferencedVersionBuilder's destructor order Summary: BaseReferencedVersionBuilder now unreference version before destructing VersionBuilder, which is wrong. Fix it. Test Plan: make all check valgrind test to tests that used to fail Reviewers: igor, yhchiang, rven, ljin Reviewed By: ljin Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28101 --- db/version_set.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index fc37460b8..3c2c0d42e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -515,19 +515,24 @@ class LevelFileIteratorState : public TwoLevelIteratorState { // A wrapper of version builder which references the current version in // constructor and unref it in the destructor. +// Both of the constructor and destructor need to be called inside DB Mutex. 
class BaseReferencedVersionBuilder { public: explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) - : version_builder_(cfd->current()->version_set()->GetEnvOptions(), - cfd->table_cache(), cfd->current()->storage_info()), + : version_builder_(new VersionBuilder( + cfd->current()->version_set()->GetEnvOptions(), cfd->table_cache(), + cfd->current()->storage_info())), version_(cfd->current()) { version_->Ref(); } - ~BaseReferencedVersionBuilder() { version_->Unref(); } - VersionBuilder* GetVersionBuilder() { return &version_builder_; } + ~BaseReferencedVersionBuilder() { + delete version_builder_; + version_->Unref(); + } + VersionBuilder* GetVersionBuilder() { return version_builder_; } private: - VersionBuilder version_builder_; + VersionBuilder* version_builder_; Version* version_; }; } // anonymous namespace @@ -2322,7 +2327,7 @@ Status VersionSet::Recover( for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second->GetVersionBuilder(); + auto* builder = builders_iter->second->GetVersionBuilder(); if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. From c645250ee09e44995557f9d9b50b7a9e4d0f73f6 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 31 Oct 2014 13:51:16 -0700 Subject: [PATCH 382/829] CompactionStats to support larger value of RecordIn and RecordDrop Summary: now we use %8d for RecordIn and %10d for RecordDrop, which is far too small for some use cases. Extend both of them to %12d. Test Plan: run one test in db_test and see the LOG file. Reviewers: igor, MarkCallaghan, ljin Reviewed By: ljin Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28041 --- db/internal_stats.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 1440dbe42..6aaf6b2c4 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -30,7 +30,7 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) " "Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) " - "Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n" + "Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n" "--------------------------------------------------------------------" "--------------------------------------------------------------------" "--------------------------------------------------------------------\n", @@ -65,9 +65,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%8.3f " /* Avg(sec) */ "%10.2f " /* Stall(sec) */ "%10" PRIu64 " " /* Stall(cnt) */ - "%7.2f" /* Avg(ms) */ - "%8d " /* input entries */ - "%10d\n" /* number of records reduced */, + "%7.2f " /* Avg(ms) */ + "%12d " /* input entries */ + "%12d\n" /* number of records reduced */, name.c_str(), num_files, being_compacted, total_file_size / kMB, score, bytes_read / kGB, stats.bytes_readn / kGB, From bc9f36fd5e5f0eae69a5a1b7269bb2623cc0eb1f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 31 Oct 2014 15:01:39 -0700 Subject: [PATCH 383/829] Fix lint errors and coding style of ldb related codes. Summary: Fix lint errors and coding style of ldb related codes. 
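The bulk of the change is mechanical: the blanket using-directive for std is dropped in favor of explicit std:: qualification, and output parameters move from non-const references to pointers. A condensed sketch of the ParseIntOption signature change in this diff (the map's <std::string, std::string> template arguments are not visible in the diff text and are assumed here):

  // before: unqualified std types, out-parameters by non-const reference
  bool ParseIntOption(const map<string, string>& options,
                      const string& option, int& value,
                      LDBCommandExecuteResult& exec_state);

  // after: explicit std:: qualification, out-parameters by pointer
  bool ParseIntOption(const std::map<std::string, std::string>& options,
                      const std::string& option, int* value,
                      LDBCommandExecuteResult* exec_state);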
Test Plan: ./ldb Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28125 --- tools/sst_dump.cc | 4 +- util/ldb_cmd.cc | 904 ++++++++++++++++++---------------- util/ldb_cmd.h | 465 ++++++++--------- util/ldb_cmd_execute_result.h | 6 +- util/ldb_tool.cc | 36 +- 5 files changed, 741 insertions(+), 674 deletions(-) diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 6c496e8dd..e9fdf1c3f 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -275,8 +275,8 @@ static void print_help() { } namespace { -string HexToString(const string& str) { - string parsed; +std::string HexToString(const std::string& str) { + std::string parsed; if (str[0] != '0' || str[1] != 'x') { fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str()); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 618c10a35..c03c1b31a 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -4,6 +4,20 @@ // of patent rights can be found in the PATENTS file in the same directory. // #ifndef ROCKSDB_LITE +#include +#include +#include +#include +#include +#include +#include + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + #include "util/ldb_cmd.h" #include "db/dbformat.h" @@ -17,46 +31,36 @@ #include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" -#include -#include -#include -#include -#include -#include - namespace rocksdb { -using namespace std; - -const string LDBCommand::ARG_DB = "db"; -const string LDBCommand::ARG_HEX = "hex"; -const string LDBCommand::ARG_KEY_HEX = "key_hex"; -const string LDBCommand::ARG_VALUE_HEX = "value_hex"; -const string LDBCommand::ARG_TTL = "ttl"; -const string LDBCommand::ARG_TTL_START = "start_time"; -const string LDBCommand::ARG_TTL_END = "end_time"; -const string LDBCommand::ARG_TIMESTAMP = "timestamp"; -const string LDBCommand::ARG_FROM = "from"; -const string LDBCommand::ARG_TO = "to"; -const string LDBCommand::ARG_MAX_KEYS = "max_keys"; -const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; -const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; -const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; -const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; -const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; -const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; -const string LDBCommand::ARG_FILE_SIZE = "file_size"; -const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; +const std::string LDBCommand::ARG_DB = "db"; +const std::string LDBCommand::ARG_HEX = "hex"; +const std::string LDBCommand::ARG_KEY_HEX = "key_hex"; +const std::string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const std::string LDBCommand::ARG_TTL = "ttl"; +const std::string LDBCommand::ARG_TTL_START = "start_time"; +const std::string LDBCommand::ARG_TTL_END = "end_time"; +const std::string LDBCommand::ARG_TIMESTAMP = "timestamp"; +const std::string LDBCommand::ARG_FROM = "from"; +const std::string LDBCommand::ARG_TO = "to"; +const std::string LDBCommand::ARG_MAX_KEYS = "max_keys"; +const std::string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const std::string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; +const std::string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; +const std::string LDBCommand::ARG_BLOCK_SIZE = "block_size"; +const std::string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const std::string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; +const std::string 
LDBCommand::ARG_FILE_SIZE = "file_size"; +const std::string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; const char* LDBCommand::DELIM = " ==> "; LDBCommand* LDBCommand::InitFromCmdLineArgs( - int argc, - char** argv, - const Options& options, - const LDBOptions& ldb_options -) { - vector args; + int argc, + char** argv, + const Options& options, + const LDBOptions& ldb_options) { + std::vector args; for (int i = 1; i < argc; i++) { args.push_back(argv[i]); } @@ -67,37 +71,37 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( * Parse the command-line arguments and create the appropriate LDBCommand2 * instance. * The command line arguments must be in the following format: - * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] .. - * COMMAND ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] .. + * ./ldb --db = PATH_TO_DB [--commonOpt1 = commonOpt1Val] .. + * COMMAND ... + * [-cmdSpecificOpt1 = cmdSpecificOpt1Val] .. * This is similar to the command line format used by HBaseClientTool. * Command name is not included in args. * Returns nullptr if the command-line cannot be parsed. */ LDBCommand* LDBCommand::InitFromCmdLineArgs( - const vector& args, - const Options& options, - const LDBOptions& ldb_options -) { - // --x=y command line arguments are added as x->y map entries. - map option_map; + const std::vector& args, + const Options& options, + const LDBOptions& ldb_options) { + // --x = y command line arguments are added as x->y std::map entries. + std::map option_map; // Command-line arguments of the form --hex end up in this array as hex - vector flags; + std::vector flags; // Everything other than option_map and flags. Represents commands - // and their parameters. For eg: put key1 value1 go into this vector. - vector cmdTokens; + // and their parameters. For eg: put key1 value1 go into this std::vector. + std::vector cmdTokens; - const string OPTION_PREFIX = "--"; + const std::string OPTION_PREFIX = "--"; for (const auto& arg : args) { - if (arg[0] == '-' && arg[1] == '-'){ - vector splits = stringSplit(arg, '='); + if (arg[0] == '-' && arg[1] == '-') { + std::vector splits = stringSplit(arg, '='); if (splits.size() == 2) { - string optionKey = splits[0].substr(OPTION_PREFIX.size()); + std::string optionKey = splits[0].substr(OPTION_PREFIX.size()); option_map[optionKey] = splits[1]; } else { - string optionKey = splits[0].substr(OPTION_PREFIX.size()); + std::string optionKey = splits[0].substr(OPTION_PREFIX.size()); flags.push_back(optionKey); } } else { @@ -110,14 +114,10 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( return nullptr; } - string cmd = cmdTokens[0]; - vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); + std::string cmd = cmdTokens[0]; + std::vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); LDBCommand* command = LDBCommand::SelectCommand( - cmd, - cmdParams, - option_map, - flags - ); + cmd, cmdParams, option_map, flags); if (command) { command->SetDBOptions(options); @@ -128,11 +128,9 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::SelectCommand( const std::string& cmd, - const vector& cmdParams, - const map& option_map, - const vector& flags - ) { - + const std::vector& cmdParams, + const std::map& option_map, + const std::vector& flags) { if (cmd == GetCommand::Name()) { return new GetCommand(cmdParams, option_map, flags); } else if (cmd == PutCommand::Name()) { @@ -179,21 +177,21 @@ LDBCommand* LDBCommand::SelectCommand( * value. If there is an error, the specified exec_state is also * updated. 
*/ -bool LDBCommand::ParseIntOption(const map& options, - const string& option, int& value, - LDBCommandExecuteResult& exec_state) { - - map::const_iterator itr = option_map_.find(option); +bool LDBCommand::ParseIntOption( + const std::map& options, + const std::string& option, int* value, + LDBCommandExecuteResult* exec_state) { + auto itr = option_map_.find(option); if (itr != option_map_.end()) { try { - value = stoi(itr->second); + *value = stoi(itr->second); return true; - } catch(const invalid_argument&) { - exec_state = LDBCommandExecuteResult::FAILED(option + - " has an invalid value."); - } catch(const out_of_range&) { - exec_state = LDBCommandExecuteResult::FAILED(option + - " has a value out-of-range."); + } catch(const std::invalid_argument&) { + *exec_state = LDBCommandExecuteResult::FAILED( + option + " has an invalid value."); + } catch(const std::out_of_range&) { + *exec_state = LDBCommandExecuteResult::FAILED( + option + " has a value out-of-range."); } } return false; @@ -204,8 +202,9 @@ bool LDBCommand::ParseIntOption(const map& options, * Returns true if the option is found. * Returns false otherwise. */ -bool LDBCommand::ParseStringOption(const map& options, - const string& option, string* value) { +bool LDBCommand::ParseStringOption( + const std::map& options, + const std::string& option, std::string* value) { auto itr = option_map_.find(option); if (itr != option_map_.end()) { *value = itr->second; @@ -219,12 +218,12 @@ Options LDBCommand::PrepareOptionsForOpenDB() { Options opt = options_; opt.create_if_missing = false; - map::const_iterator itr; + std::map::const_iterator itr; BlockBasedTableOptions table_options; bool use_table_options = false; int bits; - if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { + if (ParseIntOption(option_map_, ARG_BLOOM_BITS, &bits, &exec_state_)) { if (bits > 0) { use_table_options = true; table_options.filter_policy.reset(NewBloomFilterPolicy(bits)); @@ -235,7 +234,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int block_size; - if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { + if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, &block_size, &exec_state_)) { if (block_size > 0) { use_table_options = true; table_options.block_size = block_size; @@ -256,7 +255,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { itr = option_map_.find(ARG_COMPRESSION_TYPE); if (itr != option_map_.end()) { - string comp = itr->second; + std::string comp = itr->second; if (comp == "no") { opt.compression = kNoCompression; } else if (comp == "snappy") { @@ -277,8 +276,8 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int write_buffer_size; - if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, - exec_state_)) { + if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, + &write_buffer_size, &exec_state_)) { if (write_buffer_size > 0) { opt.write_buffer_size = write_buffer_size; } else { @@ -288,7 +287,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int file_size; - if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { + if (ParseIntOption(option_map_, ARG_FILE_SIZE, &file_size, &exec_state_)) { if (file_size > 0) { opt.target_file_size_base = file_size; } else { @@ -302,13 +301,13 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int fix_prefix_len; - if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len, - exec_state_)) { + if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, + &fix_prefix_len, &exec_state_)) { if (fix_prefix_len > 0) { 
opt.prefix_extractor.reset( NewFixedPrefixTransform(static_cast(fix_prefix_len))); } else { - exec_state_ = + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FIX_PREFIX_LEN + " must be > 0."); } } @@ -316,10 +315,11 @@ Options LDBCommand::PrepareOptionsForOpenDB() { return opt; } -bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, - bool is_key_hex, bool is_value_hex) { +bool LDBCommand::ParseKeyValue( + const std::string& line, std::string* key, std::string* value, + bool is_key_hex, bool is_value_hex) { size_t pos = line.find(DELIM); - if (pos != string::npos) { + if (pos != std::string::npos) { *key = line.substr(0, pos); *value = line.substr(pos + strlen(DELIM)); if (is_key_hex) { @@ -343,20 +343,20 @@ bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, */ bool LDBCommand::ValidateCmdLineOptions() { - for (map::const_iterator itr = option_map_.begin(); - itr != option_map_.end(); ++itr) { + for (auto itr = option_map_.begin(); + itr != option_map_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), - valid_cmd_line_options_.end(), itr->first) == + valid_cmd_line_options_.end(), itr->first) == valid_cmd_line_options_.end()) { fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); return false; } } - for (vector::const_iterator itr = flags_.begin(); + for (std::vector::const_iterator itr = flags_.begin(); itr != flags_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), - valid_cmd_line_options_.end(), *itr) == + valid_cmd_line_options_.end(), *itr) == valid_cmd_line_options_.end()) { fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); return false; @@ -371,14 +371,15 @@ bool LDBCommand::ValidateCmdLineOptions() { return true; } -CompactorCommand::CompactorCommand(const vector& params, - const map& options, const vector& flags) : +CompactorCommand::CompactorCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_TTL})), null_from_(true), null_to_(true) { - - map::const_iterator itr = options.find(ARG_FROM); + auto itr = options.find(ARG_FROM); if (itr != options.end()) { null_from_ = false; from_ = itr->second; @@ -400,11 +401,11 @@ CompactorCommand::CompactorCommand(const vector& params, } } -void CompactorCommand::Help(string& ret) { - ret.append(" "); - ret.append(CompactorCommand::Name()); - ret.append(HelpRangeCmdArgs()); - ret.append("\n"); +void CompactorCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(CompactorCommand::Name()); + ret->append(HelpRangeCmdArgs()); + ret->append("\n"); } void CompactorCommand::DoCommand() { @@ -425,12 +426,14 @@ void CompactorCommand::DoCommand() { delete end; } -const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; -const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; -const string DBLoaderCommand::ARG_COMPACT = "compact"; +const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; +const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; +const std::string DBLoaderCommand::ARG_COMPACT = "compact"; -DBLoaderCommand::DBLoaderCommand(const vector& params, - const map& options, const vector& flags) : +DBLoaderCommand::DBLoaderCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING, @@ 
-445,14 +448,14 @@ DBLoaderCommand::DBLoaderCommand(const vector& params, compact_ = IsFlagPresent(flags, ARG_COMPACT); } -void DBLoaderCommand::Help(string& ret) { - ret.append(" "); - ret.append(DBLoaderCommand::Name()); - ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); - ret.append(" [--" + ARG_DISABLE_WAL + "]"); - ret.append(" [--" + ARG_BULK_LOAD + "]"); - ret.append(" [--" + ARG_COMPACT + "]"); - ret.append("\n"); +void DBLoaderCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(DBLoaderCommand::Name()); + ret->append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret->append(" [--" + ARG_DISABLE_WAL + "]"); + ret->append(" [--" + ARG_BULK_LOAD + "]"); + ret->append(" [--" + ARG_COMPACT + "]"); + ret->append("\n"); } Options DBLoaderCommand::PrepareOptionsForOpenDB() { @@ -475,10 +478,10 @@ void DBLoaderCommand::DoCommand() { } int bad_lines = 0; - string line; - while (getline(cin, line, '\n')) { - string key; - string value; + std::string line; + while (getline(std::cin, line, '\n')) { + std::string key; + std::string value; if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { db_->Put(write_options, Slice(key), Slice(value)); } else if (0 == line.find("Keys in range:")) { @@ -491,7 +494,7 @@ void DBLoaderCommand::DoCommand() { } if (bad_lines > 0) { - cout << "Warning: " << bad_lines << " bad lines ignored." << endl; + std::cout << "Warning: " << bad_lines << " bad lines ignored." << std::endl; } if (compact_) { db_->CompactRange(nullptr, nullptr); @@ -500,27 +503,28 @@ void DBLoaderCommand::DoCommand() { // ---------------------------------------------------------------------------- -const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; -const string ManifestDumpCommand::ARG_PATH = "path"; +const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const std::string ManifestDumpCommand::ARG_PATH = "path"; -void ManifestDumpCommand::Help(string& ret) { - ret.append(" "); - ret.append(ManifestDumpCommand::Name()); - ret.append(" [--" + ARG_VERBOSE + "]"); - ret.append(" [--" + ARG_PATH + "=]"); - ret.append("\n"); +void ManifestDumpCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ManifestDumpCommand::Name()); + ret->append(" [--" + ARG_VERBOSE + "]"); + ret->append(" [--" + ARG_PATH + " = ]"); + ret->append("\n"); } -ManifestDumpCommand::ManifestDumpCommand(const vector& params, - const map& options, const vector& flags) : +ManifestDumpCommand::ManifestDumpCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), verbose_(false), - path_("") -{ + path_("") { verbose_ = IsFlagPresent(flags, ARG_VERBOSE); - map::const_iterator itr = options.find(ARG_PATH); + auto itr = options.find(ARG_PATH); if (itr != options.end()) { path_ = itr->second; if (path_.empty()) { @@ -597,16 +601,17 @@ void ManifestDumpCommand::DoCommand() { // ---------------------------------------------------------------------------- -void ListColumnFamiliesCommand::Help(string& ret) { - ret.append(" "); - ret.append(ListColumnFamiliesCommand::Name()); - ret.append(" full_path_to_db_directory "); - ret.append("\n"); +void ListColumnFamiliesCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ListColumnFamiliesCommand::Name()); + ret->append(" full_path_to_db_directory "); + ret->append("\n"); } ListColumnFamiliesCommand::ListColumnFamiliesCommand( - const vector& params, const map& options, - const vector& flags) + 
const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, {}) { if (params.size() != 1) { @@ -618,7 +623,7 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand( } void ListColumnFamiliesCommand::DoCommand() { - vector column_families; + std::vector column_families; Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families); if (!s.ok()) { printf("Error in processing db %s %s\n", dbname_.c_str(), @@ -641,54 +646,56 @@ void ListColumnFamiliesCommand::DoCommand() { namespace { -string ReadableTime(int unixtime) { +std::string ReadableTime(int unixtime) { char time_buffer [80]; time_t rawtime = unixtime; struct tm * timeinfo = localtime(&rawtime); strftime(time_buffer, 80, "%c", timeinfo); - return string(time_buffer); + return std::string(time_buffer); } // This function only called when it's the sane case of >1 buckets in time-range // Also called only when timekv falls between ttl_start and ttl_end provided -void IncBucketCounts(vector& bucket_counts, int ttl_start, +void IncBucketCounts(std::vector* bucket_counts, int ttl_start, int time_range, int bucket_size, int timekv, int num_buckets) { assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && timekv < (ttl_start + time_range) && num_buckets > 1); int bucket = (timekv - ttl_start) / bucket_size; - bucket_counts[bucket]++; + (*bucket_counts)[bucket]++; } -void PrintBucketCounts(const vector& bucket_counts, int ttl_start, - int ttl_end, int bucket_size, int num_buckets) { +void PrintBucketCounts( + const std::vector& bucket_counts, int ttl_start, + int ttl_end, int bucket_size, int num_buckets) { int time_point = ttl_start; - for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { - fprintf(stdout, "Keys in range %s to %s : %lu\n", + for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n", ReadableTime(time_point).c_str(), ReadableTime(time_point + bucket_size).c_str(), - (unsigned long)bucket_counts[i]); + bucket_counts[i]); } - fprintf(stdout, "Keys in range %s to %s : %lu\n", + fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n", ReadableTime(time_point).c_str(), ReadableTime(ttl_end).c_str(), - (unsigned long)bucket_counts[num_buckets - 1]); + bucket_counts[num_buckets - 1]); } } // namespace -const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; -const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; -const string InternalDumpCommand::ARG_STATS = "stats"; -const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; +const std::string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const std::string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const std::string InternalDumpCommand::ARG_STATS = "stats"; +const std::string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; -InternalDumpCommand::InternalDumpCommand(const vector& params, - const map& options, - const vector& flags) : +InternalDumpCommand::InternalDumpCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, true, - BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, - ARG_FROM, ARG_TO, ARG_MAX_KEYS, - ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, - ARG_INPUT_KEY_HEX})), + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX})), has_from_(false), has_to_(false), 
max_keys_(-1), @@ -701,15 +708,14 @@ InternalDumpCommand::InternalDumpCommand(const vector& params, has_from_ = ParseStringOption(options, ARG_FROM, &from_); has_to_ = ParseStringOption(options, ARG_TO, &to_); - ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); - map::const_iterator itr = options.find(ARG_COUNT_DELIM); + ParseIntOption(options, ARG_MAX_KEYS, &max_keys_, &exec_state_); + auto itr = options.find(ARG_COUNT_DELIM); if (itr != options.end()) { delim_ = itr->second; count_delim_ = true; - // fprintf(stdout,"delim = %c\n",delim_[0]); } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_="."; + delim_ = "."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -726,16 +732,16 @@ InternalDumpCommand::InternalDumpCommand(const vector& params, } } -void InternalDumpCommand::Help(string& ret) { - ret.append(" "); - ret.append(InternalDumpCommand::Name()); - ret.append(HelpRangeCmdArgs()); - ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); - ret.append(" [--" + ARG_MAX_KEYS + "=]"); - ret.append(" [--" + ARG_COUNT_ONLY + "]"); - ret.append(" [--" + ARG_COUNT_DELIM + "=]"); - ret.append(" [--" + ARG_STATS + "]"); - ret.append("\n"); +void InternalDumpCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(InternalDumpCommand::Name()); + ret->append(HelpRangeCmdArgs()); + ret->append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret->append(" [--" + ARG_MAX_KEYS + " = ]"); + ret->append(" [--" + ARG_COUNT_ONLY + "]"); + ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); + ret->append(" [--" + ARG_STATS + "]"); + ret->append("\n"); } void InternalDumpCommand::DoCommand() { @@ -744,7 +750,7 @@ void InternalDumpCommand::DoCommand() { } if (print_stats_) { - string stats; + std::string stats; if (db_->GetProperty("rocksdb.stats", &stats)) { fprintf(stdout, "%s\n", stats.c_str()); } @@ -756,10 +762,10 @@ void InternalDumpCommand::DoCommand() { exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); return; } - string rtype1,rtype2,row,val; + std::string rtype1, rtype2, row, val; rtype2 = ""; - uint64_t c=0; - uint64_t s1=0,s2=0; + uint64_t c = 0; + uint64_t s1 = 0, s2 = 0; // Setup internal key iterator Arena arena; ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); @@ -776,7 +782,7 @@ void InternalDumpCommand::DoCommand() { iter->SeekToFirst(); } - long long count = 0; + uint64_t count = 0; for (; iter->Valid(); iter->Next()) { ParsedInternalKey ikey; if (!ParseInternalKey(iter->key(), &ikey)) { @@ -795,59 +801,69 @@ void InternalDumpCommand::DoCommand() { int k; if (count_delim_) { rtype1 = ""; - s1=0; + s1 = 0; row = iter->key().ToString(); val = iter->value().ToString(); - for(k=0;row[k]!='\x01' && row[k]!='\0';k++) + for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) { s1++; - for(k=0;val[k]!='\x01' && val[k]!='\0';k++) + } + for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) { s1++; - for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) - rtype1+=row[j]; - if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { - fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), - (long long)c,(long long)s2); - c=1; - s2=s1; + } + for (int j = 0; + row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01'; + j++) { + rtype1+= row[j]; + } + if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { + fprintf(stdout, "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", + rtype2.c_str(), c, s2); + c = 1; + s2 = s1; rtype2 = rtype1; } else { c++; - s2+=s1; - rtype2=rtype1; + s2+= s1; + rtype2 = rtype1; } } if (!count_only_ && 
!count_delim_) { - string key = ikey.DebugString(is_key_hex_); - string value = iter->value().ToString(is_value_hex_); - std::cout << key << " => " << value << "\n"; + std::string key = ikey.DebugString(is_key_hex_); + std::string value = iter->value().ToString(is_value_hex_); + std::cout << key << " = > " << value << "\n"; } // Terminate if maximum number of keys have been dumped - if (max_keys_ > 0 && count >= max_keys_) break; + if (max_keys_ > 0 && count >= static_cast(max_keys_)) { + break; + } } - if(count_delim_) { - fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(), - (long long)c,(long long)s2); + if (count_delim_) { + fprintf(stdout, + "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", + rtype2.c_str(), c, s2); } else - fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); + fprintf(stdout, "Internal keys in range: %" PRIu64 "\n", count); } -const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; -const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; -const string DBDumperCommand::ARG_STATS = "stats"; -const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; +const std::string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; +const std::string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; +const std::string DBDumperCommand::ARG_STATS = "stats"; +const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; -DBDumperCommand::DBDumperCommand(const vector& params, - const map& options, const vector& flags) : +DBDumperCommand::DBDumperCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, true, - BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, - ARG_VALUE_HEX, ARG_FROM, ARG_TO, - ARG_MAX_KEYS, ARG_COUNT_ONLY, - ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, - ARG_TTL_END, ARG_TTL_BUCKET, - ARG_TIMESTAMP})), + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, + ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, + ARG_TTL_END, ARG_TTL_BUCKET, + ARG_TIMESTAMP})), null_from_(true), null_to_(true), max_keys_(-1), @@ -855,7 +871,7 @@ DBDumperCommand::DBDumperCommand(const vector& params, count_delim_(false), print_stats_(false) { - map::const_iterator itr = options.find(ARG_FROM); + auto itr = options.find(ARG_FROM); if (itr != options.end()) { null_from_ = false; from_ = itr->second; @@ -871,10 +887,10 @@ DBDumperCommand::DBDumperCommand(const vector& params, if (itr != options.end()) { try { max_keys_ = stoi(itr->second); - } catch(const invalid_argument&) { + } catch(const std::invalid_argument&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has an invalid value"); - } catch(const out_of_range&) { + } catch(const std::out_of_range&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has a value out-of-range"); } @@ -885,7 +901,7 @@ DBDumperCommand::DBDumperCommand(const vector& params, count_delim_ = true; } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_="."; + delim_ = "."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -901,20 +917,20 @@ DBDumperCommand::DBDumperCommand(const vector& params, } } -void DBDumperCommand::Help(string& ret) { - ret.append(" "); - ret.append(DBDumperCommand::Name()); - ret.append(HelpRangeCmdArgs()); - ret.append(" [--" + ARG_TTL + "]"); - ret.append(" [--" + ARG_MAX_KEYS + "=]"); - ret.append(" [--" + ARG_TIMESTAMP + "]"); - ret.append(" [--" + ARG_COUNT_ONLY + "]"); - ret.append(" [--" + ARG_COUNT_DELIM + "=]"); - ret.append(" 
[--" + ARG_STATS + "]"); - ret.append(" [--" + ARG_TTL_BUCKET + "=]"); - ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); - ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); - ret.append("\n"); +void DBDumperCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(DBDumperCommand::Name()); + ret->append(HelpRangeCmdArgs()); + ret->append(" [--" + ARG_TTL + "]"); + ret->append(" [--" + ARG_MAX_KEYS + " = ]"); + ret->append(" [--" + ARG_TIMESTAMP + "]"); + ret->append(" [--" + ARG_COUNT_ONLY + "]"); + ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); + ret->append(" [--" + ARG_STATS + "]"); + ret->append(" [--" + ARG_TTL_BUCKET + " = ]"); + ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); + ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); + ret->append("\n"); } void DBDumperCommand::DoCommand() { @@ -924,7 +940,7 @@ void DBDumperCommand::DoCommand() { // Parse command line args uint64_t count = 0; if (print_stats_) { - string stats; + std::string stats; if (db_->GetProperty("rocksdb.stats", &stats)) { fprintf(stdout, "%s\n", stats.c_str()); } @@ -946,11 +962,11 @@ void DBDumperCommand::DoCommand() { int max_keys = max_keys_; int ttl_start; - if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_START, &ttl_start, &exec_state_)) { ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time } int ttl_end; - if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_END, &ttl_end, &exec_state_)) { ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature } if (ttl_end < ttl_start) { @@ -960,20 +976,21 @@ void DBDumperCommand::DoCommand() { } int time_range = ttl_end - ttl_start; int bucket_size; - if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || + if (!ParseIntOption( + option_map_, ARG_TTL_BUCKET, &bucket_size, &exec_state_) || bucket_size <= 0) { bucket_size = time_range; // Will have just 1 bucket by default } //cretaing variables for row count of each type - string rtype1,rtype2,row,val; + std::string rtype1, rtype2, row, val; rtype2 = ""; - uint64_t c=0; - uint64_t s1=0,s2=0; + uint64_t c = 0; + uint64_t s1 = 0, s2 = 0; - // At this point, bucket_size=0 => time_range=0 + // At this point, bucket_size = 0 = > time_range = 0 uint64_t num_buckets = (bucket_size >= time_range) ? 
1 : ((time_range + bucket_size - 1) / bucket_size); - vector bucket_counts(num_buckets, 0); + std::vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); @@ -999,7 +1016,7 @@ void DBDumperCommand::DoCommand() { --max_keys; } if (is_db_ttl_ && num_buckets > 1) { - IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size, + IncBucketCounts(&bucket_counts, ttl_start, time_range, bucket_size, rawtime, num_buckets); } ++count; @@ -1008,29 +1025,28 @@ void DBDumperCommand::DoCommand() { row = iter->key().ToString(); val = iter->value().ToString(); s1 = row.size()+val.size(); - for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) - rtype1+=row[j]; - if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { - fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), - (long long )c,(long long)s2); - c=1; - s2=s1; + for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) { + rtype1 += row[j]; + } + if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { + fprintf(stdout, + "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", + rtype2.c_str(), c, s2); + c = 1; + s2 = s1; rtype2 = rtype1; } else { - c++; - s2+=s1; - rtype2=rtype1; + c++; + s2 += s1; + rtype2 = rtype1; } - } - - if (!count_only_ && !count_delim_) { if (is_db_ttl_ && timestamp_) { fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); } - string str = PrintKeyValue(iter->key().ToString(), + std::string str = PrintKeyValue(iter->key().ToString(), iter->value().ToString(), is_key_hex_, is_value_hex_); fprintf(stdout, "%s\n", str.c_str()); @@ -1040,21 +1056,25 @@ void DBDumperCommand::DoCommand() { if (num_buckets > 1 && is_db_ttl_) { PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, num_buckets); - } else if(count_delim_) { - fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), - (long long )c,(long long)s2); + } else if (count_delim_) { + fprintf(stdout, "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", + rtype2.c_str(), c, s2); } else { - fprintf(stdout, "Keys in range: %lld\n", (long long) count); + fprintf(stdout, "Keys in range: %" PRIu64 "\n", count); } // Clean up delete iter; } -const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; -const string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; +const std::string + ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const std::string + ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; -ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, - const map& options, const vector& flags) : +ReduceDBLevelsCommand::ReduceDBLevelsCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), old_levels_(1 << 16), @@ -1062,33 +1082,34 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, print_old_levels_(false) { - ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); + ParseIntOption(option_map_, ARG_NEW_LEVELS, &new_levels_, &exec_state_); print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); - if(new_levels_ <= 0) { + if (new_levels_ <= 0) { exec_state_ = LDBCommandExecuteResult::FAILED( " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); } } -vector ReduceDBLevelsCommand::PrepareArgs(const string& db_path, +std::vector 
ReduceDBLevelsCommand::PrepareArgs( + const std::string& db_path, int new_levels, bool print_old_level) { - vector ret; + std::vector ret; ret.push_back("reduce_levels"); - ret.push_back("--" + ARG_DB + "=" + db_path); - ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels)); - if(print_old_level) { + ret.push_back("--" + ARG_DB + " = " + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + " = " + std::to_string(new_levels)); + if (print_old_level) { ret.push_back("--" + ARG_PRINT_OLD_LEVELS); } return ret; } -void ReduceDBLevelsCommand::Help(string& ret) { - ret.append(" "); - ret.append(ReduceDBLevelsCommand::Name()); - ret.append(" --" + ARG_NEW_LEVELS + "="); - ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); - ret.append("\n"); +void ReduceDBLevelsCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ReduceDBLevelsCommand::Name()); + ret->append(" --" + ARG_NEW_LEVELS + " = "); + ret->append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret->append("\n"); } Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { @@ -1151,7 +1172,8 @@ void ReduceDBLevelsCommand::DoCommand() { } if (print_old_levels_) { - fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); + fprintf(stdout, "The old number of levels in use is %d\n", + old_level_num); } if (old_level_num <= new_levels_) { @@ -1170,29 +1192,31 @@ void ReduceDBLevelsCommand::DoCommand() { CloseDB(); EnvOptions soptions; - st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); + st = VersionSet::ReduceNumberOfLevels( + db_path_, &opt, soptions, new_levels_); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); return; } } -const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = +const std::string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = "old_compaction_style"; -const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = +const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = "new_compaction_style"; ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( - const vector& params, const map& options, - const vector& flags) : + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})), old_compaction_style_(-1), new_compaction_style_(-1) { - ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, - exec_state_); + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, + &old_compaction_style_, &exec_state_); if (old_compaction_style_ != kCompactionStyleLevel && old_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::FAILED( @@ -1201,8 +1225,8 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( return; } - ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, - exec_state_); + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, + &new_compaction_style_, &exec_state_); if (new_compaction_style_ != kCompactionStyleLevel && new_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::FAILED( @@ -1227,14 +1251,16 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( } } -void ChangeCompactionStyleCommand::Help(string& ret) { - ret.append(" "); - ret.append(ChangeCompactionStyleCommand::Name()); - ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "="); - ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "="); - ret.append("\n"); +void 
ChangeCompactionStyleCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ChangeCompactionStyleCommand::Name()); + ret->append( + " --" + ARG_OLD_COMPACTION_STYLE + " = "); + ret->append( + " --" + ARG_NEW_COMPACTION_STYLE + " = "); + ret->append("\n"); } Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { @@ -1262,9 +1288,9 @@ void ChangeCompactionStyleCommand::DoCommand() { db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), &property); - // format print string + // format print std::string char buf[100]; - snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + snprintf(buf, sizeof(buf), "%s%s", (i ? ", " : ""), property.c_str()); files_per_level += buf; } fprintf(stdout, "files per level before compaction: %s\n", @@ -1282,9 +1308,9 @@ void ChangeCompactionStyleCommand::DoCommand() { db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), &property); - // format print string + // format print std::string char buf[100]; - snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); + snprintf(buf, sizeof(buf), "%s%s", (i ? ", " : ""), property.c_str()); files_per_level += buf; num_files = atoi(property.c_str()); @@ -1292,15 +1318,15 @@ void ChangeCompactionStyleCommand::DoCommand() { // level 0 should have only 1 file if (i == 0 && num_files != 1) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level 0 after compaction is " + std::to_string(num_files) + - ", not 1.\n"); + "level 0 after compaction is " + std::to_string(num_files) + + ", not 1.\n"); return; } // other levels should have no file if (i > 0 && num_files != 0) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level " + std::to_string(i) + " after compaction is " + - std::to_string(num_files) + ", not 0.\n"); + "level " + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); return; } } @@ -1311,14 +1337,15 @@ void ChangeCompactionStyleCommand::DoCommand() { class InMemoryHandler : public WriteBatch::Handler { public: - InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + InMemoryHandler(std::stringstream& row, bool print_values) + : Handler(), row_(row) { print_values_ = print_values; } void commonPutMerge(const Slice& key, const Slice& value) { - string k = LDBCommand::StringToHex(key.ToString()); + std::string k = LDBCommand::StringToHex(key.ToString()); if (print_values_) { - string v = LDBCommand::StringToHex(value.ToString()); + std::string v = LDBCommand::StringToHex(value.ToString()); row_ << k << " : "; row_ << v << " "; } else { @@ -1337,23 +1364,25 @@ class InMemoryHandler : public WriteBatch::Handler { } virtual void Delete(const Slice& key) { - row_ <<",DELETE : "; + row_ << ", DELETE : "; row_ << LDBCommand::StringToHex(key.ToString()) << " "; } virtual ~InMemoryHandler() { }; private: - stringstream & row_; + std::stringstream & row_; bool print_values_; }; -const string WALDumperCommand::ARG_WAL_FILE = "walfile"; -const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; -const string WALDumperCommand::ARG_PRINT_HEADER = "header"; +const std::string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const std::string WALDumperCommand::ARG_PRINT_HEADER = "header"; -WALDumperCommand::WALDumperCommand(const vector& params, - const map& options, const vector& flags) : +WALDumperCommand::WALDumperCommand( + const std::vector& params, + const std::map& 
options, + const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions( {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})), @@ -1361,7 +1390,8 @@ WALDumperCommand::WALDumperCommand(const vector& params, wal_file_.clear(); - map::const_iterator itr = options.find(ARG_WAL_FILE); + std::map::const_iterator itr = + options.find(ARG_WAL_FILE); if (itr != options.end()) { wal_file_ = itr->second; } @@ -1375,19 +1405,19 @@ WALDumperCommand::WALDumperCommand(const vector& params, } } -void WALDumperCommand::Help(string& ret) { - ret.append(" "); - ret.append(WALDumperCommand::Name()); - ret.append(" --" + ARG_WAL_FILE + "="); - ret.append(" [--" + ARG_PRINT_HEADER + "] "); - ret.append(" [--" + ARG_PRINT_VALUE + "] "); - ret.append("\n"); +void WALDumperCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(WALDumperCommand::Name()); + ret->append(" --" + ARG_WAL_FILE + " = "); + ret->append(" [--" + ARG_PRINT_HEADER + "] "); + ret->append(" [--" + ARG_PRINT_VALUE + "] "); + ret->append("\n"); } void WALDumperCommand::DoCommand() { struct StdErrReporter : public log::Reader::Reporter { virtual void Corruption(size_t bytes, const Status& s) { - cerr<<"Corruption detected in log file "<NewSequentialFile(wal_file_, &file, soptions); if (!status.ok()) { - exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + - status.ToString()); + exec_state_ = LDBCommandExecuteResult::FAILED( + "Failed to open WAL file " + status.ToString()); } else { StdErrReporter reporter; log::Reader reader(move(file), &reporter, true, 0); - string scratch; + std::string scratch; WriteBatch batch; Slice record; - stringstream row; + std::stringstream row; if (print_header_) { - cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; + std::cout << "Sequence, Count, ByteSize, Physical Offset, Key(s)"; if (print_values_) { - cout << " : value "; + std::cout << " : value "; } - cout << "\n"; + std::cout << "\n"; } while(reader.ReadRecord(&record, &scratch)) { row.str(""); @@ -1419,22 +1449,24 @@ void WALDumperCommand::DoCommand() { record.size(), Status::Corruption("log record too small")); } else { WriteBatchInternal::SetContents(&batch, record); - row<& params, - const map& options, const vector& flags) : +GetCommand::GetCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { @@ -1451,16 +1483,16 @@ GetCommand::GetCommand(const vector& params, } } -void GetCommand::Help(string& ret) { - ret.append(" "); - ret.append(GetCommand::Name()); - ret.append(" "); - ret.append(" [--" + ARG_TTL + "]"); - ret.append("\n"); +void GetCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(GetCommand::Name()); + ret->append(" "); + ret->append(" [--" + ARG_TTL + "]"); + ret->append("\n"); } void GetCommand::DoCommand() { - string value; + std::string value; Status st = db_->Get(ReadOptions(), key_, &value); if (st.ok()) { fprintf(stdout, "%s\n", @@ -1471,8 +1503,10 @@ void GetCommand::DoCommand() { } -ApproxSizeCommand::ApproxSizeCommand(const vector& params, - const map& options, const vector& flags) : +ApproxSizeCommand::ApproxSizeCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) { @@ -1499,11 +1533,11 @@ ApproxSizeCommand::ApproxSizeCommand(const vector& params, } } -void 
ApproxSizeCommand::Help(string& ret) { - ret.append(" "); - ret.append(ApproxSizeCommand::Name()); - ret.append(HelpRangeCmdArgs()); - ret.append("\n"); +void ApproxSizeCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ApproxSizeCommand::Name()); + ret->append(HelpRangeCmdArgs()); + ret->append("\n"); } void ApproxSizeCommand::DoCommand() { @@ -1522,43 +1556,45 @@ void ApproxSizeCommand::DoCommand() { } -BatchPutCommand::BatchPutCommand(const vector& params, - const map& options, const vector& flags) : - LDBCommand(options, flags, false, - BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, - ARG_CREATE_IF_MISSING})) { +BatchPutCommand::BatchPutCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { if (params.size() < 2) { exec_state_ = LDBCommandExecuteResult::FAILED( - "At least one pair must be specified batchput."); + "At least one std::pair must be specified batchput."); } else if (params.size() % 2 != 0) { exec_state_ = LDBCommandExecuteResult::FAILED( "Equal number of s and s must be specified for batchput."); } else { for (size_t i = 0; i < params.size(); i += 2) { - string key = params.at(i); - string value = params.at(i+1); - key_values_.push_back(pair( + std::string key = params.at(i); + std::string value = params.at(i+1); + key_values_.push_back(std::pair( is_key_hex_ ? HexToString(key) : key, is_value_hex_ ? HexToString(value) : value)); } } } -void BatchPutCommand::Help(string& ret) { - ret.append(" "); - ret.append(BatchPutCommand::Name()); - ret.append(" [ ] [..]"); - ret.append(" [--" + ARG_TTL + "]"); - ret.append("\n"); +void BatchPutCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(BatchPutCommand::Name()); + ret->append(" [ ] [..]"); + ret->append(" [--" + ARG_TTL + "]"); + ret->append("\n"); } void BatchPutCommand::DoCommand() { WriteBatch batch; - for (vector>::const_iterator itr + for (std::vector>::const_iterator itr = key_values_.begin(); itr != key_values_.end(); ++itr) { - batch.Put(itr->first, itr->second); + batch.Put(itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); if (st.ok()) { @@ -1575,8 +1611,10 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { } -ScanCommand::ScanCommand(const vector& params, - const map& options, const vector& flags) : +ScanCommand::ScanCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, @@ -1585,7 +1623,7 @@ ScanCommand::ScanCommand(const vector& params, end_key_specified_(false), max_keys_scanned_(-1) { - map::const_iterator itr = options.find(ARG_FROM); + auto itr = options.find(ARG_FROM); if (itr != options.end()) { start_key_ = itr->second; if (is_key_hex_) { @@ -1606,26 +1644,26 @@ ScanCommand::ScanCommand(const vector& params, if (itr != options.end()) { try { max_keys_scanned_ = stoi(itr->second); - } catch(const invalid_argument&) { + } catch(const std::invalid_argument&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has an invalid value"); - } catch(const out_of_range&) { + } catch(const std::out_of_range&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has a value out-of-range"); } } } -void ScanCommand::Help(string& ret) { - ret.append(" "); - 
ret.append(ScanCommand::Name()); - ret.append(HelpRangeCmdArgs()); - ret.append(" [--" + ARG_TTL + "]"); - ret.append(" [--" + ARG_TIMESTAMP + "]"); - ret.append(" [--" + ARG_MAX_KEYS + "=q] "); - ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); - ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); - ret.append("\n"); +void ScanCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(ScanCommand::Name()); + ret->append(HelpRangeCmdArgs()); + ret->append(" [--" + ARG_TTL + "]"); + ret->append(" [--" + ARG_TIMESTAMP + "]"); + ret->append(" [--" + ARG_MAX_KEYS + " = q] "); + ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); + ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); + ret->append("\n"); } void ScanCommand::DoCommand() { @@ -1638,11 +1676,11 @@ void ScanCommand::DoCommand() { it->SeekToFirst(); } int ttl_start; - if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_START, &ttl_start, &exec_state_)) { ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time } int ttl_end; - if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_END, &ttl_end, &exec_state_)) { ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature } if (ttl_end < ttl_start) { @@ -1655,9 +1693,9 @@ void ScanCommand::DoCommand() { ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); } for ( ; - it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); - it->Next()) { - string key = ldb_options_.key_formatter->Format(it->key()); + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + std::string key = ldb_options_.key_formatter->Format(it->key()); if (is_db_ttl_) { TtlIterator* it_ttl = dynamic_cast(it); assert(it_ttl); @@ -1669,11 +1707,10 @@ void ScanCommand::DoCommand() { fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); } } - string value = it->value().ToString(); + std::string value = it->value().ToString(); fprintf(stdout, "%s : %s\n", (is_key_hex_ ? "0x" + it->key().ToString(true) : key).c_str(), - (is_value_hex_ ? StringToHex(value) : value).c_str() - ); + (is_value_hex_ ? 
StringToHex(value) : value).c_str()); num_keys_scanned++; if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { break; @@ -1686,8 +1723,9 @@ void ScanCommand::DoCommand() { } -DeleteCommand::DeleteCommand(const vector& params, - const map& options, const vector& flags) : +DeleteCommand::DeleteCommand(const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { @@ -1702,10 +1740,10 @@ DeleteCommand::DeleteCommand(const vector& params, } } -void DeleteCommand::Help(string& ret) { - ret.append(" "); - ret.append(DeleteCommand::Name() + " "); - ret.append("\n"); +void DeleteCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(DeleteCommand::Name() + " "); + ret->append("\n"); } void DeleteCommand::DoCommand() { @@ -1718,8 +1756,10 @@ void DeleteCommand::DoCommand() { } -PutCommand::PutCommand(const vector& params, - const map& options, const vector& flags) : +PutCommand::PutCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) { @@ -1741,12 +1781,12 @@ PutCommand::PutCommand(const vector& params, } } -void PutCommand::Help(string& ret) { - ret.append(" "); - ret.append(PutCommand::Name()); - ret.append(" "); - ret.append(" [--" + ARG_TTL + "]"); - ret.append("\n"); +void PutCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(PutCommand::Name()); + ret->append(" "); + ret->append(" [--" + ARG_TTL + "]"); + ret->append("\n"); } void PutCommand::DoCommand() { @@ -1770,43 +1810,43 @@ const char* DBQuerierCommand::GET_CMD = "get"; const char* DBQuerierCommand::PUT_CMD = "put"; const char* DBQuerierCommand::DELETE_CMD = "delete"; -DBQuerierCommand::DBQuerierCommand(const vector& params, - const map& options, const vector& flags) : +DBQuerierCommand::DBQuerierCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { } -void DBQuerierCommand::Help(string& ret) { - ret.append(" "); - ret.append(DBQuerierCommand::Name()); - ret.append(" [--" + ARG_TTL + "]"); - ret.append("\n"); - ret.append(" Starts a REPL shell. Type help for list of available " +void DBQuerierCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(DBQuerierCommand::Name()); + ret->append(" [--" + ARG_TTL + "]"); + ret->append("\n"); + ret->append(" Starts a REPL shell. 
Type help for list of available " "commands."); - ret.append("\n"); + ret->append("\n"); } void DBQuerierCommand::DoCommand() { if (!db_) { return; } - ReadOptions read_options; WriteOptions write_options; - string line; - string key; - string value; - while (getline(cin, line, '\n')) { - - // Parse line into vector - vector tokens; + std::string line; + std::string key; + std::string value; + while (getline(std::cin, line, '\n')) { + // Parse line into std::vector + std::vector tokens; size_t pos = 0; while (true) { size_t pos2 = line.find(' ', pos); - if (pos2 == string::npos) { + if (pos2 == std::string::npos) { break; } tokens.push_back(line.substr(pos, pos2-pos)); @@ -1814,7 +1854,7 @@ void DBQuerierCommand::DoCommand() { } tokens.push_back(line.substr(pos)); - const string& cmd = tokens[0]; + const std::string& cmd = tokens[0]; if (cmd == HELP_CMD) { fprintf(stdout, @@ -1845,16 +1885,18 @@ void DBQuerierCommand::DoCommand() { } } -CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, - const map& options, const vector& flags) : +CheckConsistencyCommand::CheckConsistencyCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({})) { } -void CheckConsistencyCommand::Help(string& ret) { - ret.append(" "); - ret.append(CheckConsistencyCommand::Name()); - ret.append("\n"); +void CheckConsistencyCommand::Help(std::string* ret) { + ret->append(" "); + ret->append(CheckConsistencyCommand::Name()); + ret->append("\n"); } void CheckConsistencyCommand::DoCommand() { diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 9ffe0eabc..b42d779c3 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -4,12 +4,15 @@ // of patent rights can be found in the PATENTS file in the same directory. 
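(Illustrative sketch, not part of the patch series.) The ldb_cmd.cc hunks above consistently switch the Help() out-parameter from string& to std::string*, spell out std:: qualification on standard-library types, and print uint64_t counters with the PRIu64 macro instead of casting to long long. A minimal, self-contained example of those conventions follows; the FakeCommand class and --some_flag option are invented for illustration and do not exist in RocksDB.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <string>

// Invented command class; only the calling conventions mirror the patch.
class FakeCommand {
 public:
  static std::string Name() { return "fake_cmd"; }

  // Pointer out-parameter: the mutation is visible at the call site.
  static void Help(std::string* ret) {
    ret->append("  ");
    ret->append(FakeCommand::Name());
    ret->append(" [--some_flag]");
    ret->append("\n");
  }
};

int main() {
  std::string usage;
  FakeCommand::Help(&usage);  // call sites pass &usage rather than a reference
  std::fputs(usage.c_str(), stdout);

  // uint64_t counters are printed with PRIu64, avoiding the old
  // (long long) casts paired with %lld.
  uint64_t count = 42;
  std::printf("Keys in range: %" PRIu64 "\n", count);
  return 0;
}

Call sites that previously wrote SomeCommand::Help(ret) now write SomeCommand::Help(&ret), which is the pattern the ldb_tool.cc hunk later in this series applies to every command's help text.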
// #pragma once +#include +#include #include #include #include -#include #include -#include +#include +#include +#include #include "db/version_set.h" #include "rocksdb/env.h" @@ -23,39 +26,34 @@ #include "util/string_util.h" #include "utilities/ttl/db_ttl_impl.h" -using std::string; -using std::map; -using std::vector; -using std::ostringstream; - namespace rocksdb { class LDBCommand { -public: + public: // Command-line arguments - static const string ARG_DB; - static const string ARG_HEX; - static const string ARG_KEY_HEX; - static const string ARG_VALUE_HEX; - static const string ARG_TTL; - static const string ARG_TTL_START; - static const string ARG_TTL_END; - static const string ARG_TIMESTAMP; - static const string ARG_FROM; - static const string ARG_TO; - static const string ARG_MAX_KEYS; - static const string ARG_BLOOM_BITS; - static const string ARG_FIX_PREFIX_LEN; - static const string ARG_COMPRESSION_TYPE; - static const string ARG_BLOCK_SIZE; - static const string ARG_AUTO_COMPACTION; - static const string ARG_WRITE_BUFFER_SIZE; - static const string ARG_FILE_SIZE; - static const string ARG_CREATE_IF_MISSING; + static const std::string ARG_DB; + static const std::string ARG_HEX; + static const std::string ARG_KEY_HEX; + static const std::string ARG_VALUE_HEX; + static const std::string ARG_TTL; + static const std::string ARG_TTL_START; + static const std::string ARG_TTL_END; + static const std::string ARG_TIMESTAMP; + static const std::string ARG_FROM; + static const std::string ARG_TO; + static const std::string ARG_MAX_KEYS; + static const std::string ARG_BLOOM_BITS; + static const std::string ARG_FIX_PREFIX_LEN; + static const std::string ARG_COMPRESSION_TYPE; + static const std::string ARG_BLOCK_SIZE; + static const std::string ARG_AUTO_COMPACTION; + static const std::string ARG_WRITE_BUFFER_SIZE; + static const std::string ARG_FILE_SIZE; + static const std::string ARG_CREATE_IF_MISSING; static LDBCommand* InitFromCmdLineArgs( - const vector& args, + const std::vector& args, const Options& options, const LDBOptions& ldb_options ); @@ -123,8 +121,8 @@ public: exec_state_.Reset(); } - static string HexToString(const string& str) { - string parsed; + static std::string HexToString(const std::string& str) { + std::string parsed; if (str[0] != '0' || str[1] != 'x') { fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str()); @@ -140,8 +138,8 @@ public: return parsed; } - static string StringToHex(const string& str) { - string result = "0x"; + static std::string StringToHex(const std::string& str) { + std::string result = "0x"; char buf[10]; for (size_t i = 0; i < str.length(); i++) { snprintf(buf, 10, "%02X", (unsigned char)str[i]); @@ -155,7 +153,7 @@ public: protected: LDBCommandExecuteResult exec_state_; - string db_path_; + std::string db_path_; DB* db_; DBWithTTL* db_ttl_; @@ -180,21 +178,24 @@ protected: /** * Map of options passed on the command-line. */ - const map option_map_; + const std::map option_map_; /** * Flags passed on the command-line. 
*/ - const vector flags_; + const std::vector flags_; /** List of command-line options valid for this command */ - const vector valid_cmd_line_options_; + const std::vector valid_cmd_line_options_; - bool ParseKeyValue(const string& line, string* key, string* value, - bool is_key_hex, bool is_value_hex); + bool ParseKeyValue(const std::string& line, + std::string* key, std::string* value, + bool is_key_hex, bool is_value_hex); - LDBCommand(const map& options, const vector& flags, - bool is_read_only, const vector& valid_cmd_line_options) : + LDBCommand(const std::map& options, + const std::vector& flags, + bool is_read_only, + const std::vector& valid_cmd_line_options) : db_(nullptr), is_read_only_(is_read_only), is_key_hex_(false), @@ -205,7 +206,7 @@ protected: flags_(flags), valid_cmd_line_options_(valid_cmd_line_options) { - map::const_iterator itr = options.find(ARG_DB); + auto itr = options.find(ARG_DB); if (itr != options.end()) { db_path_ = itr->second; } @@ -236,7 +237,7 @@ protected: st = DB::Open(opt, db_path_, &db_); } if (!st.ok()) { - string msg = st.ToString(); + std::string msg = st.ToString(); exec_state_ = LDBCommandExecuteResult::FAILED(msg); } @@ -250,29 +251,33 @@ protected: } } - static string PrintKeyValue(const string& key, const string& value, - bool is_key_hex, bool is_value_hex) { - string result; + static std::string PrintKeyValue( + const std::string& key, const std::string& value, + bool is_key_hex, bool is_value_hex) { + std::string result; result.append(is_key_hex ? StringToHex(key) : key); result.append(DELIM); result.append(is_value_hex ? StringToHex(value) : value); return result; } - static string PrintKeyValue(const string& key, const string& value, - bool is_hex) { + static std::string PrintKeyValue( + const std::string& key, const std::string& value, + bool is_hex) { return PrintKeyValue(key, value, is_hex, is_hex); } /** - * Return true if the specified flag is present in the specified flags vector + * Return true if the specified flag is present in the specified + * flags vector */ - static bool IsFlagPresent(const vector& flags, const string& flag) { + static bool IsFlagPresent( + const std::vector& flags, const std::string& flag) { return (std::find(flags.begin(), flags.end(), flag) != flags.end()); } - static string HelpRangeCmdArgs() { - ostringstream str_stream; + static std::string HelpRangeCmdArgs() { + std::ostringstream str_stream; str_stream << " "; str_stream << "[--" << ARG_FROM << "] "; str_stream << "[--" << ARG_TO << "] "; @@ -284,32 +289,35 @@ protected: * used by this command. It includes the common options and the ones * passed in. 
*/ - vector BuildCmdLineOptions(vector options) { - vector ret = {ARG_DB, ARG_BLOOM_BITS, - ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, - ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, - ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; + std::vector BuildCmdLineOptions( + std::vector options) { + std::vector ret = { + ARG_DB, ARG_BLOOM_BITS, + ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, + ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, + ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; ret.insert(ret.end(), options.begin(), options.end()); return ret; } - bool ParseIntOption(const map& options, const string& option, - int& value, LDBCommandExecuteResult& exec_state); + bool ParseIntOption(const std::map& options, + const std::string& option, + int* value, LDBCommandExecuteResult* exec_state); - bool ParseStringOption(const map& options, - const string& option, string* value); + bool ParseStringOption(const std::map& options, + const std::string& option, std::string* value); Options options_; LDBOptions ldb_options_; -private: + private: /** * Interpret command line options and flags to determine if the key * should be input/output in hex. */ - bool IsKeyHex(const map& options, - const vector& flags) { + bool IsKeyHex(const std::map& options, + const std::vector& flags) { return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_KEY_HEX) || ParseBooleanOption(options, ARG_HEX, false) || @@ -320,8 +328,8 @@ private: * Interpret command line options and flags to determine if the value * should be input/output in hex. */ - bool IsValueHex(const map& options, - const vector& flags) { + bool IsValueHex(const std::map& options, + const std::vector& flags) { return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_VALUE_HEX) || ParseBooleanOption(options, ARG_HEX, false) || @@ -334,12 +342,13 @@ private: * Throws an exception if the value of the option is not * "true" or "false" (case insensitive). */ - bool ParseBooleanOption(const map& options, - const string& option, bool default_val) { + bool ParseBooleanOption( + const std::map& options, + const std::string& option, bool default_val) { - map::const_iterator itr = options.find(option); + auto itr = options.find(option); if (itr != options.end()) { - string option_val = itr->second; + std::string option_val = itr->second; return StringToBool(itr->second); } return default_val; @@ -350,7 +359,7 @@ private: * val must be either true or false (case insensitive). * Otherwise an exception is thrown. 
*/ - bool StringToBool(string val) { + bool StringToBool(std::string val) { std::transform(val.begin(), val.end(), val.begin(), ::tolower); if (val == "true") { return true; @@ -362,161 +371,165 @@ private: } static LDBCommand* SelectCommand( - const string& cmd, - const vector& cmdParams, - const map& option_map, - const vector& flags + const std::string& cmd, + const std::vector& cmdParams, + const std::map& option_map, + const std::vector& flags ); }; class CompactorCommand: public LDBCommand { -public: - static string Name() { return "compact"; } + public: + static std::string Name() { return "compact"; } - CompactorCommand(const vector& params, - const map& options, const vector& flags); + CompactorCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); -private: + private: bool null_from_; - string from_; + std::string from_; bool null_to_; - string to_; + std::string to_; }; class DBDumperCommand: public LDBCommand { -public: - static string Name() { return "dump"; } + public: + static std::string Name() { return "dump"; } - DBDumperCommand(const vector& params, - const map& options, const vector& flags); + DBDumperCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); -private: + private: bool null_from_; - string from_; + std::string from_; bool null_to_; - string to_; - int max_keys_; - string delim_; + std::string to_; + uint64_t max_keys_; + std::string delim_; bool count_only_; bool count_delim_; bool print_stats_; - static const string ARG_COUNT_ONLY; - static const string ARG_COUNT_DELIM; - static const string ARG_STATS; - static const string ARG_TTL_BUCKET; + static const std::string ARG_COUNT_ONLY; + static const std::string ARG_COUNT_DELIM; + static const std::string ARG_STATS; + static const std::string ARG_TTL_BUCKET; }; class InternalDumpCommand: public LDBCommand { -public: - static string Name() { return "idump"; } + public: + static std::string Name() { return "idump"; } - InternalDumpCommand(const vector& params, - const map& options, - const vector& flags); + InternalDumpCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); -private: + private: bool has_from_; - string from_; + std::string from_; bool has_to_; - string to_; + std::string to_; int max_keys_; - string delim_; + std::string delim_; bool count_only_; bool count_delim_; bool print_stats_; bool is_input_key_hex_; - static const string ARG_DELIM; - static const string ARG_COUNT_ONLY; - static const string ARG_COUNT_DELIM; - static const string ARG_STATS; - static const string ARG_INPUT_KEY_HEX; + static const std::string ARG_DELIM; + static const std::string ARG_COUNT_ONLY; + static const std::string ARG_COUNT_DELIM; + static const std::string ARG_STATS; + static const std::string ARG_INPUT_KEY_HEX; }; class DBLoaderCommand: public LDBCommand { -public: - static string Name() { return "load"; } - - DBLoaderCommand(string& db_name, vector& args); + public: + static std::string Name() { return "load"; } - DBLoaderCommand(const vector& params, - const map& options, const vector& flags); + DBLoaderCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& 
ret); + static void Help(std::string* ret); virtual void DoCommand(); virtual Options PrepareOptionsForOpenDB(); -private: + private: bool create_if_missing_; bool disable_wal_; bool bulk_load_; bool compact_; - static const string ARG_DISABLE_WAL; - static const string ARG_BULK_LOAD; - static const string ARG_COMPACT; + static const std::string ARG_DISABLE_WAL; + static const std::string ARG_BULK_LOAD; + static const std::string ARG_COMPACT; }; class ManifestDumpCommand: public LDBCommand { -public: - static string Name() { return "manifest_dump"; } + public: + static std::string Name() { return "manifest_dump"; } - ManifestDumpCommand(const vector& params, - const map& options, const vector& flags); + ManifestDumpCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); virtual bool NoDBOpen() { return true; } -private: + private: bool verbose_; - string path_; + std::string path_; - static const string ARG_VERBOSE; - static const string ARG_PATH; + static const std::string ARG_VERBOSE; + static const std::string ARG_PATH; }; class ListColumnFamiliesCommand : public LDBCommand { public: - static string Name() { return "list_column_families"; } + static std::string Name() { return "list_column_families"; } - ListColumnFamiliesCommand(const vector& params, - const map& options, - const vector& flags); + ListColumnFamiliesCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); virtual bool NoDBOpen() { return true; } private: - string dbname_; + std::string dbname_; }; class ReduceDBLevelsCommand : public LDBCommand { -public: - static string Name() { return "reduce_levels"; } + public: + static std::string Name() { return "reduce_levels"; } - ReduceDBLevelsCommand(const vector& params, - const map& options, const vector& flags); + ReduceDBLevelsCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual Options PrepareOptionsForOpenDB(); @@ -526,169 +539,179 @@ public: return true; } - static void Help(string& msg); + static void Help(std::string* msg); - static vector PrepareArgs(const string& db_path, int new_levels, + static std::vector PrepareArgs( + const std::string& db_path, + int new_levels, bool print_old_level = false); -private: + private: int old_levels_; int new_levels_; bool print_old_levels_; - static const string ARG_NEW_LEVELS; - static const string ARG_PRINT_OLD_LEVELS; + static const std::string ARG_NEW_LEVELS; + static const std::string ARG_PRINT_OLD_LEVELS; Status GetOldNumOfLevels(Options& opt, int* levels); }; class ChangeCompactionStyleCommand : public LDBCommand { -public: - static string Name() { return "change_compaction_style"; } + public: + static std::string Name() { return "change_compaction_style"; } - ChangeCompactionStyleCommand(const vector& params, - const map& options, const vector& flags); + ChangeCompactionStyleCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual Options PrepareOptionsForOpenDB(); virtual void DoCommand(); - static void Help(string& msg); + static void Help(std::string* msg); -private: + private: int old_compaction_style_; int new_compaction_style_; - static const string ARG_OLD_COMPACTION_STYLE; - static const string ARG_NEW_COMPACTION_STYLE; + static const std::string 
ARG_OLD_COMPACTION_STYLE; + static const std::string ARG_NEW_COMPACTION_STYLE; }; class WALDumperCommand : public LDBCommand { -public: - static string Name() { return "dump_wal"; } + public: + static std::string Name() { return "dump_wal"; } - WALDumperCommand(const vector& params, - const map& options, const vector& flags); + WALDumperCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual bool NoDBOpen() { return true; } - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); -private: + private: bool print_header_; - string wal_file_; + std::string wal_file_; bool print_values_; - static const string ARG_WAL_FILE; - static const string ARG_PRINT_HEADER; - static const string ARG_PRINT_VALUE; + static const std::string ARG_WAL_FILE; + static const std::string ARG_PRINT_HEADER; + static const std::string ARG_PRINT_VALUE; }; class GetCommand : public LDBCommand { -public: - static string Name() { return "get"; } + public: + static std::string Name() { return "get"; } - GetCommand(const vector& params, const map& options, - const vector& flags); + GetCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); -private: - string key_; + private: + std::string key_; }; class ApproxSizeCommand : public LDBCommand { -public: - static string Name() { return "approxsize"; } + public: + static std::string Name() { return "approxsize"; } - ApproxSizeCommand(const vector& params, - const map& options, const vector& flags); + ApproxSizeCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); -private: - string start_key_; - string end_key_; + private: + std::string start_key_; + std::string end_key_; }; class BatchPutCommand : public LDBCommand { -public: - static string Name() { return "batchput"; } + public: + static std::string Name() { return "batchput"; } - BatchPutCommand(const vector& params, - const map& options, const vector& flags); + BatchPutCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); virtual Options PrepareOptionsForOpenDB(); -private: + private: /** * The key-values to be inserted. 
*/ - vector> key_values_; + std::vector> key_values_; }; class ScanCommand : public LDBCommand { -public: - static string Name() { return "scan"; } + public: + static std::string Name() { return "scan"; } - ScanCommand(const vector& params, const map& options, - const vector& flags); + ScanCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); -private: - string start_key_; - string end_key_; + private: + std::string start_key_; + std::string end_key_; bool start_key_specified_; bool end_key_specified_; int max_keys_scanned_; }; class DeleteCommand : public LDBCommand { -public: - static string Name() { return "delete"; } + public: + static std::string Name() { return "delete"; } - DeleteCommand(const vector& params, - const map& options, const vector& flags); + DeleteCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); -private: - string key_; + private: + std::string key_; }; class PutCommand : public LDBCommand { -public: - static string Name() { return "put"; } + public: + static std::string Name() { return "put"; } - PutCommand(const vector& params, const map& options, - const vector& flags); + PutCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); - static void Help(string& ret); + static void Help(std::string* ret); virtual Options PrepareOptionsForOpenDB(); -private: - string key_; - string value_; + private: + std::string key_; + std::string value_; }; /** @@ -696,17 +719,18 @@ private: * get/put/delete. */ class DBQuerierCommand: public LDBCommand { -public: - static string Name() { return "query"; } + public: + static std::string Name() { return "query"; } - DBQuerierCommand(const vector& params, - const map& options, const vector& flags); + DBQuerierCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); - static void Help(string& ret); + static void Help(std::string* ret); virtual void DoCommand(); -private: + private: static const char* HELP_CMD; static const char* GET_CMD; static const char* PUT_CMD; @@ -714,11 +738,12 @@ private: }; class CheckConsistencyCommand : public LDBCommand { -public: - static string Name() { return "checkconsistency"; } + public: + static std::string Name() { return "checkconsistency"; } - CheckConsistencyCommand(const vector& params, - const map& options, const vector& flags); + CheckConsistencyCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); virtual void DoCommand(); @@ -726,7 +751,7 @@ public: return true; } - static void Help(string& ret); + static void Help(std::string* ret); }; } // namespace rocksdb diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h index b8e6c4634..48a4b495c 100644 --- a/util/ldb_cmd_execute_result.h +++ b/util/ldb_cmd_execute_result.h @@ -15,7 +15,7 @@ public: LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} - LDBCommandExecuteResult(State state, std::string& msg) : + LDBCommandExecuteResult(State state, const std::string& msg) : state_(state), message_(msg) {} std::string ToString() { @@ -52,11 +52,11 @@ public: return state_ == EXEC_FAILED; } - static LDBCommandExecuteResult SUCCEED(std::string msg) { + static LDBCommandExecuteResult SUCCEED(const std::string& msg) { return 
LDBCommandExecuteResult(EXEC_SUCCEED, msg); } - static LDBCommandExecuteResult FAILED(std::string msg) { + static LDBCommandExecuteResult FAILED(const std::string& msg) { return LDBCommandExecuteResult(EXEC_FAILED, msg); } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index bb6c8ffca..9824c0210 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -24,7 +24,7 @@ class LDBCommandRunner { public: static void PrintHelp(const char* exec_name) { - string ret; + std::string ret; ret.append("ldb - LevelDB Tool"); ret.append("\n\n"); @@ -59,26 +59,26 @@ public: ret.append("\n\n"); ret.append("Data Access Commands:\n"); - PutCommand::Help(ret); - GetCommand::Help(ret); - BatchPutCommand::Help(ret); - ScanCommand::Help(ret); - DeleteCommand::Help(ret); - DBQuerierCommand::Help(ret); - ApproxSizeCommand::Help(ret); - CheckConsistencyCommand::Help(ret); + PutCommand::Help(&ret); + GetCommand::Help(&ret); + BatchPutCommand::Help(&ret); + ScanCommand::Help(&ret); + DeleteCommand::Help(&ret); + DBQuerierCommand::Help(&ret); + ApproxSizeCommand::Help(&ret); + CheckConsistencyCommand::Help(&ret); ret.append("\n\n"); ret.append("Admin Commands:\n"); - WALDumperCommand::Help(ret); - CompactorCommand::Help(ret); - ReduceDBLevelsCommand::Help(ret); - ChangeCompactionStyleCommand::Help(ret); - DBDumperCommand::Help(ret); - DBLoaderCommand::Help(ret); - ManifestDumpCommand::Help(ret); - ListColumnFamiliesCommand::Help(ret); - InternalDumpCommand::Help(ret); + WALDumperCommand::Help(&ret); + CompactorCommand::Help(&ret); + ReduceDBLevelsCommand::Help(&ret); + ChangeCompactionStyleCommand::Help(&ret); + DBDumperCommand::Help(&ret); + DBLoaderCommand::Help(&ret); + ManifestDumpCommand::Help(&ret); + ListColumnFamiliesCommand::Help(&ret); + InternalDumpCommand::Help(&ret); fprintf(stderr, "%s\n", ret.c_str()); } From 82e3ae5403cbaaf17062ec6ea682aeb510299bf5 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 15:04:01 -0700 Subject: [PATCH 384/829] fix c_test Summary: as title Test Plan: ./c_test Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28119 --- Makefile | 2 +- db/c_test.c | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 5ed8a5a67..ca51442d2 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,7 @@ TESTS = \ spatial_db_test \ version_edit_test \ version_set_test \ - compaction_picker_test \ + compaction_picker_test \ version_builder_test \ file_indexer_test \ write_batch_test \ diff --git a/db/c_test.c b/db/c_test.c index d693f52ca..c17267114 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -576,33 +576,39 @@ int main(int argc, char** argv) { StartPhase("compaction_filter"); { - rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_t* options_with_filter = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter, 1); rocksdb_compactionfilter_t* cfilter; cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter, CFilterName); // Create new database rocksdb_close(db); - rocksdb_destroy_db(options, dbname, &err); - rocksdb_options_set_compaction_filter(options, cfilter); - db = CheckCompaction(db, options, roptions, woptions); + rocksdb_destroy_db(options_with_filter, dbname, &err); + rocksdb_options_set_compaction_filter(options_with_filter, cfilter); + db = CheckCompaction(db, options_with_filter, roptions, woptions); - rocksdb_options_set_compaction_filter(options, NULL); + 
rocksdb_options_set_compaction_filter(options_with_filter, NULL); rocksdb_compactionfilter_destroy(cfilter); + rocksdb_options_destroy(options_with_filter); } StartPhase("compaction_filter_factory"); { - rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_t* options_with_filter_factory = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter_factory, 1); rocksdb_compactionfilterfactory_t* factory; factory = rocksdb_compactionfilterfactory_create( NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName); // Create new database rocksdb_close(db); - rocksdb_destroy_db(options, dbname, &err); - rocksdb_options_set_compaction_filter_factory(options, factory); - db = CheckCompaction(db, options, roptions, woptions); - - rocksdb_options_set_compaction_filter_factory(options, NULL); + rocksdb_destroy_db(options_with_filter_factory, dbname, &err); + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + factory); + db = CheckCompaction(db, options_with_filter_factory, roptions, woptions); + + rocksdb_options_set_compaction_filter_factory( + options_with_filter_factory, NULL); + rocksdb_options_destroy(options_with_filter_factory); } StartPhase("compaction_filter_v2"); From 5594d446ffa256ccecbbeb28c8c22561752707c7 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 15:04:39 -0700 Subject: [PATCH 385/829] unfriend DBImpl and InternalStats from VersionStorageInfo Summary: as title Test Plan: make release Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28053 --- db/internal_stats.cc | 6 +++--- db/version_set.h | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 6aaf6b2c4..a59da4317 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -378,13 +378,13 @@ void InternalStats::DumpCFStats(std::string* value) { // level order std::vector compaction_score(number_levels_, 0); for (int i = 0; i < num_levels_to_check; ++i) { - compaction_score[vstorage->compaction_level_[i]] = - vstorage->compaction_score_[i]; + compaction_score[vstorage->CompactionScoreLevel(i)] = + vstorage->CompactionScore(i); } // Count # of files being compacted for each level std::vector files_being_compacted(number_levels_, 0); for (int level = 0; level < num_levels_to_check; ++level) { - for (auto* f : vstorage->files_[level]) { + for (auto* f : vstorage->LevelFiles(level)) { if (f->being_compacted) { ++files_being_compacted[level]; } diff --git a/db/version_set.h b/db/version_set.h index 44e6f94b2..ae3d53cd2 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -345,8 +345,6 @@ class VersionStorageInfo { friend class Version; friend class VersionSet; - friend class DBImpl; - friend class InternalStats; // No copying allowed VersionStorageInfo(const VersionStorageInfo&) = delete; void operator=(const VersionStorageInfo&) = delete; From 8db24f4b35664a9c17ed0a0a37af13734d9c3e81 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 15:07:27 -0700 Subject: [PATCH 386/829] exclude mock test file from MOCK_SOURCES Summary: as title Test Plan: build with mock_env_test.cc Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28107 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform 
index 8e92b9b6b..29d94f01d 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -175,7 +175,7 @@ PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *bench*.cc -prune" PRUNE_MOCK="-name *mock*.cc -prune" PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "` -MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | sort | tr "\n" " "` +MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | grep -v "test" | sort | tr "\n" " "` set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port From 0e526eb9d70726ffa2c5c9a60298435df13c75a6 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 15:08:10 -0700 Subject: [PATCH 387/829] introduce TestMemEnv and use it in db_test Summary: TestMemEnv simulates all Env APIs using in-memory data structures. We can use it to speed up db_test run, which is now reduced ~7mins when it is enabled. We can also add features to simulate power/disk failures in the next step TestMemEnv is derived from helper/mem_env mem_env can not be used for rocksdb since some of its APIs do not give the same results as env_posix. And its file read/write is not thread safe Test Plan: make all -j32 ./db_test ./env_mem_test Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28035 --- Makefile | 4 + db/db_test.cc | 183 ++++++------- util/env_test.cc | 30 +++ util/mock_env.cc | 607 ++++++++++++++++++++++++++++++++++++++++++ util/mock_env.h | 93 +++++++ util/mock_env_test.cc | 232 ++++++++++++++++ util/testharness.cc | 4 +- util/testharness.h | 3 +- 8 files changed, 1054 insertions(+), 102 deletions(-) create mode 100644 util/mock_env.cc create mode 100644 util/mock_env.h create mode 100644 util/mock_env_test.cc diff --git a/Makefile b/Makefile index ca51442d2..d06d6f2a1 100644 --- a/Makefile +++ b/Makefile @@ -116,6 +116,7 @@ TESTS = \ log_test \ manual_compaction_test \ memenv_test \ + mock_env_test \ merge_test \ merger_test \ redis_test \ @@ -511,6 +512,9 @@ $(MEMENVLIBRARY) : $(MEMENVOBJECTS) memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_test.cc b/db/db_test.cc index 62c5e483b..4807ef121 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -45,6 +45,7 @@ #include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" +#include "util/mock_env.h" namespace rocksdb { @@ -238,6 +239,9 @@ class SpecialEnv : public EnvWrapper { return base_->Sync(); } } + uint64_t GetFileSize() { + return base_->GetFileSize(); + } }; class LogFile : public WritableFile { private: @@ -381,6 +385,7 @@ class DBTest { public: std::string dbname_; + MockEnv* mem_env_; SpecialEnv* env_; DB* db_; std::vector handles_; @@ -404,10 +409,11 @@ class DBTest { DBTest() : option_config_(kDefault), - env_(new SpecialEnv(Env::Default())) { - 
dbname_ = test::TmpDir() + "/db_test"; - Options options; - options.create_if_missing = true; + mem_env_(!getenv("MEM_ENV") ? nullptr : + new MockEnv(Env::Default())), + env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) { + dbname_ = test::TmpDir(env_) + "/db_test"; + auto options = CurrentOptions(); ASSERT_OK(DestroyDB(dbname_, options)); db_ = nullptr; Reopen(options); @@ -561,10 +567,10 @@ class DBTest { options.num_levels = 3; break; case kDBLogDir: - options.db_log_dir = test::TmpDir(); + options.db_log_dir = test::TmpDir(env_); break; case kWalDirAndMmapReads: - options.wal_dir = test::TmpDir() + "/wal"; + options.wal_dir = test::TmpDir(env_) + "/wal"; // mmap reads should be orthogonal to WalDir setting, so we piggyback to // this option config to test mmap reads as well options.allow_mmap_reads = true; @@ -633,6 +639,8 @@ class DBTest { if (set_block_based_table_factory) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); } + options.env = env_; + options.create_if_missing = true; return options; } @@ -712,8 +720,8 @@ class DBTest { ASSERT_OK(DestroyDB(dbname_, options)); } - Status ReadOnlyReopen(Options* options) { - return DB::OpenForReadOnly(*options, dbname_, &db_); + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); } Status TryReopen(const Options& options) { @@ -1266,8 +1274,9 @@ TEST(DBTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v3")); Close(); - Options options; - ASSERT_OK(ReadOnlyReopen(&options)); + auto options = CurrentOptions(); + assert(options.env = env_); + ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1285,7 +1294,7 @@ TEST(DBTest, ReadOnlyDB) { Flush(); Close(); // Now check keys in read only mode. 
- ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); } @@ -1299,19 +1308,20 @@ TEST(DBTest, CompactedDB) { options.target_file_size_base = kFileSize; options.max_bytes_for_level_base = 1 << 30; options.compression = kNoCompression; + options = CurrentOptions(options); Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); Flush(); Close(); - ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported operation in read only mode."); ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); Close(); options.max_open_files = -1; - ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported in compacted db mode."); @@ -1327,7 +1337,7 @@ TEST(DBTest, CompactedDB) { Flush(); Close(); - ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); // Fallback to read-only DB s = Put("new", "value"); ASSERT_EQ(s.ToString(), @@ -1347,7 +1357,7 @@ TEST(DBTest, CompactedDB) { Close(); // CompactedDB - ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported in compacted db mode."); @@ -1493,36 +1503,6 @@ TEST(DBTest, LevelLimitReopen) { ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -TEST(DBTest, Preallocation) { - const std::string src = dbname_ + "/alloc_test"; - unique_ptr srcfile; - const EnvOptions soptions; - ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); - srcfile->SetPreallocationBlockSize(1024 * 1024); - - // No writes should mean no preallocation - size_t block_size, last_allocated_block; - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 0UL); - - // Small write should preallocate one block - srcfile->Append("test"); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 1UL); - - // Write an entire preallocation block, make sure we increased by two. - std::string buf(block_size, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 2UL); - - // Write five more blocks at once, ensure we're where we need to be. 
- buf = std::string(block_size * 5, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 7UL); -} - TEST(DBTest, PutDeleteGet) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -3146,8 +3126,7 @@ Options DeletionTriggerOptions() { } // anonymous namespace TEST(DBTest, CompactionDeletionTrigger) { - Options options = DeletionTriggerOptions(); - options.create_if_missing = true; + Options options = CurrentOptions(DeletionTriggerOptions()); for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[2]; @@ -3184,8 +3163,7 @@ TEST(DBTest, CompactionDeletionTrigger) { TEST(DBTest, CompactionDeletionTriggerReopen) { for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[3]; - Options options = DeletionTriggerOptions(); - options.create_if_missing = true; + Options options = CurrentOptions(DeletionTriggerOptions()); DestroyAndReopen(options); Random rnd(301); @@ -3474,6 +3452,7 @@ TEST(DBTest, UniversalCompactionSizeAmplification) { options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; + options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // Trigger compaction if size amplification exceeds 110% @@ -3638,6 +3617,7 @@ TEST(DBTest, CompressedCache) { Options options; options.write_buffer_size = 64*1024; // small write buffer options.statistics = rocksdb::CreateDBStatistics(); + options = CurrentOptions(options); BlockBasedTableOptions table_options; switch (iter) { @@ -3675,6 +3655,7 @@ TEST(DBTest, CompressedCache) { // default column family doesn't have block cache Options no_block_cache_opts; no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); BlockBasedTableOptions table_options_no_bc; table_options_no_bc.no_block_cache = true; no_block_cache_opts.table_factory.reset( @@ -4587,6 +4568,7 @@ TEST(DBTest, CompactionFilterDeletesAll) { options.compaction_filter_factory = std::make_shared(); options.disable_auto_compactions = true; options.create_if_missing = true; + options = CurrentOptions(options); DestroyAndReopen(options); // put some data @@ -5685,20 +5667,20 @@ TEST(DBTest, ManualCompactionOutputPathId) { } TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - ASSERT_OK(DestroyDB(dbname, Options())); + Options options = CurrentOptions(); + std::string dbname = test::TmpDir(env_) + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, options)); // Does not exist, and create_if_missing == false: error DB* db = nullptr; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); ASSERT_TRUE(db == nullptr); // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); + options.create_if_missing = true; + s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -5706,16 +5688,16 @@ TEST(DBTest, DBOpen_Options) { db = nullptr; // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, dbname, &db); + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != 
nullptr); ASSERT_TRUE(db == nullptr); // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -5724,57 +5706,56 @@ TEST(DBTest, DBOpen_Options) { } TEST(DBTest, DBOpen_Change_NumLevels) { - Options opts; - opts.create_if_missing = true; - opts.max_background_flushes = 0; - DestroyAndReopen(opts); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_background_flushes = 0; + DestroyAndReopen(options); ASSERT_TRUE(db_ != nullptr); - CreateAndReopenWithCF({"pikachu"}, opts); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "a", "123")); ASSERT_OK(Put(1, "b", "234")); db_->CompactRange(handles_[1], nullptr, nullptr); Close(); - opts.create_if_missing = false; - opts.num_levels = 2; - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, opts); + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db_ == nullptr); } TEST(DBTest, DestroyDBMetaDatabase) { - std::string dbname = test::TmpDir() + "/db_meta"; + std::string dbname = test::TmpDir(env_) + "/db_meta"; std::string metadbname = MetaDatabaseName(dbname, 0); std::string metametadbname = MetaDatabaseName(metadbname, 0); // Destroy previous versions if they exist. Using the long way. - ASSERT_OK(DestroyDB(metametadbname, Options())); - ASSERT_OK(DestroyDB(metadbname, Options())); - ASSERT_OK(DestroyDB(dbname, Options())); + Options options = CurrentOptions(); + ASSERT_OK(DestroyDB(metametadbname, options)); + ASSERT_OK(DestroyDB(metadbname, options)); + ASSERT_OK(DestroyDB(dbname, options)); // Setup databases - Options opts; - opts.create_if_missing = true; DB* db = nullptr; - ASSERT_OK(DB::Open(opts, dbname, &db)); + ASSERT_OK(DB::Open(options, dbname, &db)); delete db; db = nullptr; - ASSERT_OK(DB::Open(opts, metadbname, &db)); + ASSERT_OK(DB::Open(options, metadbname, &db)); delete db; db = nullptr; - ASSERT_OK(DB::Open(opts, metametadbname, &db)); + ASSERT_OK(DB::Open(options, metametadbname, &db)); delete db; db = nullptr; // Delete databases - ASSERT_OK(DestroyDB(dbname, Options())); + ASSERT_OK(DestroyDB(dbname, options)); // Check if deletion worked. 
- opts.create_if_missing = false; - ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); + options.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); } // Check that number of files does not grow when writes are dropped @@ -6076,7 +6057,7 @@ TEST(DBTest, BloomFilterRate) { } TEST(DBTest, BloomFilterCompatibility) { - Options options; + Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); @@ -6105,7 +6086,7 @@ TEST(DBTest, BloomFilterCompatibility) { } TEST(DBTest, BloomFilterReverseCompatibility) { - Options options; + Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); @@ -6173,7 +6154,7 @@ class WrappedBloom : public FilterPolicy { } // namespace TEST(DBTest, BloomFilterWrapper) { - Options options; + Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; @@ -6241,8 +6222,7 @@ TEST(DBTest, SnapshotFiles) { // copy these files to a new snapshot directory std::string snapdir = dbname_ + ".snapdir/"; - std::string mkdir = "mkdir -p " + snapdir; - ASSERT_EQ(system(mkdir.c_str()), 0); + ASSERT_OK(env_->CreateDirIfMissing(snapdir)); for (unsigned int i = 0; i < files.size(); i++) { // our clients require that GetLiveFiles returns @@ -6270,7 +6250,6 @@ TEST(DBTest, SnapshotFiles) { // release file snapshot dbfull()->DisableFileDeletions(); - // overwrite one key, this key should not appear in the snapshot std::vector extras; for (unsigned int i = 0; i < 1; i++) { @@ -6285,6 +6264,7 @@ TEST(DBTest, SnapshotFiles) { std::vector cf_handles; DB* snapdb; DBOptions opts; + opts.env = env_; opts.create_if_missing = false; Status stat = DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); @@ -6446,7 +6426,7 @@ std::vector ListTableFiles(Env* env, const std::string& path) { } // namespace TEST(DBTest, FlushOneColumnFamily) { - Options options; + Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, options); @@ -6473,7 +6453,7 @@ TEST(DBTest, FlushOneColumnFamily) { // we try to create the smallest number of table files by merging // updates from multiple logs TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 5000000; CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); @@ -6528,7 +6508,7 @@ TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { // we try to create the smallest number of table files by merging // updates from multiple logs TEST(DBTest, RecoverCheckFileAmount) { - Options options; + Options options = CurrentOptions(); options.write_buffer_size = 100000; CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); @@ -6790,10 +6770,14 @@ TEST(DBTest, TransactionLogIteratorCorruptedLog) { // Corrupt this log to create a gap rocksdb::VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); - const auto logfilePath = dbname_ + "/" + 
wal_files.front()->PathName(); - ASSERT_EQ( - 0, - truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2)); + const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName(); + if (mem_env_) { + mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2); + } else { + ASSERT_EQ(0, truncate(logfile_path.c_str(), + wal_files.front()->SizeFileBytes() / 2)); + } + // Insert a new entry to a new log file Put("key1025", DummyString(10)); // Try to read from the beginning. Should stop before the gap and read less @@ -7939,6 +7923,7 @@ TEST(DBTest, FIFOCompactionTest) { if (iter == 1) { options.disable_auto_compactions = true; } + options = CurrentOptions(options); DestroyAndReopen(options); Random rnd(301); @@ -8200,7 +8185,7 @@ TEST(DBTest, TableOptionsSanitizeTest) { // block-based table BlockBasedTableOptions to; to.index_type = BlockBasedTableOptions::kHashSearch; - options = Options(); + options = CurrentOptions(); options.create_if_missing = true; options.table_factory.reset(NewBlockBasedTableFactory(to)); ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); @@ -8209,7 +8194,7 @@ TEST(DBTest, TableOptionsSanitizeTest) { } TEST(DBTest, DBIteratorBoundTest) { - Options options; + Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; diff --git a/util/env_test.cc b/util/env_test.cc index f9c2336db..3bb4fb68c 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -805,6 +805,36 @@ TEST(EnvPosixTest, LogBufferMaxSizeTest) { } } +TEST(EnvPosixTest, Preallocation) { + const std::string src = test::TmpDir() + "/" + "testfile"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/mock_env.cc b/util/mock_env.cc new file mode 100644 index 000000000..32c202beb --- /dev/null +++ b/util/mock_env.cc @@ -0,0 +1,607 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
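// A minimal usage sketch, not part of the patch itself: this is how the
// in-memory environment defined below is meant to be plugged in, mirroring
// mock_env_test.cc and db_test.cc (when the MEM_ENV environment variable is
// set). Every file the DB touches is then backed by a MemFile object rather
// than the real file system.

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "util/mock_env.h"

void RunDbEntirelyInMemory() {
  rocksdb::MockEnv mem_env(rocksdb::Env::Default());
  rocksdb::Options options;
  options.create_if_missing = true;
  options.env = &mem_env;  // route all file I/O through the mock environment
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/dir/db", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "key", "value").ok());
  delete db;  // the data only ever existed inside mem_env
}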
+ +#include "util/mock_env.h" +#include +#include +#include +#include "util/rate_limiter.h" + +namespace rocksdb { + +class MemFile { + public: + explicit MemFile(const std::string& fn) : + fn_(fn), refs_(0), size_(0), modified_time_(Now()) {} + + void Ref() { + MutexLock lock(&mutex_); + ++refs_; + } + + void Unref() { + bool do_delete = false; + { + MutexLock lock(&mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { + return size_; + } + + void Truncate(size_t size) { + MutexLock lock(&mutex_); + if (size < size_) { + data_.resize(size); + size_ = size; + } + } + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + MutexLock lock(&mutex_); + if (offset > Size()) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = Size() - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + if (scratch) { + memcpy(scratch, &(data_[offset]), n); + *result = Slice(scratch, n); + } else { + *result = Slice(&(data_[offset]), n); + } + return Status::OK(); + } + + Status Append(const Slice& data) { + MutexLock lock(&mutex_); + data_.append(data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return Status::OK(); + } + + Status Fsync() { + return Status::OK(); + } + + uint64_t ModifiedTime() const { + return modified_time_; + } + + private: + uint64_t Now() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + } + + // Private since only Unref() should be used to delete it. + ~MemFile() { + assert(refs_ == 0); + } + + // No copying allowed. + MemFile(const MemFile&); + void operator=(const MemFile&); + + const std::string fn_; + mutable port::Mutex mutex_; + int refs_; + + std::string data_; + std::atomic size_; + std::atomic modified_time_; +}; + +namespace { + +class SequentialFileImpl : public SequentialFile { + public: + explicit SequentialFileImpl(MemFile* file) : file_(file), pos_(0) { + file_->Ref(); + } + + ~SequentialFileImpl() { + file_->Unref(); + } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(pos_, n, result, scratch); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (pos_ > file_->Size()) { + return Status::IOError("pos_ > file_->Size()"); + } + const size_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += n; + return Status::OK(); + } + + private: + MemFile* file_; + size_t pos_; +}; + +class RandomAccessFileImpl : public RandomAccessFile { + public: + explicit RandomAccessFileImpl(MemFile* file) : file_(file) { + file_->Ref(); + } + + ~RandomAccessFileImpl() { + file_->Unref(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return file_->Read(offset, n, result, scratch); + } + + private: + MemFile* file_; +}; + +class WritableFileImpl : public WritableFile { + public: + WritableFileImpl(MemFile* file, RateLimiter* rate_limiter) + : file_(file), + rate_limiter_(rate_limiter) { + file_->Ref(); + } + + ~WritableFileImpl() { + file_->Unref(); + } + + virtual Status Append(const Slice& data) { + uint64_t bytes_written = 0; + while (bytes_written < data.size()) { + auto bytes = RequestToken(data.size() - bytes_written); + Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); + if 
(!s.ok()) { + return s; + } + bytes_written += bytes; + } + return Status::OK(); + } + + virtual Status Close() { + return Status::OK(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + return file_->Fsync(); + } + + virtual uint64_t GetFileSize() { + return file_->Size(); + } + + private: + inline size_t RequestToken(size_t bytes) { + if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { + bytes = std::min(bytes, + static_cast(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority_); + } + return bytes; + } + + MemFile* file_; + RateLimiter* rate_limiter_; +}; + +class MockEnvDirectory : public Directory { + public: + virtual Status Fsync() { return Status::OK(); } +}; + +class MockEnvFileLock : public FileLock { + public: + explicit MockEnvFileLock(const std::string& fname) + : fname_(fname) {} + + std::string FileName() const { + return fname_; + } + + private: + const std::string fname_; +}; + +class TestMemLogger : public Logger { + private: + std::unique_ptr file_; + std::atomic_size_t log_size_; + static const uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + bool flush_pending_; + + public: + TestMemLogger(std::unique_ptr f, Env* env, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(f)), + log_size_(0), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + virtual ~TestMemLogger() { + } + + virtual void Flush() { + if (flush_pending_) { + flush_pending_ = false; + } + last_flush_micros_ = env_->NowMicros(); + } + virtual void Logv(const char* format, va_list ap) { + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + file_->Append(Slice(base, write_size)); + flush_pending_ = true; + log_size_ += write_size; + uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const { + return log_size_; + } +}; + +} // Anonymous namespace + +MockEnv::MockEnv(Env* base_env) + : EnvWrapper(base_env) {} + +MockEnv::~MockEnv() { + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } +} + + // Partial implementation of the Env interface. 
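// Note on the model used below: MockEnv keeps no directory objects at all.
// A path such as "/dir" is reported to exist by FileExists() as soon as some
// key in file_map_ begins with "/dir/", GetChildren() rebuilds a listing by
// scanning the flat map for keys with that prefix, and CreateDir(),
// CreateDirIfMissing() and DeleteDir() are simple no-ops that return OK.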
+Status MockEnv::NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = NULL; + return Status::IOError(fn, "File not found"); + } + auto* f = file_map_[fn]; + result->reset(new SequentialFileImpl(f)); + return Status::OK(); +} + +Status MockEnv::NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = NULL; + return Status::IOError(fn, "File not found"); + } + auto* f = file_map_[fn]; + result->reset(new RandomAccessFileImpl(f)); + return Status::OK(); +} + +Status MockEnv::NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& env_options) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + DeleteFileInternal(fn); + } + MemFile* file = new MemFile(fn); + file->Ref(); + file_map_[fn] = file; + + result->reset(new WritableFileImpl(file, env_options.rate_limiter)); + return Status::OK(); +} + +Status MockEnv::NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::OK(); +} + +Status MockEnv::NewDirectory(const std::string& name, + unique_ptr* result) { + result->reset(new MockEnvDirectory()); + return Status::OK(); +} + +bool MockEnv::FileExists(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + // File exists + return true; + } + // Now also check if fn exists as a dir + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + if (filename.size() >= fn.size() + 1 && + filename[fn.size()] == '/' && + Slice(filename).starts_with(Slice(fn))) { + return true; + } + } + return false; +} + +Status MockEnv::GetChildren(const std::string& dir, + std::vector* result) { + auto d = NormalizePath(dir); + { + MutexLock lock(&mutex_); + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back(filename.substr( + d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); + } + } + } + } + result->erase(std::unique(result->begin(), result->end()), result->end()); + return Status::OK(); +} + +void MockEnv::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizePath(fname)); + if (file_map_.find(fname) == file_map_.end()) { + return; + } + + file_map_[fname]->Unref(); + file_map_.erase(fname); +} + +Status MockEnv::DeleteFile(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + + DeleteFileInternal(fn); + return Status::OK(); +} + +Status MockEnv::CreateDir(const std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::CreateDirIfMissing(const std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::DeleteDir(const std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { + 
auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + + *file_size = iter->second->Size(); + return Status::OK(); +} + +Status MockEnv::GetFileModificationTime(const std::string& fname, + uint64_t* time) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + *time = iter->second->ModifiedTime(); + return Status::OK(); +} + +Status MockEnv::RenameFile(const std::string& src, + const std::string& target) { + auto s = NormalizePath(src); + auto t = NormalizePath(target); + MutexLock lock(&mutex_); + if (file_map_.find(s) == file_map_.end()) { + return Status::IOError(s, "File not found"); + } + + DeleteFileInternal(t); + file_map_[t] = file_map_[s]; + file_map_.erase(s); + return Status::OK(); +} + +Status MockEnv::NewLogger(const std::string& fname, + shared_ptr* result) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + MemFile* file = nullptr; + if (iter == file_map_.end()) { + file = new MemFile(fn); + file->Ref(); + file_map_[fn] = file; + } else { + file = iter->second; + } + std::unique_ptr f(new WritableFileImpl(file, nullptr)); + result->reset(new TestMemLogger(std::move(f), this)); + return Status::OK(); +} + +Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { + auto fn = NormalizePath(fname); + { + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + return Status::IOError(fn, "Lock file exists"); + } + file_map_[fn] = nullptr; + } + *flock = new MockEnvFileLock(fn); + return Status::OK(); +} + +Status MockEnv::UnlockFile(FileLock* flock) { + std::string fn = dynamic_cast(flock)->FileName(); + { + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter != file_map_.end()) { + file_map_.erase(fn); + } + } + delete flock; + return Status::OK(); +} + +Status MockEnv::GetTestDirectory(std::string* path) { + *path = "/test"; + return Status::OK(); +} + + // Non-virtual functions, specific to MockEnv +Status MockEnv::Truncate(const std::string& fname, size_t size) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->Truncate(size); + return Status::OK(); +} + +std::string MockEnv::NormalizePath(const std::string path) { + std::string dst; + for (auto c : path) { + if (!dst.empty() && c == '/' && dst.back() == '/') { + continue; + } + dst.push_back(c); + } + return dst; +} + +} // namespace rocksdb diff --git a/util/mock_env.h b/util/mock_env.h new file mode 100644 index 000000000..d128c75b0 --- /dev/null +++ b/util/mock_env.h @@ -0,0 +1,93 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
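// Illustrative sketch, not part of the patch: besides the Env overrides,
// MockEnv exposes a non-virtual Truncate() helper (declared below) that tests
// can use to simulate a torn or partial write. db_test.cc, for example, cuts
// a WAL file to half its size this way when running against this environment.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>

#include "util/mock_env.h"

// 'wal_path' is a placeholder for a log file already written through
// 'mem_env' (db_test.cc obtains it from GetSortedWalFiles()).
void SimulateTornWrite(rocksdb::MockEnv* mem_env, const std::string& wal_path) {
  uint64_t file_size = 0;
  assert(mem_env->GetFileSize(wal_path, &file_size).ok());
  // Keep only the first half; readers then see a gap, much like after an
  // interrupted write on a real file system.
  assert(mem_env->Truncate(wal_path, static_cast<size_t>(file_size / 2)).ok());
}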
+#pragma once + +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class MemFile; +class MockEnv : public EnvWrapper { + public: + explicit MockEnv(Env* base_env); + + virtual ~MockEnv(); + + // Partial implementation of the Env interface. + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions); + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions); + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& env_options); + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result); + + virtual bool FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& dir, + std::vector* result); + + void DeleteFileInternal(const std::string& fname); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status CreateDir(const std::string& dirname); + + virtual Status CreateDirIfMissing(const std::string& dirname); + + virtual Status DeleteDir(const std::string& dirname); + + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time); + + virtual Status RenameFile(const std::string& src, + const std::string& target); + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result); + + virtual Status LockFile(const std::string& fname, FileLock** flock); + + virtual Status UnlockFile(FileLock* flock); + + virtual Status GetTestDirectory(std::string* path); + + // Non-virtual functions, specific to MockEnv + Status Truncate(const std::string& fname, size_t size); + + private: + std::string NormalizePath(const std::string path); + + // Map from filenames to MemFile objects, representing a simple file system. + typedef std::map FileSystem; + port::Mutex mutex_; + FileSystem file_map_; // Protected by mutex_. +}; + +} // namespace rocksdb diff --git a/util/mock_env_test.cc b/util/mock_env_test.cc new file mode 100644 index 000000000..51ae8e296 --- /dev/null +++ b/util/mock_env_test.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include "util/mock_env.h" +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/testharness.h" + +namespace rocksdb { + +class MockEnvTest { + public: + Env* env_; + const EnvOptions soptions_; + + MockEnvTest() + : env_(new MockEnv(Env::Default())) { + } + ~MockEnvTest() { + delete env_; + } +}; + +TEST(MockEnvTest, Basics) { + uint64_t file_size; + unique_ptr writable_file; + std::vector children; + + ASSERT_OK(env_->CreateDir("/dir")); + + // Check that the directory is empty. + ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + writable_file.reset(); + + // Check that the file exists. 
+ ASSERT_TRUE(env_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/f")); + ASSERT_TRUE(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that opening non-existent file fails. + unique_ptr seq_file; + unique_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file, + soptions_).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file, + soptions_).ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. + ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); +} + +TEST(MockEnvTest, ReadWrite) { + unique_ptr writable_file; + unique_ptr seq_file; + unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST(MockEnvTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. + ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST(MockEnvTest, Misc) { + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. 
+ ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST(MockEnvTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". + ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete [] scratch; +} + +TEST(MockEnvTest, DBTest) { + Options options; + options.create_if_missing = true; + options.env = env_; + DB* db; + + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/testharness.cc b/util/testharness.cc index 16773f69f..967a8f20a 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -75,9 +75,9 @@ int RunAllTests() { return 0; } -std::string TmpDir() { +std::string TmpDir(Env* env) { std::string dir; - Status s = Env::Default()->GetTestDirectory(&dir); + Status s = env->GetTestDirectory(&dir); ASSERT_TRUE(s.ok()) << s.ToString(); return dir; } diff --git a/util/testharness.h b/util/testharness.h index af4b2858c..6115d68f7 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "port/stack_trace.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" @@ -37,7 +38,7 @@ namespace test { extern int RunAllTests(); // Return the directory to use for temporary storage. -extern std::string TmpDir(); +extern std::string TmpDir(Env* env = Env::Default()); // Return a randomization seed for this run. 
Typically returns the // same number on repeated invocations of this binary, but automated From 72cb7cf201e17399226c111855eda567aaf26394 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Fri, 31 Oct 2014 15:16:31 -0700 Subject: [PATCH 388/829] Add fsync / corrupt simulation to env_mem Summary: as title Test Plan: env_mem_test Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28077 --- util/mock_env.cc | 39 ++++++++++++++++++++++++++++++++++++--- util/mock_env.h | 2 ++ util/mock_env_test.cc | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/util/mock_env.cc b/util/mock_env.cc index 32c202beb..c44592314 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -12,13 +12,16 @@ #include #include #include "util/rate_limiter.h" +#include "util/random.h" +#include "util/murmurhash.h" namespace rocksdb { class MemFile { public: explicit MemFile(const std::string& fn) : - fn_(fn), refs_(0), size_(0), modified_time_(Now()) {} + fn_(fn), refs_(0), size_(0), modified_time_(Now()), + rnd_((uint32_t)MurmurHash(fn.data(), fn.size(), 0)), fsynced_bytes_(0) {} void Ref() { MutexLock lock(&mutex_); @@ -53,6 +56,19 @@ class MemFile { } } + void CorruptBuffer() { + if (fsynced_bytes_ >= size_) { + return; + } + uint64_t buffered_bytes = size_ - fsynced_bytes_; + uint64_t start = fsynced_bytes_ + rnd_.Uniform(buffered_bytes); + uint64_t end = std::min(start + 512, size_.load()); + MutexLock lock(&mutex_); + for (uint64_t pos = start; pos < end; ++pos) { + data_[pos] = static_cast(rnd_.Uniform(256)); + } + } + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { MutexLock lock(&mutex_); if (offset > Size()) { @@ -84,6 +100,7 @@ class MemFile { } Status Fsync() { + fsynced_bytes_ = size_.load(); return Status::OK(); } @@ -110,9 +127,14 @@ class MemFile { mutable port::Mutex mutex_; int refs_; + // Data written into this file, all bytes before fsynced_bytes are + // persistent. 
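// Put differently: Fsync() advances fsynced_bytes_ to the current size, and
// CorruptBuffer() above only ever scrambles a window inside
// [fsynced_bytes_, size_), i.e. data that was buffered but never synced.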
std::string data_; std::atomic size_; std::atomic modified_time_; + + Random rnd_; + std::atomic fsynced_bytes_; }; namespace { @@ -197,7 +219,7 @@ class WritableFileImpl : public WritableFile { } virtual Status Close() { - return Status::OK(); + return file_->Fsync(); } virtual Status Flush() { @@ -581,7 +603,7 @@ Status MockEnv::GetTestDirectory(std::string* path) { return Status::OK(); } - // Non-virtual functions, specific to MockEnv +// Non-virtual functions, specific to MockEnv Status MockEnv::Truncate(const std::string& fname, size_t size) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -593,6 +615,17 @@ Status MockEnv::Truncate(const std::string& fname, size_t size) { return Status::OK(); } +Status MockEnv::CorruptBuffer(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->CorruptBuffer(); + return Status::OK(); +} + std::string MockEnv::NormalizePath(const std::string path) { std::string dst; for (auto c : path) { diff --git a/util/mock_env.h b/util/mock_env.h index d128c75b0..b92caa5cf 100644 --- a/util/mock_env.h +++ b/util/mock_env.h @@ -81,6 +81,8 @@ class MockEnv : public EnvWrapper { // Non-virtual functions, specific to MockEnv Status Truncate(const std::string& fname, size_t size); + Status CorruptBuffer(const std::string& fname); + private: std::string NormalizePath(const std::string path); diff --git a/util/mock_env_test.cc b/util/mock_env_test.cc index 51ae8e296..521f0fb1c 100644 --- a/util/mock_env_test.cc +++ b/util/mock_env_test.cc @@ -182,6 +182,45 @@ TEST(MockEnvTest, LargeWrite) { delete [] scratch; } +TEST(MockEnvTest, Corrupt) { + const std::string kGood = "this is a good string, synced to disk"; + const std::string kCorrupted = "this part may be corrupted"; + const std::string kFileName = "/dir/f"; + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_)); + ASSERT_OK(writable_file->Append(kGood)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size()); + + std::string scratch; + scratch.resize(kGood.size() + kCorrupted.size() + 16); + Slice result; + unique_ptr rand_file; + ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Sync + corrupt => no change + ASSERT_OK(writable_file->Fsync()); + ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Add new data and corrupt it + ASSERT_OK(writable_file->Append(kCorrupted)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size()); + result.clear(); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), + &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kCorrupted), 0); + // Corrupted + ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), + &result, &(scratch[0]))); + ASSERT_NE(result.compare(kCorrupted), 0); +} + TEST(MockEnvTest, DBTest) { Options options; options.create_if_missing = true; From 46c14c6661475d688a5857adf0c29534b0d8abce Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 31 Oct 2014 15:41:25 -0700 Subject: [PATCH 389/829] Fix #258. 
benchmarkharness -- make bm_min_usec uint --- util/benchharness.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/benchharness.cc b/util/benchharness.cc index 8cd37007b..fef8df56e 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -30,7 +30,7 @@ using std::vector; DEFINE_bool(benchmark, false, "Run benchmarks."); -DEFINE_int64(bm_min_usec, 100, +DEFINE_uint64(bm_min_usec, 100, "Minimum # of microseconds we'll accept for each benchmark."); DEFINE_int64(bm_min_iters, 1, From 8ddddd62d0bf7ac74cf64a024452587a007b6096 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 31 Oct 2014 16:15:15 -0700 Subject: [PATCH 390/829] Fix incorrect fixing of lint errors in ldb_cmd.cc Summary: Fix incorrect fixing of lint errors in ldb_cmd.cc Test Plan: reduce_levels_test Reviewers: igor, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28137 --- util/ldb_cmd.cc | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index c03c1b31a..1dfdd732d 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -510,7 +510,7 @@ void ManifestDumpCommand::Help(std::string* ret) { ret->append(" "); ret->append(ManifestDumpCommand::Name()); ret->append(" [--" + ARG_VERBOSE + "]"); - ret->append(" [--" + ARG_PATH + " = ]"); + ret->append(" [--" + ARG_PATH + "=]"); ret->append("\n"); } @@ -737,9 +737,9 @@ void InternalDumpCommand::Help(std::string* ret) { ret->append(InternalDumpCommand::Name()); ret->append(HelpRangeCmdArgs()); ret->append(" [--" + ARG_INPUT_KEY_HEX + "]"); - ret->append(" [--" + ARG_MAX_KEYS + " = ]"); + ret->append(" [--" + ARG_MAX_KEYS + "=]"); ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); + ret->append(" [--" + ARG_COUNT_DELIM + "=]"); ret->append(" [--" + ARG_STATS + "]"); ret->append("\n"); } @@ -922,14 +922,14 @@ void DBDumperCommand::Help(std::string* ret) { ret->append(DBDumperCommand::Name()); ret->append(HelpRangeCmdArgs()); ret->append(" [--" + ARG_TTL + "]"); - ret->append(" [--" + ARG_MAX_KEYS + " = ]"); + ret->append(" [--" + ARG_MAX_KEYS + "=]"); ret->append(" [--" + ARG_TIMESTAMP + "]"); ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); + ret->append(" [--" + ARG_COUNT_DELIM + "=]"); ret->append(" [--" + ARG_STATS + "]"); - ret->append(" [--" + ARG_TTL_BUCKET + " = ]"); - ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); - ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); + ret->append(" [--" + ARG_TTL_BUCKET + "=]"); + ret->append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret->append(" [--" + ARG_TTL_END + "=:- is exclusive]"); ret->append("\n"); } @@ -1096,8 +1096,8 @@ std::vector ReduceDBLevelsCommand::PrepareArgs( int new_levels, bool print_old_level) { std::vector ret; ret.push_back("reduce_levels"); - ret.push_back("--" + ARG_DB + " = " + db_path); - ret.push_back("--" + ARG_NEW_LEVELS + " = " + std::to_string(new_levels)); + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels)); if (print_old_level) { ret.push_back("--" + ARG_PRINT_OLD_LEVELS); } @@ -1107,7 +1107,7 @@ std::vector ReduceDBLevelsCommand::PrepareArgs( void ReduceDBLevelsCommand::Help(std::string* ret) { ret->append(" "); ret->append(ReduceDBLevelsCommand::Name()); - ret->append(" --" + ARG_NEW_LEVELS + " = "); + ret->append(" --" + ARG_NEW_LEVELS + "="); 
   ret->append(" [--" + ARG_PRINT_OLD_LEVELS + "]");
   ret->append("\n");
 }

@@ -1255,10 +1255,10 @@ void ChangeCompactionStyleCommand::Help(std::string* ret) {
   ret->append(" ");
   ret->append(ChangeCompactionStyleCommand::Name());
   ret->append(
-      " --" + ARG_OLD_COMPACTION_STYLE + " = ");
   ret->append(
-      " --" + ARG_NEW_COMPACTION_STYLE + " = ");
   ret->append("\n");
 }

@@ -1408,7 +1408,7 @@ WALDumperCommand::WALDumperCommand(
 void WALDumperCommand::Help(std::string* ret) {
   ret->append(" ");
   ret->append(WALDumperCommand::Name());
-  ret->append(" --" + ARG_WAL_FILE + " = ");
+  ret->append(" --" + ARG_WAL_FILE + "=");
   ret->append(" [--" + ARG_PRINT_HEADER + "] ");
   ret->append(" [--" + ARG_PRINT_VALUE + "] ");
   ret->append("\n");
@@ -1660,9 +1660,9 @@ void ScanCommand::Help(std::string* ret) {
   ret->append(HelpRangeCmdArgs());
   ret->append(" [--" + ARG_TTL + "]");
   ret->append(" [--" + ARG_TIMESTAMP + "]");
-  ret->append(" [--" + ARG_MAX_KEYS + " = q] ");
-  ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]");
-  ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]");
+  ret->append(" [--" + ARG_MAX_KEYS + "=q] ");
+  ret->append(" [--" + ARG_TTL_START + "=:- is inclusive]");
+  ret->append(" [--" + ARG_TTL_END + "=:- is exclusive]");
   ret->append("\n");
 }

From 74eb4fbe93394c40269933d4c0667c6b4aae41db Mon Sep 17 00:00:00 2001
From: Igor Canadi
Date: Fri, 31 Oct 2014 16:31:25 -0700
Subject: [PATCH 391/829] CompactionJob

Summary: Long awaited CompactionJob class!

Move most compaction-related things from DBImpl to CompactionJob, making
CompactionJob easier to test and understand.

Currently this is just replicating exactly the same functionality with as
little change as possible. As future work, we should:
1. Add CompactionJob tests (I think I'll do that tomorrow)
2. Reduce CompactionJob's state that it inherits from DBImpl
3. Figure out how to do yielding to flush better. Currently I implemented a
   callback as we agreed yesterday, but I don't think it's a good long-term
   solution.

This reduces db_impl.cc from 5000+ LOC to 3400!

Test Plan: make check, will add CompactionJob-specific tests, probably also
move some tests from db_test to compaction_job_test

Reviewers: rven, yhchiang, sdong, ljin

Reviewed By: ljin

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D27957
---
 db/compaction_job.cc | 1124 ++++++++++++++++++++++++++++++++++++++++++
 db/compaction_job.h  |  131 +++++
 db/db_impl.cc        | 1081 +---------------------------------------
 db/db_impl.h         |   27 -
 4 files changed, 1276 insertions(+), 1087 deletions(-)
 create mode 100644 db/compaction_job.cc
 create mode 100644 db/compaction_job.h

diff --git a/db/compaction_job.cc b/db/compaction_job.cc
new file mode 100644
index 000000000..5a1a315ff
--- /dev/null
+++ b/db/compaction_job.cc
@@ -0,0 +1,1124 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
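// --- Editorial note (not part of the original patch) ---
// The commit message above describes a three-phase driver for this class:
// Prepare() is called with the DB mutex held, Run() with the mutex released,
// and Install() after re-acquiring it, while a yield callback lets a pending
// flush preempt the compaction. The hedged sketch below restates that call
// pattern; the names mirror the DBImpl hunk later in this patch, and anything
// not shown there (such as the elided constructor arguments) is assumed.
//
//   auto yield_callback = [&]() -> uint64_t {
//     // Returns microseconds spent flushing; the job adds it to imm_micros.
//     return CallFlushDuringCompaction(c->column_family_data(),
//                                      *c->mutable_cf_options(), job_context,
//                                      log_buffer);
//   };
//   CompactionJob compaction_job(c.get(), db_options_, /* ... */,
//                                std::move(yield_callback));
//   compaction_job.Prepare();                  // REQUIRED: DB mutex held
//   mutex_.Unlock();
//   Status status = compaction_job.Run();      // REQUIRED: mutex not held
//   mutex_.Lock();
//   status = compaction_job.Install(status);   // REQUIRED: mutex held
// --- End editorial sketch ---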
+ +#include "db/compaction_job.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_helper.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/version_set.h" +#include "port/port.h" +#include "port/likely.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/iostats_context_imp.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" + +namespace rocksdb { + +struct CompactionJob::CompactionState { + Compaction* const compaction; + + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. + std::vector existing_snapshots; + + // Files produced by compaction + struct Output { + uint64_t number; + uint32_t path_id; + uint64_t file_size; + InternalKey smallest, largest; + SequenceNumber smallest_seqno, largest_seqno; + }; + std::vector outputs; + std::list allocated_file_numbers; + + // State kept for output being generated + std::unique_ptr outfile; + std::unique_ptr builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size() - 1]; } + + explicit CompactionState(Compaction* c) : compaction(c), total_bytes(0) {} + + // Create a client visible context of this compaction + CompactionFilter::Context GetFilterContextV1() { + CompactionFilter::Context context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + // Create a client visible context of this compaction + CompactionFilterContext GetFilterContext() { + CompactionFilterContext context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + std::vector key_str_buf_; + std::vector existing_value_str_buf_; + // new_value_buf_ will only be appended if a value changes + std::vector new_value_buf_; + // if values_changed_buf_[i] is true + // new_value_buf_ will add a new entry with the changed value + std::vector value_changed_buf_; + // to_delete_buf_[i] is true iff key_buf_[i] is deleted + std::vector to_delete_buf_; + + std::vector other_key_str_buf_; + std::vector other_value_str_buf_; + + std::vector combined_key_buf_; + std::vector combined_value_buf_; + + std::string cur_prefix_; + + // Buffers the kv-pair that will be run through compaction filter V2 + // in the future. + void BufferKeyValueSlices(const Slice& key, const Slice& value) { + key_str_buf_.emplace_back(key.ToString()); + existing_value_str_buf_.emplace_back(value.ToString()); + } + + // Buffers the kv-pair that will not be run through compaction filter V2 + // in the future. 
+ void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) { + other_key_str_buf_.emplace_back(key.ToString()); + other_value_str_buf_.emplace_back(value.ToString()); + } + + // Add a kv-pair to the combined buffer + void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) { + // The real strings are stored in the batch buffers + combined_key_buf_.emplace_back(key); + combined_value_buf_.emplace_back(value); + } + + // Merging the two buffers + void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) { + size_t i = 0; + size_t j = 0; + size_t total_size = key_str_buf_.size() + other_key_str_buf_.size(); + combined_key_buf_.reserve(total_size); + combined_value_buf_.reserve(total_size); + + while (i + j < total_size) { + int comp_res = 0; + if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) { + comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]); + } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) { + comp_res = 1; + } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) { + comp_res = -1; + } + if (comp_res > 0) { + AddToCombinedKeyValueSlices(other_key_str_buf_[j], + other_value_str_buf_[j]); + j++; + } else if (comp_res < 0) { + AddToCombinedKeyValueSlices(key_str_buf_[i], + existing_value_str_buf_[i]); + i++; + } + } + } + + void CleanupBatchBuffer() { + to_delete_buf_.clear(); + key_str_buf_.clear(); + existing_value_str_buf_.clear(); + new_value_buf_.clear(); + value_changed_buf_.clear(); + + to_delete_buf_.shrink_to_fit(); + key_str_buf_.shrink_to_fit(); + existing_value_str_buf_.shrink_to_fit(); + new_value_buf_.shrink_to_fit(); + value_changed_buf_.shrink_to_fit(); + + other_key_str_buf_.clear(); + other_value_str_buf_.clear(); + other_key_str_buf_.shrink_to_fit(); + other_value_str_buf_.shrink_to_fit(); + } + + void CleanupMergedBuffer() { + combined_key_buf_.clear(); + combined_value_buf_.clear(); + combined_key_buf_.shrink_to_fit(); + combined_value_buf_.shrink_to_fit(); + } +}; + +CompactionJob::CompactionJob( + Compaction* compaction, const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, + VersionSet* versions, port::Mutex* db_mutex, + std::atomic* shutting_down, FileNumToPathIdMap* pending_outputs, + LogBuffer* log_buffer, Directory* db_directory, Statistics* stats, + SnapshotList* snapshots, bool is_snapshot_supported, + std::shared_ptr table_cache, + std::function yield_callback) + : compact_(new CompactionState(compaction)), + compaction_stats_(1), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + env_options_(env_options), + env_(db_options.env), + versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + pending_outputs_(pending_outputs), + log_buffer_(log_buffer), + db_directory_(db_directory), + stats_(stats), + snapshots_(snapshots), + is_snapshot_supported_(is_snapshot_supported), + table_cache_(std::move(table_cache)), + yield_callback_(std::move(yield_callback)) {} + +void CompactionJob::Prepare() { + db_mutex_->AssertHeld(); + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + + // Generate file_levels_ for compaction berfore making Iterator + compact_->compaction->GenerateFileLevels(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + LogToBuffer( + log_buffer_, "[%s] Compacting %d@%d + %d@%d files, score %.2f", + cfd->GetName().c_str(), compact_->compaction->num_input_files(0), + compact_->compaction->level(), 
compact_->compaction->num_input_files(1), + compact_->compaction->output_level(), compact_->compaction->score()); + char scratch[2345]; + compact_->compaction->Summary(scratch, sizeof(scratch)); + LogToBuffer(log_buffer_, "[%s] Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + + assert(cfd->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + assert(compact_->builder == nullptr); + assert(!compact_->outfile); + + visible_at_tip_ = 0; + latest_snapshot_ = 0; + // TODO(icanadi) move snapshots_ out of CompactionJob + snapshots_->getAll(compact_->existing_snapshots); + if (compact_->existing_snapshots.size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = versions_->LastSequence(); + earliest_snapshot_ = visible_at_tip_; + } else { + latest_snapshot_ = compact_->existing_snapshots.back(); + // Add the current seqno as the 'latest' virtual + // snapshot to the end of this list. + compact_->existing_snapshots.push_back(versions_->LastSequence()); + earliest_snapshot_ = compact_->existing_snapshots[0]; + } + + // Is this compaction producing files at the bottommost level? + bottommost_level_ = compact_->compaction->BottomMostLevel(); + + // Allocate the output file numbers before we release the lock + AllocateCompactionOutputFileNumbers(); +} + +Status CompactionJob::Run() { + log_buffer_->FlushBufferToLog(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + + int num_output_records = 0; + const uint64_t start_micros = env_->NowMicros(); + std::unique_ptr input( + versions_->MakeInputIterator(compact_->compaction)); + input->SeekToFirst(); + + Status status; + ParsedInternalKey ikey; + std::unique_ptr compaction_filter_from_factory_v2 = + nullptr; + auto context = compact_->GetFilterContext(); + compaction_filter_from_factory_v2 = + cfd->ioptions()->compaction_filter_factory_v2->CreateCompactionFilterV2( + context); + auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); + + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + if (!compaction_filter_v2) { + status = ProcessKeyValueCompaction(&imm_micros, input.get(), false, + &num_output_records); + } else { + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. + bool prefix_initialized = false; + shared_ptr backup_input( + versions_->MakeInputIterator(compact_->compaction)); + backup_input->SeekToFirst(); + while (backup_input->Valid() && + !shutting_down_->load(std::memory_order_acquire) && + !cfd->IsDropped()) { + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. 
we should also check if flush is necessary on + // other column families, too + + imm_micros += yield_callback_(); + + Slice key = backup_input->key(); + Slice value = backup_input->value(); + + if (!ParseInternalKey(key, &ikey)) { + // log error + Log(db_options_.info_log, "[%s] Failed to parse key: %s", + cfd->GetName().c_str(), key.ToString().c_str()); + continue; + } else { + const SliceTransform* transformer = + cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); + const auto key_prefix = transformer->Transform(ikey.user_key); + if (!prefix_initialized) { + compact_->cur_prefix_ = key_prefix.ToString(); + prefix_initialized = true; + } + // If the prefix remains the same, keep buffering + if (key_prefix.compare(Slice(compact_->cur_prefix_)) == 0) { + // Apply the compaction filter V2 to all the kv pairs sharing + // the same prefix + if (ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + // Buffer all keys sharing the same prefix for CompactionFilterV2 + // Iterate through keys to check prefix + compact_->BufferKeyValueSlices(key, value); + } else { + // buffer ineligible keys + compact_->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + continue; + // finish changing values for eligible keys + } else { + // Now prefix changes, this batch is done. + // Call compaction filter on the buffered values to change the value + if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->cur_prefix_ = key_prefix.ToString(); + } + } + + // Merge this batch of data (values + ineligible keys) + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + // Done buffering for the current prefix. Spit it out to disk + // Now just iterate through all the kv-pairs + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, + &num_output_records); + + if (!status.ok()) { + break; + } + + // After writing the kv-pairs, we can safely remove the reference + // to the string buffer and clean them up + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + // Buffer the key that triggers the mismatch in prefix + if (ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + compact_->BufferKeyValueSlices(key, value); + } else { + compact_->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + if (!backup_input->Valid()) { + // If this is the single last value, we need to merge it. 
+ if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, + &num_output_records); + + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + } + } // done processing all prefix batches + // finish the last batch + if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, + &num_output_records); + } // checking for compaction filter v2 + + if (status.ok() && + (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) { + status = Status::ShutdownInProgress( + "Database shutdown or Column family drop during compaction"); + } + if (status.ok() && compact_->builder != nullptr) { + status = FinishCompactionOutputFile(input.get()); + } + if (status.ok()) { + status = input->status(); + } + input.reset(); + + if (db_directory_ && !db_options_.disableDataSync) { + db_directory_->Fsync(); + } + + compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros; + compaction_stats_.files_in_leveln = compact_->compaction->num_input_files(0); + compaction_stats_.files_in_levelnp1 = + compact_->compaction->num_input_files(1); + MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); + + int num_output_files = compact_->outputs.size(); + if (compact_->builder != nullptr) { + // An error occurred so ignore the last output. + assert(num_output_files > 0); + --num_output_files; + } + compaction_stats_.files_out_levelnp1 = num_output_files; + + uint64_t num_input_records = 0; + + for (int i = 0; i < compact_->compaction->num_input_files(0); i++) { + compaction_stats_.bytes_readn += + compact_->compaction->input(0, i)->fd.GetFileSize(); + compaction_stats_.num_input_records += + compact_->compaction->input(0, i)->num_entries; + num_input_records += compact_->compaction->input(0, i)->num_entries; + } + + for (int i = 0; i < compact_->compaction->num_input_files(1); i++) { + compaction_stats_.bytes_readnp1 += + compact_->compaction->input(1, i)->fd.GetFileSize(); + num_input_records += compact_->compaction->input(1, i)->num_entries; + } + + for (int i = 0; i < num_output_files; i++) { + compaction_stats_.bytes_written += compact_->outputs[i].file_size; + } + compaction_stats_.num_dropped_records = + static_cast(num_input_records) - num_output_records; + + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + return status; +} + +Status CompactionJob::Install(Status status) { + db_mutex_->AssertHeld(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + cfd->internal_stats()->AddCompactionStats( + compact_->compaction->output_level(), compaction_stats_); + + // if there were any unused file number (mostly in case of + // compaction error), free up the entry from pending_putputs + ReleaseCompactionUnusedFileNumbers(); + + if (status.ok()) { + status = InstallCompactionResults(); + } + VersionStorageInfo::LevelSummaryStorage tmp; + const auto& stats = compaction_stats_; + LogToBuffer(log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp), + 
(stats.bytes_readn + stats.bytes_readnp1) / + static_cast(stats.micros), + stats.bytes_written / static_cast(stats.micros), + compact_->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, + stats.bytes_written / 1048576.0, + (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / + static_cast(stats.bytes_readn), + stats.bytes_written / static_cast(stats.bytes_readn), + status.ToString().c_str(), stats.num_input_records, + stats.num_dropped_records); + + CleanupCompaction(status); + return status; +} + +// Allocate the file numbers for the output file. We allocate as +// many output file numbers as there are files in level+1 (at least one) +// Insert them into pending_outputs so that they do not get deleted. +void CompactionJob::AllocateCompactionOutputFileNumbers() { + db_mutex_->AssertHeld(); + assert(compact_->builder == nullptr); + int filesNeeded = compact_->compaction->num_input_files(1); + for (int i = 0; i < std::max(filesNeeded, 1); i++) { + uint64_t file_number = versions_->NewFileNumber(); + pending_outputs_->insert( + {file_number, compact_->compaction->GetOutputPathId()}); + compact_->allocated_file_numbers.push_back(file_number); + } +} + +Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, + Iterator* input, + bool is_compaction_v2, + int* num_output_records) { + assert(num_output_records != nullptr); + + size_t combined_idx = 0; + Status status; + std::string compaction_filter_value; + ParsedInternalKey ikey; + IterKey current_user_key; + bool has_current_user_key = false; + IterKey delete_key; + SequenceNumber last_sequence_for_key __attribute__((unused)) = + kMaxSequenceNumber; + SequenceNumber visible_in_snapshot = kMaxSequenceNumber; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator, + db_options_.info_log.get(), + cfd->ioptions()->min_partial_merge_operands, + false /* internal key corruption is expected */); + auto compaction_filter = cfd->ioptions()->compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + if (!compaction_filter) { + auto context = compact_->GetFilterContextV1(); + compaction_filter_from_factory = + cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( + context); + compaction_filter = compaction_filter_from_factory.get(); + } + + int64_t key_drop_user = 0; + int64_t key_drop_newer_entry = 0; + int64_t key_drop_obsolete = 0; + int64_t loop_cnt = 0; + while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) && + !cfd->IsDropped() && status.ok()) { + if (++loop_cnt > 1000) { + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); + key_drop_user = 0; + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + key_drop_newer_entry); + key_drop_newer_entry = 0; + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + key_drop_obsolete = 0; + } + RecordCompactionIOStats(); + loop_cnt = 0; + } + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. we should also check if flush is necessary on + // other column families, too + (*imm_micros) += yield_callback_(); + + Slice key; + Slice value; + // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch. 
+ // This prefix batch should contain results after calling + // compaction_filter_v2. + // + // If is_compaction_v2 is off, this function will go through all the + // kv-pairs in input. + if (!is_compaction_v2) { + key = input->key(); + value = input->value(); + } else { + if (combined_idx >= compact_->combined_key_buf_.size()) { + break; + } + assert(combined_idx < compact_->combined_key_buf_.size()); + key = compact_->combined_key_buf_[combined_idx]; + value = compact_->combined_value_buf_[combined_idx]; + + ++combined_idx; + } + + if (compact_->compaction->ShouldStopBefore(key) && + compact_->builder != nullptr) { + status = FinishCompactionOutputFile(input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + bool current_entry_is_merging = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + // TODO: error key stays in db forever? Figure out the intention/rationale + // v10 error v8 : we cannot hide v8 even though it's pretty obvious. + current_user_key.Clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + cfd->user_comparator()->Compare(ikey.user_key, + current_user_key.GetKey()) != 0) { + // First occurrence of this user key + current_user_key.SetKey(ikey.user_key); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + // apply the compaction filter to the first occurrence of the user key + if (compaction_filter && !is_compaction_v2 && ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. + // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + bool value_changed = false; + compaction_filter_value.clear(); + bool to_delete = compaction_filter->Filter( + compact_->compaction->level(), ikey.user_key, value, + &compaction_filter_value, &value_changed); + if (to_delete) { + // make a copy of the original key and convert it to a delete + delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence, + kTypeDeletion); + // anchor the key again + key = delete_key.GetKey(); + // needed because ikey is backed by key + ParseInternalKey(key, &ikey); + // no value associated with delete + value.clear(); + ++key_drop_user; + } else if (value_changed) { + value = compaction_filter_value; + } + } + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find + // the earlist snapshot that is affected by this kv. + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + SequenceNumber visible = + visible_at_tip_ + ? visible_at_tip_ + : is_snapshot_supported_ + ? findEarliestVisibleSnapshot(ikey.sequence, + compact_->existing_snapshots, + &prev_snapshot) + : 0; + + if (visible_in_snapshot == visible) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibily of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // TODO: why not > ? 
+ assert(last_sequence_for_key >= ikey.sequence); + drop = true; // (A) + ++key_drop_newer_entry; + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= earliest_snapshot_ && + compact_->compaction->KeyNotExistsBeyondOutputLevel( + ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + ++key_drop_obsolete; + } else if (ikey.type == kTypeMerge) { + if (!merge.HasOperator()) { + LogToBuffer(log_buffer_, "Options::merge_operator is null."); + status = Status::InvalidArgument( + "merge_operator is not properly initialized."); + break; + } + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. Turn out this + // logic could also be nicely re-used for memtable flush purge + // optimization in BuildTable. + int steps = 0; + merge.MergeUntil(input, prev_snapshot, bottommost_level_, + db_options_.statistics.get(), &steps); + // Skip the Merge ops + combined_idx = combined_idx - 1 + steps; + + current_entry_is_merging = true; + if (merge.IsSuccess()) { + // Successfully found Put/Delete/(end-of-key-range) while merging + // Get the merge result + key = merge.key(); + ParseInternalKey(key, &ikey); + value = merge.value(); + } else { + // Did not find a Put/Delete/(end-of-key-range) while merging + // We now have some stack of merge operands to write out. + // NOTE: key,value, and ikey are now referring to old entries. + // These will be correctly set below. + assert(!merge.keys().empty()); + assert(merge.keys().size() == merge.values().size()); + + // Hack to make sure last_sequence_for_key is correct + ParseInternalKey(merge.keys().front(), &ikey); + } + } + + last_sequence_for_key = ikey.sequence; + visible_in_snapshot = visible; + } + + if (!drop) { + // We may write a single key (e.g.: for Put/Delete or successful merge). + // Or we may instead have to write a sequence/list of keys. + // We have to write a sequence iff we have an unsuccessful merge + bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); + const std::deque* keys = nullptr; + const std::deque* values = nullptr; + std::deque::const_reverse_iterator key_iter; + std::deque::const_reverse_iterator value_iter; + if (has_merge_list) { + keys = &merge.keys(); + values = &merge.values(); + key_iter = keys->rbegin(); // The back (*rbegin()) is the first key + value_iter = values->rbegin(); + + key = Slice(*key_iter); + value = Slice(*value_iter); + } + + // If we have a list of keys to write, traverse the list. + // If we have a single key to write, simply write that key. + while (true) { + // Invariant: key,value,ikey will always be the next entry to write + char* kptr = (char*)key.data(); + std::string kstr; + + // Zeroing out the sequence number leads to better compression. + // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // then we can squash the seqno to zero. 
+ if (bottommost_level_ && ikey.sequence < earliest_snapshot_ && + ikey.type != kTypeMerge) { + assert(ikey.type != kTypeDeletion); + // make a copy because updating in place would cause problems + // with the priority queue that is managing the input key iterator + kstr.assign(key.data(), key.size()); + kptr = (char*)kstr.c_str(); + UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); + } + + Slice newkey(kptr, key.size()); + assert((key.clear(), 1)); // we do not need 'key' anymore + + // Open output file if necessary + if (compact_->builder == nullptr) { + status = OpenCompactionOutputFile(); + if (!status.ok()) { + break; + } + } + + SequenceNumber seqno = GetInternalKeySeqno(newkey); + if (compact_->builder->NumEntries() == 0) { + compact_->current_output()->smallest.DecodeFrom(newkey); + compact_->current_output()->smallest_seqno = seqno; + } else { + compact_->current_output()->smallest_seqno = + std::min(compact_->current_output()->smallest_seqno, seqno); + } + compact_->current_output()->largest.DecodeFrom(newkey); + compact_->builder->Add(newkey, value); + (*num_output_records)++, + compact_->current_output()->largest_seqno = + std::max(compact_->current_output()->largest_seqno, seqno); + + // Close output file if it is big enough + if (compact_->builder->FileSize() >= + compact_->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(input); + if (!status.ok()) { + break; + } + } + + // If we have a list of entries, move to next element + // If we only had one entry, then break the loop. + if (has_merge_list) { + ++key_iter; + ++value_iter; + + // If at end of list + if (key_iter == keys->rend() || value_iter == values->rend()) { + // Sanity Check: if one ends, then both end + assert(key_iter == keys->rend() && value_iter == values->rend()); + break; + } + + // Otherwise not at end of list. Update key, value, and ikey. + key = Slice(*key_iter); + value = Slice(*value_iter); + ParseInternalKey(key, &ikey); + + } else { + // Only had one item to begin with (Put/Delete) + break; + } + } // while (true) + } // if (!drop) + + // MergeUntil has moved input to the next entry + if (!current_entry_is_merging) { + input->Next(); + } + } + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry); + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + } + RecordCompactionIOStats(); + + return status; +} + +void CompactionJob::CallCompactionFilterV2( + CompactionFilterV2* compaction_filter_v2) { + if (compact_ == nullptr || compaction_filter_v2 == nullptr) { + return; + } + + // Assemble slice vectors for user keys and existing values. + // We also keep track of our parsed internal key structs because + // we may need to access the sequence number in the event that + // keys are garbage collected during the filter process. + std::vector ikey_buf; + std::vector user_key_buf; + std::vector existing_value_buf; + + for (const auto& key : compact_->key_str_buf_) { + ParsedInternalKey ikey; + ParseInternalKey(Slice(key), &ikey); + ikey_buf.emplace_back(ikey); + user_key_buf.emplace_back(ikey.user_key); + } + for (const auto& value : compact_->existing_value_str_buf_) { + existing_value_buf.emplace_back(Slice(value)); + } + + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. 
+ // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + compact_->to_delete_buf_ = compaction_filter_v2->Filter( + compact_->compaction->level(), user_key_buf, existing_value_buf, + &compact_->new_value_buf_, &compact_->value_changed_buf_); + + // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all + // kv-pairs in this compaction run needs to be deleted. + assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size()); + assert(compact_->to_delete_buf_.size() == + compact_->existing_value_str_buf_.size()); + assert(compact_->to_delete_buf_.size() == + compact_->value_changed_buf_.size()); + + int new_value_idx = 0; + for (unsigned int i = 0; i < compact_->to_delete_buf_.size(); ++i) { + if (compact_->to_delete_buf_[i]) { + // update the string buffer directly + // the Slice buffer points to the updated buffer + UpdateInternalKey(&compact_->key_str_buf_[i][0], + compact_->key_str_buf_[i].size(), ikey_buf[i].sequence, + kTypeDeletion); + + // no value associated with delete + compact_->existing_value_str_buf_[i].clear(); + RecordTick(stats_, COMPACTION_KEY_DROP_USER); + } else if (compact_->value_changed_buf_[i]) { + compact_->existing_value_str_buf_[i] = + compact_->new_value_buf_[new_value_idx++]; + } + } // for +} + +Status CompactionJob::FinishCompactionOutputFile(Iterator* input) { + assert(compact_ != nullptr); + assert(compact_->outfile); + assert(compact_->builder != nullptr); + + const uint64_t output_number = compact_->current_output()->number; + const uint32_t output_path_id = compact_->current_output()->path_id; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact_->builder->NumEntries(); + if (s.ok()) { + s = compact_->builder->Finish(); + } else { + compact_->builder->Abandon(); + } + const uint64_t current_bytes = compact_->builder->FileSize(); + compact_->current_output()->file_size = current_bytes; + compact_->total_bytes += current_bytes; + compact_->builder.reset(); + + // Finish and check for file errors + if (s.ok() && !db_options_.disableDataSync) { + if (db_options_.use_fsync) { + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = compact_->outfile->Fsync(); + } else { + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = compact_->outfile->Sync(); + } + } + if (s.ok()) { + s = compact_->outfile->Close(); + } + compact_->outfile.reset(); + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + FileDescriptor fd(output_number, output_path_id, current_bytes); + Iterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), env_options_, cfd->internal_comparator(), fd); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes", + cfd->GetName().c_str(), output_number, current_entries, + current_bytes); + } + } + return s; +} + +Status CompactionJob::InstallCompactionResults() { + db_mutex_->AssertHeld(); + + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact_. 
+ if (!versions_->VerifyCompactionFileConsistency(compact_->compaction)) { + Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + compact_->compaction->column_family_data()->GetName().c_str(), + compact_->compaction->num_input_files(0), compact_->compaction->level(), + compact_->compaction->num_input_files(1), + compact_->compaction->output_level()); + return Status::Corruption("Compaction input files inconsistent"); + } + + LogToBuffer(log_buffer_, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", + compact_->compaction->column_family_data()->GetName().c_str(), + compact_->compaction->num_input_files(0), + compact_->compaction->level(), + compact_->compaction->num_input_files(1), + compact_->compaction->output_level(), + static_cast(compact_->total_bytes)); + + // Add compaction outputs + compact_->compaction->AddInputDeletions(compact_->compaction->edit()); + for (size_t i = 0; i < compact_->outputs.size(); i++) { + const CompactionState::Output& out = compact_->outputs[i]; + compact_->compaction->edit()->AddFile( + compact_->compaction->output_level(), out.number, out.path_id, + out.file_size, out.smallest, out.largest, out.smallest_seqno, + out.largest_seqno); + } + return versions_->LogAndApply( + compact_->compaction->column_family_data(), mutable_cf_options_, + compact_->compaction->edit(), db_mutex_, db_directory_); +} + +// Given a sequence number, return the sequence number of the +// earliest snapshot that this sequence number is visible in. +// The snapshots themselves are arranged in ascending order of +// sequence numbers. +// Employ a sequential search because the total number of +// snapshots are typically small. +inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot( + SequenceNumber in, const std::vector& snapshots, + SequenceNumber* prev_snapshot) { + SequenceNumber prev __attribute__((unused)) = 0; + for (const auto cur : snapshots) { + assert(prev <= cur); + if (cur >= in) { + *prev_snapshot = prev; + return cur; + } + prev = cur; // assignment + assert(prev); + } + Log(db_options_.info_log, + "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, + snapshots[snapshots.size() - 1]); + assert(0); + return 0; +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +// Frees up unused file number. +void CompactionJob::ReleaseCompactionUnusedFileNumbers() { + db_mutex_->AssertHeld(); + for (const auto file_number : compact_->allocated_file_numbers) { + pending_outputs_->erase(file_number); + } +} + +Status CompactionJob::OpenCompactionOutputFile() { + assert(compact_ != nullptr); + assert(compact_->builder == nullptr); + uint64_t file_number; + // If we have not yet exhausted the pre-allocated file numbers, + // then use the one from the front. Otherwise, we have to acquire + // the heavyweight lock and allocate a new file number. 
+ if (!compact_->allocated_file_numbers.empty()) { + file_number = compact_->allocated_file_numbers.front(); + compact_->allocated_file_numbers.pop_front(); + } else { + db_mutex_->Lock(); + file_number = versions_->NewFileNumber(); + pending_outputs_->insert( + {file_number, compact_->compaction->GetOutputPathId()}); + db_mutex_->Unlock(); + } + // Make the output file + std::string fname = TableFileName(db_options_.db_paths, file_number, + compact_->compaction->GetOutputPathId()); + Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[%s] OpenCompactionOutputFiles for table #%" PRIu64 + " " + "fails at NewWritableFile with status %s", + compact_->compaction->column_family_data()->GetName().c_str(), + file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + return s; + } + CompactionState::Output out; + out.number = file_number; + out.path_id = compact_->compaction->GetOutputPathId(); + out.smallest.Clear(); + out.largest.Clear(); + out.smallest_seqno = out.largest_seqno = 0; + + compact_->outputs.push_back(out); + compact_->outfile->SetIOPriority(Env::IO_LOW); + compact_->outfile->SetPreallocationBlockSize( + compact_->compaction->OutputFilePreallocationSize(mutable_cf_options_)); + + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + compact_->builder.reset(NewTableBuilder( + *cfd->ioptions(), cfd->internal_comparator(), compact_->outfile.get(), + compact_->compaction->OutputCompressionType(), + cfd->ioptions()->compression_opts)); + LogFlush(db_options_.info_log); + return s; +} + +void CompactionJob::CleanupCompaction(Status status) { + db_mutex_->AssertHeld(); + if (compact_->builder != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + compact_->builder->Abandon(); + compact_->builder.reset(); + } else { + assert(!status.ok() || compact_->outfile == nullptr); + } + for (size_t i = 0; i < compact_->outputs.size(); i++) { + const CompactionState::Output& out = compact_->outputs[i]; + pending_outputs_->erase(out.number); + + // If this file was inserted into the table cache then remove + // them here because this compaction was not committed. + if (!status.ok()) { + TableCache::Evict(table_cache_.get(), out.number); + } + } + delete compact_; + compact_ = nullptr; +} + +} // namespace rocksdb diff --git a/db/compaction_job.h b/db/compaction_job.h new file mode 100644 index 000000000..7b91e012a --- /dev/null +++ b/db/compaction_job.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
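// --- Editorial note (not part of the original patch) ---
// The header below declares findEarliestVisibleSnapshot(), which the .cc file
// above documents as a sequential search over the ascending snapshot list.
// The standalone sketch restates that lookup with hypothetical names
// (EarliestVisibleSnapshot, plain uint64_t sequence numbers); unlike the real
// helper, it returns 0 instead of asserting when no snapshot is >= seq.
// Example: EarliestVisibleSnapshot(7, {5, 10, 20}, &prev) returns 10, prev == 5.
#include <cstdint>
#include <vector>

static uint64_t EarliestVisibleSnapshot(uint64_t seq,
                                        const std::vector<uint64_t>& snapshots,
                                        uint64_t* prev_snapshot) {
  uint64_t prev = 0;
  for (uint64_t cur : snapshots) {  // snapshots are sorted in ascending order
    if (cur >= seq) {
      *prev_snapshot = prev;  // 0 means there is no earlier snapshot
      return cur;             // earliest snapshot that can see `seq`
    }
    prev = cur;
  }
  *prev_snapshot = prev;
  return 0;  // `seq` is newer than every snapshot, i.e. visible at tip
}
// --- End editorial sketch ---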
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/column_family.h" +#include "db/version_edit.h" +#include "db/memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" +#include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" +#include "db/job_context.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class CompactionJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while CompactionJob is + // alive + CompactionJob(Compaction* compaction, const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + port::Mutex* db_mutex, std::atomic* shutting_down, + FileNumToPathIdMap* pending_outputs, LogBuffer* log_buffer, + Directory* db_directory, Statistics* stats, + SnapshotList* snapshot_list, bool is_snapshot_supported, + std::shared_ptr table_cache, + std::function yield_callback); + + ~CompactionJob() { assert(compact_ == nullptr); } + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + void Prepare(); + // REQUIRED mutex not held + Status Run(); + // REQUIRED: mutex held + // status is the return of Run() + Status Install(Status status); + + private: + void AllocateCompactionOutputFileNumbers(); + // Call compaction filter if is_compaction_v2 is not true. 
Then iterate + // through input and compact the kv-pairs + Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input, + bool is_compaction_v2, + int* num_output_records); + // Call compaction_filter_v2->Filter() on kv-pairs in compact + void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2); + Status FinishCompactionOutputFile(Iterator* input); + Status InstallCompactionResults(); + SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, const std::vector& snapshots, + SequenceNumber* prev_snapshot); + void RecordCompactionIOStats(); + void ReleaseCompactionUnusedFileNumbers(); + Status OpenCompactionOutputFile(); + void CleanupCompaction(Status status); + + // CompactionJob state + struct CompactionState; + CompactionState* compact_; + + bool bottommost_level_; + SequenceNumber earliest_snapshot_; + SequenceNumber visible_at_tip_; + SequenceNumber latest_snapshot_; + + InternalStats::CompactionStats compaction_stats_; + + // DBImpl state + const DBOptions& db_options_; + const MutableCFOptions& mutable_cf_options_; + const EnvOptions& env_options_; + Env* env_; + VersionSet* versions_; + port::Mutex* db_mutex_; + std::atomic* shutting_down_; + FileNumToPathIdMap* pending_outputs_; + LogBuffer* log_buffer_; + Directory* db_directory_; + Statistics* stats_; + SnapshotList* snapshots_; + bool is_snapshot_supported_; + std::shared_ptr table_cache_; + + // yield callback + std::function yield_callback_; +}; + +} // namespace rocksdb diff --git a/db/db_impl.cc b/db/db_impl.cc index 5b2635d1a..2fbd40637 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -28,6 +28,7 @@ #include "db/builder.h" #include "db/flush_job.h" +#include "db/compaction_job.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/filename.h" @@ -93,148 +94,6 @@ struct DBImpl::WriteContext { } }; -struct DBImpl::CompactionState { - Compaction* const compaction; - - // If there were two snapshots with seq numbers s1 and - // s2 and s1 < s2, and if we find two instances of a key k1 then lies - // entirely within s1 and s2, then the earlier version of k1 can be safely - // deleted because that version is not visible in any snapshot. 
- std::vector existing_snapshots; - - // Files produced by compaction - struct Output { - uint64_t number; - uint32_t path_id; - uint64_t file_size; - InternalKey smallest, largest; - SequenceNumber smallest_seqno, largest_seqno; - }; - std::vector outputs; - std::list allocated_file_numbers; - - // State kept for output being generated - unique_ptr outfile; - unique_ptr builder; - - uint64_t total_bytes; - - Output* current_output() { return &outputs[outputs.size()-1]; } - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0) { - } - - // Create a client visible context of this compaction - CompactionFilter::Context GetFilterContextV1() { - CompactionFilter::Context context; - context.is_full_compaction = compaction->IsFullCompaction(); - context.is_manual_compaction = compaction->IsManualCompaction(); - return context; - } - - // Create a client visible context of this compaction - CompactionFilterContext GetFilterContext() { - CompactionFilterContext context; - context.is_full_compaction = compaction->IsFullCompaction(); - context.is_manual_compaction = compaction->IsManualCompaction(); - return context; - } - - std::vector key_str_buf_; - std::vector existing_value_str_buf_; - // new_value_buf_ will only be appended if a value changes - std::vector new_value_buf_; - // if values_changed_buf_[i] is true - // new_value_buf_ will add a new entry with the changed value - std::vector value_changed_buf_; - // to_delete_buf_[i] is true iff key_buf_[i] is deleted - std::vector to_delete_buf_; - - std::vector other_key_str_buf_; - std::vector other_value_str_buf_; - - std::vector combined_key_buf_; - std::vector combined_value_buf_; - - std::string cur_prefix_; - - // Buffers the kv-pair that will be run through compaction filter V2 - // in the future. - void BufferKeyValueSlices(const Slice& key, const Slice& value) { - key_str_buf_.emplace_back(key.ToString()); - existing_value_str_buf_.emplace_back(value.ToString()); - } - - // Buffers the kv-pair that will not be run through compaction filter V2 - // in the future. 
- void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) { - other_key_str_buf_.emplace_back(key.ToString()); - other_value_str_buf_.emplace_back(value.ToString()); - } - - // Add a kv-pair to the combined buffer - void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) { - // The real strings are stored in the batch buffers - combined_key_buf_.emplace_back(key); - combined_value_buf_.emplace_back(value); - } - - // Merging the two buffers - void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) { - size_t i = 0; - size_t j = 0; - size_t total_size = key_str_buf_.size() + other_key_str_buf_.size(); - combined_key_buf_.reserve(total_size); - combined_value_buf_.reserve(total_size); - - while (i + j < total_size) { - int comp_res = 0; - if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) { - comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]); - } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) { - comp_res = 1; - } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) { - comp_res = -1; - } - if (comp_res > 0) { - AddToCombinedKeyValueSlices(other_key_str_buf_[j], other_value_str_buf_[j]); - j++; - } else if (comp_res < 0) { - AddToCombinedKeyValueSlices(key_str_buf_[i], existing_value_str_buf_[i]); - i++; - } - } - } - - void CleanupBatchBuffer() { - to_delete_buf_.clear(); - key_str_buf_.clear(); - existing_value_str_buf_.clear(); - new_value_buf_.clear(); - value_changed_buf_.clear(); - - to_delete_buf_.shrink_to_fit(); - key_str_buf_.shrink_to_fit(); - existing_value_str_buf_.shrink_to_fit(); - new_value_buf_.shrink_to_fit(); - value_changed_buf_.shrink_to_fit(); - - other_key_str_buf_.clear(); - other_value_str_buf_.clear(); - other_key_str_buf_.shrink_to_fit(); - other_value_str_buf_.shrink_to_fit(); - } - - void CleanupMergedBuffer() { - combined_key_buf_.clear(); - combined_value_buf_.clear(); - combined_key_buf_.shrink_to_fit(); - combined_value_buf_.shrink_to_fit(); - } -}; - Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, const Options& src) { @@ -1608,13 +1467,6 @@ void DBImpl::RecordFlushIOStats() { IOSTATS_RESET(bytes_written); } -void DBImpl::RecordCompactionIOStats() { - RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); - IOSTATS_RESET(bytes_read); - RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); - IOSTATS_RESET(bytes_written); -} - void DBImpl::BGWorkFlush(void* db) { IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); reinterpret_cast(db)->BackgroundCallFlush(); @@ -1961,10 +1813,26 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, *madeProgress = true; } else { MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. 
- CompactionState* compact = new CompactionState(c.get()); - status = DoCompactionWork(compact, *c->mutable_cf_options(), job_context, - log_buffer); - CleanupCompaction(compact, status); + + auto yield_callback = [&]() { + return CallFlushDuringCompaction(c->column_family_data(), + *c->mutable_cf_options(), job_context, + log_buffer); + }; + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), env_options_, + versions_.get(), &mutex_, &shutting_down_, &pending_outputs_, + log_buffer, db_directory_.get(), stats_, &snapshots_, + IsSnapshotSupported(), table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + mutex_.Unlock(); + status = compaction_job.Run(); + mutex_.Lock(); + status = compaction_job.Install(status); + if (status.ok()) { + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); + } c->ReleaseCompactionFiles(status); c->ReleaseInputs(); *madeProgress = true; @@ -2020,226 +1888,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, return status; } -void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { - mutex_.AssertHeld(); - if (compact->builder != nullptr) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - compact->builder.reset(); - } else { - assert(!status.ok() || compact->outfile == nullptr); - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - - // If this file was inserted into the table cache then remove - // them here because this compaction was not committed. - if (!status.ok()) { - TableCache::Evict(table_cache_.get(), out.number); - } - } - delete compact; -} - -// Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1 (at least one) -// Insert them into pending_outputs so that they do not get deleted. -void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { - mutex_.AssertHeld(); - assert(compact != nullptr); - assert(compact->builder == nullptr); - int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < std::max(filesNeeded, 1); i++) { - uint64_t file_number = versions_->NewFileNumber(); - pending_outputs_[file_number] = compact->compaction->GetOutputPathId(); - compact->allocated_file_numbers.push_back(file_number); - } -} - -// Frees up unused file number. -void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { - mutex_.AssertHeld(); - for (const auto file_number : compact->allocated_file_numbers) { - pending_outputs_.erase(file_number); - } -} - -Status DBImpl::OpenCompactionOutputFile( - CompactionState* compact, const MutableCFOptions& mutable_cf_options) { - assert(compact != nullptr); - assert(compact->builder == nullptr); - uint64_t file_number; - // If we have not yet exhausted the pre-allocated file numbers, - // then use the one from the front. Otherwise, we have to acquire - // the heavyweight lock and allocate a new file number. 
- if (!compact->allocated_file_numbers.empty()) { - file_number = compact->allocated_file_numbers.front(); - compact->allocated_file_numbers.pop_front(); - } else { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_[file_number] = compact->compaction->GetOutputPathId(); - mutex_.Unlock(); - } - // Make the output file - std::string fname = TableFileName(db_options_.db_paths, file_number, - compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, env_options_); - - if (!s.ok()) { - Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, - "[%s] OpenCompactionOutputFiles for table #%" PRIu64 " " - "fails at NewWritableFile with status %s", - compact->compaction->column_family_data()->GetName().c_str(), - file_number, s.ToString().c_str()); - LogFlush(db_options_.info_log); - return s; - } - CompactionState::Output out; - out.number = file_number; - out.path_id = compact->compaction->GetOutputPathId(); - out.smallest.Clear(); - out.largest.Clear(); - out.smallest_seqno = out.largest_seqno = 0; - - compact->outputs.push_back(out); - compact->outfile->SetIOPriority(Env::IO_LOW); - compact->outfile->SetPreallocationBlockSize( - compact->compaction->OutputFilePreallocationSize(mutable_cf_options)); - - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - compact->builder.reset(NewTableBuilder( - *cfd->ioptions(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType(), - cfd->ioptions()->compression_opts)); - LogFlush(db_options_.info_log); - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != nullptr); - assert(compact->outfile); - assert(compact->builder != nullptr); - - const uint64_t output_number = compact->current_output()->number; - const uint32_t output_path_id = compact->current_output()->path_id; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - compact->builder.reset(); - - // Finish and check for file errors - if (s.ok() && !db_options_.disableDataSync) { - if (db_options_.use_fsync) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact->outfile->Fsync(); - } else { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact->outfile->Sync(); - } - } - if (s.ok()) { - s = compact->outfile->Close(); - } - compact->outfile.reset(); - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - FileDescriptor fd(output_number, output_path_id, current_bytes); - Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), env_options_, cfd->internal_comparator(), fd); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 - " keys, %" PRIu64 " bytes", - cfd->GetName().c_str(), output_number, current_entries, - current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer) { - mutex_.AssertHeld(); - - // paranoia: 
verify that the files that we started with - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact. - if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", - compact->compaction->column_family_data()->GetName().c_str(), - compact->compaction->num_input_files(0), compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level()); - return Status::Corruption("Compaction input files inconsistent"); - } - - LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->column_family_data()->GetName().c_str(), - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level(), - static_cast(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile(compact->compaction->output_level(), - out.number, out.path_id, out.file_size, - out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); - } - return versions_->LogAndApply(compact->compaction->column_family_data(), - mutable_cf_options, - compact->compaction->edit(), &mutex_, - db_directory_.get()); -} - -// Given a sequence number, return the sequence number of the -// earliest snapshot that this sequence number is visible in. -// The snapshots themselves are arranged in ascending order of -// sequence numbers. -// Employ a sequential search because the total number of -// snapshots are typically small. 
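Restated: the snapshot list is sorted ascending, so the function below linearly scans for the first snapshot at or above the entry's sequence number and remembers the one just under it. A minimal stand-alone version of that search (plain integers in place of SequenceNumber, without the real function's logging and assertions):

  #include <cstdint>
  #include <vector>

  // Returns the earliest snapshot that can see an entry with sequence `seq`,
  // i.e. the first element >= seq; *prev receives the largest snapshot below
  // seq (0 if none). `snapshots` must be sorted ascending and is expected to
  // contain at least one element >= seq, as in the original code.
  uint64_t EarliestVisibleSnapshot(uint64_t seq,
                                   const std::vector<uint64_t>& snapshots,
                                   uint64_t* prev) {
    *prev = 0;
    for (uint64_t cur : snapshots) {  // linear scan: snapshot lists stay small
      if (cur >= seq) {
        return cur;
      }
      *prev = cur;
    }
    return 0;  // unreachable when the precondition above holds
  }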
-inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( - SequenceNumber in, std::vector& snapshots, - SequenceNumber* prev_snapshot) { - SequenceNumber prev __attribute__((unused)) = 0; - for (const auto cur : snapshots) { - assert(prev <= cur); - if (cur >= in) { - *prev_snapshot = prev; - return cur; - } - prev = cur; // assignment - assert(prev); - } - Log(db_options_.info_log, - "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, - snapshots[snapshots.size() - 1]); - assert(0); - return 0; -} - uint64_t DBImpl::CallFlushDuringCompaction( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, JobContext* job_context, LogBuffer* log_buffer) { @@ -2264,693 +1912,6 @@ uint64_t DBImpl::CallFlushDuringCompaction( return 0; } -Status DBImpl::ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, - SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, JobContext* job_context, - bool bottommost_level, int64_t* imm_micros, Iterator* input, - CompactionState* compact, bool is_compaction_v2, int* num_output_records, - LogBuffer* log_buffer) { - assert(num_output_records != nullptr); - - size_t combined_idx = 0; - Status status; - std::string compaction_filter_value; - ParsedInternalKey ikey; - IterKey current_user_key; - bool has_current_user_key = false; - IterKey delete_key; - SequenceNumber last_sequence_for_key __attribute__((unused)) = - kMaxSequenceNumber; - SequenceNumber visible_in_snapshot = kMaxSequenceNumber; - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - MergeHelper merge( - cfd->user_comparator(), cfd->ioptions()->merge_operator, - db_options_.info_log.get(), cfd->ioptions()->min_partial_merge_operands, - false /* internal key corruption is expected */); - auto compaction_filter = cfd->ioptions()->compaction_filter; - std::unique_ptr compaction_filter_from_factory = nullptr; - if (!compaction_filter) { - auto context = compact->GetFilterContextV1(); - compaction_filter_from_factory = - cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( - context); - compaction_filter = compaction_filter_from_factory.get(); - } - - int64_t key_drop_user = 0; - int64_t key_drop_newer_entry = 0; - int64_t key_drop_obsolete = 0; - int64_t loop_cnt = 0; - while (input->Valid() && !shutting_down_.load(std::memory_order_acquire) && - !cfd->IsDropped() && status.ok()) { - if (++loop_cnt > 1000) { - if (key_drop_user > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); - key_drop_user = 0; - } - if (key_drop_newer_entry > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, - key_drop_newer_entry); - key_drop_newer_entry = 0; - } - if (key_drop_obsolete > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); - key_drop_obsolete = 0; - } - RecordCompactionIOStats(); - loop_cnt = 0; - } - // FLUSH preempts compaction - // TODO(icanadi) this currently only checks if flush is necessary on - // compacting column family. we should also check if flush is necessary on - // other column families, too - (*imm_micros) += CallFlushDuringCompaction(cfd, mutable_cf_options, - job_context, log_buffer); - - Slice key; - Slice value; - // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch. - // This prefix batch should contain results after calling - // compaction_filter_v2. - // - // If is_compaction_v2 is off, this function will go through all the - // kv-pairs in input. 
- if (!is_compaction_v2) { - key = input->key(); - value = input->value(); - } else { - if (combined_idx >= compact->combined_key_buf_.size()) { - break; - } - assert(combined_idx < compact->combined_key_buf_.size()); - key = compact->combined_key_buf_[combined_idx]; - value = compact->combined_value_buf_[combined_idx]; - - ++combined_idx; - } - - if (compact->compaction->ShouldStopBefore(key) && - compact->builder != nullptr) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // Handle key/value, add to state, etc. - bool drop = false; - bool current_entry_is_merging = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - // TODO: error key stays in db forever? Figure out the intention/rationale - // v10 error v8 : we cannot hide v8 even though it's pretty obvious. - current_user_key.Clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - visible_in_snapshot = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - cfd->user_comparator()->Compare(ikey.user_key, - current_user_key.GetKey()) != 0) { - // First occurrence of this user key - current_user_key.SetKey(ikey.user_key); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - visible_in_snapshot = kMaxSequenceNumber; - // apply the compaction filter to the first occurrence of the user key - if (compaction_filter && !is_compaction_v2 && - ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. - // If the return value of the compaction filter is true, replace - // the entry with a delete marker. - bool value_changed = false; - compaction_filter_value.clear(); - bool to_delete = compaction_filter->Filter( - compact->compaction->level(), ikey.user_key, value, - &compaction_filter_value, &value_changed); - if (to_delete) { - // make a copy of the original key and convert it to a delete - delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence, - kTypeDeletion); - // anchor the key again - key = delete_key.GetKey(); - // needed because ikey is backed by key - ParseInternalKey(key, &ikey); - // no value associated with delete - value.clear(); - ++key_drop_user; - } else if (value_changed) { - value = compaction_filter_value; - } - } - } - - // If there are no snapshots, then this kv affect visibility at tip. - // Otherwise, search though all existing snapshots to find - // the earlist snapshot that is affected by this kv. - SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot - SequenceNumber visible = visible_at_tip ? visible_at_tip : - is_snapshot_supported ? findEarliestVisibleSnapshot(ikey.sequence, - compact->existing_snapshots, &prev_snapshot) - : 0; - - if (visible_in_snapshot == visible) { - // If the earliest snapshot is which this key is visible in - // is the same as the visibily of a previous instance of the - // same key, then this kv is not visible in any snapshot. - // Hidden by an newer entry for same user key - // TODO: why not > ? 
- assert(last_sequence_for_key >= ikey.sequence); - drop = true; // (A) - ++key_drop_newer_entry; - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= earliest_snapshot && - compact->compaction->KeyNotExistsBeyondOutputLevel(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. - drop = true; - ++key_drop_obsolete; - } else if (ikey.type == kTypeMerge) { - if (!merge.HasOperator()) { - LogToBuffer(log_buffer, "Options::merge_operator is null."); - status = Status::InvalidArgument( - "merge_operator is not properly initialized."); - break; - } - // We know the merge type entry is not hidden, otherwise we would - // have hit (A) - // We encapsulate the merge related state machine in a different - // object to minimize change to the existing flow. Turn out this - // logic could also be nicely re-used for memtable flush purge - // optimization in BuildTable. - int steps = 0; - merge.MergeUntil(input, prev_snapshot, bottommost_level, - db_options_.statistics.get(), &steps); - // Skip the Merge ops - combined_idx = combined_idx - 1 + steps; - - current_entry_is_merging = true; - if (merge.IsSuccess()) { - // Successfully found Put/Delete/(end-of-key-range) while merging - // Get the merge result - key = merge.key(); - ParseInternalKey(key, &ikey); - value = merge.value(); - } else { - // Did not find a Put/Delete/(end-of-key-range) while merging - // We now have some stack of merge operands to write out. - // NOTE: key,value, and ikey are now referring to old entries. - // These will be correctly set below. - assert(!merge.keys().empty()); - assert(merge.keys().size() == merge.values().size()); - - // Hack to make sure last_sequence_for_key is correct - ParseInternalKey(merge.keys().front(), &ikey); - } - } - - last_sequence_for_key = ikey.sequence; - visible_in_snapshot = visible; - } - - if (!drop) { - // We may write a single key (e.g.: for Put/Delete or successful merge). - // Or we may instead have to write a sequence/list of keys. - // We have to write a sequence iff we have an unsuccessful merge - bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); - const std::deque* keys = nullptr; - const std::deque* values = nullptr; - std::deque::const_reverse_iterator key_iter; - std::deque::const_reverse_iterator value_iter; - if (has_merge_list) { - keys = &merge.keys(); - values = &merge.values(); - key_iter = keys->rbegin(); // The back (*rbegin()) is the first key - value_iter = values->rbegin(); - - key = Slice(*key_iter); - value = Slice(*value_iter); - } - - // If we have a list of keys to write, traverse the list. - // If we have a single key to write, simply write that key. - while (true) { - // Invariant: key,value,ikey will always be the next entry to write - char* kptr = (char*)key.data(); - std::string kstr; - - // Zeroing out the sequence number leads to better compression. - // If this is the bottommost level (no files in lower levels) - // and the earliest snapshot is larger than this seqno - // then we can squash the seqno to zero. 
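Why zeroing helps: an internal key is the user key followed by a fixed eight-byte trailer that packs the sequence number and value type, so once the sequence is squashed to zero those trailers become identical across runs of keys and compress far better. A simplified sketch of that layout (mirroring the usual (sequence << 8) | type packing; the real code goes through dbformat helpers such as UpdateInternalKey):

  #include <cstdint>
  #include <cstring>
  #include <string>

  // Build user_key + 8-byte trailer, trailer = (sequence << 8) | type.
  // (The sketch writes the trailer in host byte order; RocksDB uses a
  // fixed little-endian encoding.)
  std::string MakeInternalKey(const std::string& user_key, uint64_t sequence,
                              uint8_t type) {
    uint64_t trailer = (sequence << 8) | type;
    std::string ikey = user_key;
    char buf[sizeof(trailer)];
    std::memcpy(buf, &trailer, sizeof(trailer));
    ikey.append(buf, sizeof(buf));
    return ikey;
  }

  // Rewrite the trailer in place with sequence = 0, keeping the type byte.
  void ZeroSequence(std::string* ikey, uint8_t type) {
    uint64_t trailer = static_cast<uint64_t>(type);  // (0 << 8) | type
    std::memcpy(&(*ikey)[ikey->size() - sizeof(trailer)], &trailer,
                sizeof(trailer));
  }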
- if (bottommost_level && ikey.sequence < earliest_snapshot && - ikey.type != kTypeMerge) { - assert(ikey.type != kTypeDeletion); - // make a copy because updating in place would cause problems - // with the priority queue that is managing the input key iterator - kstr.assign(key.data(), key.size()); - kptr = (char *)kstr.c_str(); - UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); - } - - Slice newkey(kptr, key.size()); - assert((key.clear(), 1)); // we do not need 'key' anymore - - // Open output file if necessary - if (compact->builder == nullptr) { - status = OpenCompactionOutputFile(compact, mutable_cf_options); - if (!status.ok()) { - break; - } - } - - SequenceNumber seqno = GetInternalKeySeqno(newkey); - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(newkey); - compact->current_output()->smallest_seqno = seqno; - } else { - compact->current_output()->smallest_seqno = - std::min(compact->current_output()->smallest_seqno, seqno); - } - compact->current_output()->largest.DecodeFrom(newkey); - compact->builder->Add(newkey, value); - (*num_output_records)++, - compact->current_output()->largest_seqno = - std::max(compact->current_output()->largest_seqno, seqno); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // If we have a list of entries, move to next element - // If we only had one entry, then break the loop. - if (has_merge_list) { - ++key_iter; - ++value_iter; - - // If at end of list - if (key_iter == keys->rend() || value_iter == values->rend()) { - // Sanity Check: if one ends, then both end - assert(key_iter == keys->rend() && value_iter == values->rend()); - break; - } - - // Otherwise not at end of list. Update key, value, and ikey. - key = Slice(*key_iter); - value = Slice(*value_iter); - ParseInternalKey(key, &ikey); - - } else{ - // Only had one item to begin with (Put/Delete) - break; - } - } // while (true) - } // if (!drop) - - // MergeUntil has moved input to the next entry - if (!current_entry_is_merging) { - input->Next(); - } - } - if (key_drop_user > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); - } - if (key_drop_newer_entry > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry); - } - if (key_drop_obsolete > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); - } - RecordCompactionIOStats(); - - return status; -} - -void DBImpl::CallCompactionFilterV2(CompactionState* compact, - CompactionFilterV2* compaction_filter_v2) { - if (compact == nullptr || compaction_filter_v2 == nullptr) { - return; - } - - // Assemble slice vectors for user keys and existing values. - // We also keep track of our parsed internal key structs because - // we may need to access the sequence number in the event that - // keys are garbage collected during the filter process. 
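The V2 filter invoked a little further down works on whole batches: it takes parallel vectors of user keys and current values, returns one delete verdict per pair, and appends a replacement value only for the entries it flags as changed, which is why new_value_buf_ can end up shorter than to_delete_buf_. A toy, self-contained filter with that shape (not the RocksDB CompactionFilterV2 base class, just the calling convention) might look like:

  #include <string>
  #include <vector>

  // Toy batch filter with the same calling convention: one verdict per input
  // pair; a replacement value is appended to *new_values only when the
  // corresponding *value_changed flag is set.
  std::vector<bool> ToyBatchFilter(const std::vector<std::string>& keys,
                                   const std::vector<std::string>& values,
                                   std::vector<std::string>* new_values,
                                   std::vector<bool>* value_changed) {
    std::vector<bool> to_delete(keys.size(), false);
    value_changed->assign(keys.size(), false);
    for (size_t i = 0; i < keys.size(); i++) {
      if (values[i].empty()) {
        to_delete[i] = true;  // drop entries with empty values
      } else if (values[i].size() > 64) {
        new_values->push_back(values[i].substr(0, 64));  // truncate long values
        (*value_changed)[i] = true;
      }
    }
    return to_delete;
  }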
- std::vector ikey_buf; - std::vector user_key_buf; - std::vector existing_value_buf; - - for (const auto& key : compact->key_str_buf_) { - ParsedInternalKey ikey; - ParseInternalKey(Slice(key), &ikey); - ikey_buf.emplace_back(ikey); - user_key_buf.emplace_back(ikey.user_key); - } - for (const auto& value : compact->existing_value_str_buf_) { - existing_value_buf.emplace_back(Slice(value)); - } - - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. - // If the return value of the compaction filter is true, replace - // the entry with a delete marker. - compact->to_delete_buf_ = compaction_filter_v2->Filter( - compact->compaction->level(), - user_key_buf, existing_value_buf, - &compact->new_value_buf_, - &compact->value_changed_buf_); - - // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all - // kv-pairs in this compaction run needs to be deleted. - assert(compact->to_delete_buf_.size() == - compact->key_str_buf_.size()); - assert(compact->to_delete_buf_.size() == - compact->existing_value_str_buf_.size()); - assert(compact->to_delete_buf_.size() == - compact->value_changed_buf_.size()); - - int new_value_idx = 0; - for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) { - if (compact->to_delete_buf_[i]) { - // update the string buffer directly - // the Slice buffer points to the updated buffer - UpdateInternalKey(&compact->key_str_buf_[i][0], - compact->key_str_buf_[i].size(), - ikey_buf[i].sequence, - kTypeDeletion); - - // no value associated with delete - compact->existing_value_str_buf_[i].clear(); - RecordTick(stats_, COMPACTION_KEY_DROP_USER); - } else if (compact->value_changed_buf_[i]) { - compact->existing_value_str_buf_[i] = - compact->new_value_buf_[new_value_idx++]; - } - } // for -} - -Status DBImpl::DoCompactionWork(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, - JobContext* job_context, - LogBuffer* log_buffer) { - assert(compact); - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - - // Generate file_levels_ for compaction berfore making Iterator - compact->compaction->GenerateFileLevels(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - LogToBuffer( - log_buffer, - "[%s] Compacting %d@%d + %d@%d files, score %.2f slots available %d", - cfd->GetName().c_str(), compact->compaction->num_input_files(0), - compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->output_level(), compact->compaction->score(), - db_options_.max_background_compactions - bg_compaction_scheduled_); - char scratch[2345]; - compact->compaction->Summary(scratch, sizeof(scratch)); - LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", - cfd->GetName().c_str(), scratch); - - assert(cfd->current()->storage_info()->NumLevelFiles( - compact->compaction->level()) > 0); - assert(compact->builder == nullptr); - assert(!compact->outfile); - - SequenceNumber visible_at_tip = 0; - SequenceNumber earliest_snapshot; - SequenceNumber latest_snapshot = 0; - snapshots_.getAll(compact->existing_snapshots); - if (compact->existing_snapshots.size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip = versions_->LastSequence(); - earliest_snapshot = visible_at_tip; - } else { - latest_snapshot = compact->existing_snapshots.back(); - // Add the current seqno as the 'latest' virtual - // snapshot to the end 
of this list. - compact->existing_snapshots.push_back(versions_->LastSequence()); - earliest_snapshot = compact->existing_snapshots[0]; - } - - // Is this compaction producing files at the bottommost level? - bool bottommost_level = compact->compaction->BottomMostLevel(); - - // Allocate the output file numbers before we release the lock - AllocateCompactionOutputFileNumbers(compact); - - bool is_snapshot_supported = IsSnapshotSupported(); - // Release mutex while we're actually doing the compaction work - mutex_.Unlock(); - log_buffer->FlushBufferToLog(); - - int num_output_records = 0; - const uint64_t start_micros = env_->NowMicros(); - unique_ptr input(versions_->MakeInputIterator(compact->compaction)); - input->SeekToFirst(); - - Status status; - ParsedInternalKey ikey; - std::unique_ptr compaction_filter_from_factory_v2 - = nullptr; - auto context = compact->GetFilterContext(); - compaction_filter_from_factory_v2 = - cfd->ioptions()->compaction_filter_factory_v2-> - CreateCompactionFilterV2(context); - auto compaction_filter_v2 = - compaction_filter_from_factory_v2.get(); - - if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, - earliest_snapshot, latest_snapshot, job_context, bottommost_level, - &imm_micros, input.get(), compact, false, &num_output_records, - log_buffer); - } else { - // temp_backup_input always point to the start of the current buffer - // temp_backup_input = backup_input; - // iterate through input, - // 1) buffer ineligible keys and value keys into 2 separate buffers; - // 2) send value_buffer to compaction filter and alternate the values; - // 3) merge value_buffer with ineligible_value_buffer; - // 4) run the modified "compaction" using the old for loop. - bool prefix_initialized = false; - shared_ptr backup_input( - versions_->MakeInputIterator(compact->compaction)); - backup_input->SeekToFirst(); - while (backup_input->Valid() && - !shutting_down_.load(std::memory_order_acquire) && - !cfd->IsDropped()) { - // FLUSH preempts compaction - // TODO(icanadi) this currently only checks if flush is necessary on - // compacting column family. 
we should also check if flush is necessary on - // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, mutable_cf_options, - job_context, log_buffer); - - Slice key = backup_input->key(); - Slice value = backup_input->value(); - - if (!ParseInternalKey(key, &ikey)) { - // log error - Log(db_options_.info_log, "[%s] Failed to parse key: %s", - cfd->GetName().c_str(), key.ToString().c_str()); - continue; - } else { - const SliceTransform* transformer = - cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); - const auto key_prefix = transformer->Transform(ikey.user_key); - if (!prefix_initialized) { - compact->cur_prefix_ = key_prefix.ToString(); - prefix_initialized = true; - } - // If the prefix remains the same, keep buffering - if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) { - // Apply the compaction filter V2 to all the kv pairs sharing - // the same prefix - if (ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - // Buffer all keys sharing the same prefix for CompactionFilterV2 - // Iterate through keys to check prefix - compact->BufferKeyValueSlices(key, value); - } else { - // buffer ineligible keys - compact->BufferOtherKeyValueSlices(key, value); - } - backup_input->Next(); - continue; - // finish changing values for eligible keys - } else { - // Now prefix changes, this batch is done. - // Call compaction filter on the buffered values to change the value - if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->cur_prefix_ = key_prefix.ToString(); - } - } - - // Merge this batch of data (values + ineligible keys) - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - - // Done buffering for the current prefix. Spit it out to disk - // Now just iterate through all the kv-pairs - status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, - earliest_snapshot, latest_snapshot, job_context, bottommost_level, - &imm_micros, input.get(), compact, true, &num_output_records, - log_buffer); - - if (!status.ok()) { - break; - } - - // After writing the kv-pairs, we can safely remove the reference - // to the string buffer and clean them up - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - // Buffer the key that triggers the mismatch in prefix - if (ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - compact->BufferKeyValueSlices(key, value); - } else { - compact->BufferOtherKeyValueSlices(key, value); - } - backup_input->Next(); - if (!backup_input->Valid()) { - // If this is the single last value, we need to merge it. 
- if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - - status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, - earliest_snapshot, latest_snapshot, job_context, bottommost_level, - &imm_micros, input.get(), compact, true, &num_output_records, - log_buffer); - - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - } - } // done processing all prefix batches - // finish the last batch - if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - status = ProcessKeyValueCompaction( - mutable_cf_options, is_snapshot_supported, visible_at_tip, - earliest_snapshot, latest_snapshot, job_context, bottommost_level, - &imm_micros, input.get(), compact, true, &num_output_records, - log_buffer); - } // checking for compaction filter v2 - - if (status.ok() && - (shutting_down_.load(std::memory_order_acquire) || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); - } - if (status.ok() && compact->builder != nullptr) { - status = FinishCompactionOutputFile(compact, input.get()); - } - if (status.ok()) { - status = input->status(); - } - input.reset(); - - if (!db_options_.disableDataSync) { - db_directory_->Fsync(); - } - - InternalStats::CompactionStats stats(1); - stats.micros = env_->NowMicros() - start_micros - imm_micros; - stats.files_in_leveln = compact->compaction->num_input_files(0); - stats.files_in_levelnp1 = compact->compaction->num_input_files(1); - MeasureTime(stats_, COMPACTION_TIME, stats.micros); - - int num_output_files = compact->outputs.size(); - if (compact->builder != nullptr) { - // An error occurred so ignore the last output. 
- assert(num_output_files > 0); - --num_output_files; - } - stats.files_out_levelnp1 = num_output_files; - - uint64_t num_input_records = 0; - - for (int i = 0; i < compact->compaction->num_input_files(0); i++) { - stats.bytes_readn += compact->compaction->input(0, i)->fd.GetFileSize(); - stats.num_input_records += compact->compaction->input(0, i)->num_entries; - num_input_records += compact->compaction->input(0, i)->num_entries; - } - - for (int i = 0; i < compact->compaction->num_input_files(1); i++) { - stats.bytes_readnp1 += compact->compaction->input(1, i)->fd.GetFileSize(); - num_input_records += compact->compaction->input(1, i)->num_entries; - } - - for (int i = 0; i < num_output_files; i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - stats.num_dropped_records = - static_cast(num_input_records) - num_output_records; - - RecordCompactionIOStats(); - - LogFlush(db_options_.info_log); - mutex_.Lock(); - cfd->internal_stats()->AddCompactionStats( - compact->compaction->output_level(), stats); - - // if there were any unused file number (mostly in case of - // compaction error), free up the entry from pending_putputs - ReleaseCompactionUnusedFileNumbers(compact); - - if (status.ok()) { - status = InstallCompactionResults(compact, mutable_cf_options, log_buffer); - InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); - } - VersionStorageInfo::LevelSummaryStorage tmp; - LogToBuffer(log_buffer, - "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", - cfd->GetName().c_str(), - cfd->current()->storage_info()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1) / - static_cast(stats.micros), - stats.bytes_written / static_cast(stats.micros), - compact->compaction->output_level(), stats.files_in_leveln, - stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, - stats.bytes_written / 1048576.0, - (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double)stats.bytes_readn, - stats.bytes_written / (double)stats.bytes_readn, - status.ToString().c_str(), stats.num_input_records, - stats.num_dropped_records); - - return status; -} - namespace { struct IterState { IterState(DBImpl* _db, port::Mutex* _mu, SuperVersion* _super_version) diff --git a/db/db_impl.h b/db/db_impl.h index 547a85da5..5aa1eb8ed 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -308,41 +308,14 @@ class DBImpl : public DB { LogBuffer* log_buffer); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); - void CleanupCompaction(CompactionState* compact, Status status); - Status DoCompactionWork(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, - JobContext* job_context, LogBuffer* log_buffer); // This function is called as part of compaction. It enables Flush process to // preempt compaction, since it's higher prioirty - // Returns: micros spent executing uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, JobContext* job_context, LogBuffer* log_buffer); - // Call compaction filter if is_compaction_v2 is not true. 
Then iterate - // through input and compact the kv-pairs - Status ProcessKeyValueCompaction( - const MutableCFOptions& mutable_cf_options, bool is_snapshot_supported, - SequenceNumber visible_at_tip, SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, JobContext* job_context, - bool bottommost_level, int64_t* imm_micros, Iterator* input, - CompactionState* compact, bool is_compaction_v2, int* num_output_records, - LogBuffer* log_buffer); - - // Call compaction_filter_v2->Filter() on kv-pairs in compact - void CallCompactionFilterV2(CompactionState* compact, - CompactionFilterV2* compaction_filter_v2); - - Status OpenCompactionOutputFile(CompactionState* compact, - const MutableCFOptions& mutable_cf_options); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact, - const MutableCFOptions& mutable_cf_options, LogBuffer* log_buffer); - void AllocateCompactionOutputFileNumbers(CompactionState* compact); - void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); - void PrintStatistics(); // dump rocksdb.stats to LOG From 86905e3cbb0b181c6171c9ca8a4171c48974076b Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 31 Oct 2014 11:54:05 -0700 Subject: [PATCH 392/829] Move VersionBuilder logic to a separate .cc file Summary: Move all the logic of VersionBuilder to a separate .cc file Test Plan: make all check Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28083 --- db/version_builder.cc | 319 ++++++++++++++++++++++++++++++++++++++++++ db/version_builder.h | 2 + db/version_set.cc | 289 -------------------------------------- 3 files changed, 321 insertions(+), 289 deletions(-) create mode 100644 db/version_builder.cc diff --git a/db/version_builder.cc b/db/version_builder.cc new file mode 100644 index 000000000..61205704f --- /dev/null +++ b/db/version_builder.cc @@ -0,0 +1,319 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/version_builder.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "table/table_reader.h" + +namespace rocksdb { + +bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { + if (a->smallest_seqno != b->smallest_seqno) { + return a->smallest_seqno > b->smallest_seqno; + } + if (a->largest_seqno != b->largest_seqno) { + return a->largest_seqno > b->largest_seqno; + } + // Break ties by file number + return a->fd.GetNumber() > b->fd.GetNumber(); +} + +namespace { +bool BySmallestKey(FileMetaData* a, FileMetaData* b, + const InternalKeyComparator* cmp) { + int r = cmp->Compare(a->smallest, b->smallest); + if (r != 0) { + return (r < 0); + } + // Break ties by file number + return (a->fd.GetNumber() < b->fd.GetNumber()); +} +} // namespace + +class VersionBuilder::Rep { + private: + // Helper to sort files_ in v + // kLevel0 -- NewestFirstBySeqNo + // kLevelNon0 -- BySmallestKey + struct FileComparator { + enum SortMethod { + kLevel0 = 0, + kLevelNon0 = 1, + } sort_method; + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + switch (sort_method) { + case kLevel0: + return NewestFirstBySeqNo(f1, f2); + case kLevelNon0: + return BySmallestKey(f1, f2, internal_comparator); + } + assert(false); + return false; + } + }; + + typedef std::set FileSet; + struct LevelState { + std::set deleted_files; + FileSet* added_files; + }; + + const EnvOptions& env_options_; + TableCache* table_cache_; + VersionStorageInfo* base_vstorage_; + LevelState* levels_; + FileComparator level_zero_cmp_; + FileComparator level_nonzero_cmp_; + + public: + Rep(const EnvOptions& env_options, TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : env_options_(env_options), + table_cache_(table_cache), + base_vstorage_(base_vstorage) { + levels_ = new LevelState[base_vstorage_->NumberLevels()]; + level_zero_cmp_.sort_method = FileComparator::kLevel0; + level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; + level_nonzero_cmp_.internal_comparator = + base_vstorage_->InternalComparator(); + + levels_[0].added_files = new FileSet(level_zero_cmp_); + for (int level = 1; level < base_vstorage_->NumberLevels(); level++) { + levels_[level].added_files = new FileSet(level_nonzero_cmp_); + } + } + + ~Rep() { + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + const FileSet* added = levels_[level].added_files; + std::vector to_unref; + to_unref.reserve(added->size()); + for (FileSet::const_iterator it = added->begin(); + it != added->end(); ++it) { + to_unref.push_back(*it); + } + delete added; + for (uint32_t i = 0; i < to_unref.size(); i++) { + FileMetaData* f = to_unref[i]; + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + assert(table_cache_ != nullptr); + table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + delete f; + } + } + } + + delete[] levels_; + } + + void CheckConsistency(VersionStorageInfo* vstorage) { +#ifndef NDEBUG + // make sure the files are sorted correctly + for (int level = 0; level < vstorage->NumberLevels(); level++) { + auto& level_files = vstorage->LevelFiles(level); + for (size_t i = 1; i < level_files.size(); i++) { + auto f1 = level_files[i - 1]; + auto f2 = level_files[i]; + if (level == 0) { + assert(level_zero_cmp_(f1, f2)); + assert(f1->largest_seqno > 
f2->largest_seqno); + } else { + assert(level_nonzero_cmp_(f1, f2)); + + // Make sure there is no overlap in levels > 0 + if (vstorage->InternalComparator()->Compare(f1->largest, + f2->smallest) >= 0) { + fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", + (f1->largest).DebugString().c_str(), + (f2->smallest).DebugString().c_str()); + abort(); + } + } + } + } +#endif + } + + void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level) { +#ifndef NDEBUG + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_vstorage_->NumberLevels(); l++) { + const std::vector& base_files = + base_vstorage_->LevelFiles(l); + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level + 1; !found && l < base_vstorage_->NumberLevels(); + l++) { + const FileSet* added = levels_[l].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + const FileSet* added = levels_[level].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + } + if (!found) { + fprintf(stderr, "not found %" PRIu64 "\n", number); + } + assert(found); +#endif + } + + // Apply all of the edits in *edit to the current state. + void Apply(VersionEdit* edit) { + CheckConsistency(base_vstorage_); + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + } + + // Add new files + for (const auto& new_file : edit->GetNewFiles()) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); + f->refs = 1; + + levels_[level].deleted_files.erase(f->fd.GetNumber()); + levels_[level].added_files->insert(f); + } + } + + // Save the current state in *v. + void SaveTo(VersionStorageInfo* vstorage) { + CheckConsistency(base_vstorage_); + CheckConsistency(vstorage); + + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. 
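Taken together, the public workflow of this class is small: construct it against a base VersionStorageInfo, feed it VersionEdits through Apply(), then materialize the merged state with SaveTo() (the method whose body continues below) and optionally pre-open the new files with LoadTableHandlers(). A hedged sketch of a caller, assuming it sits inside namespace rocksdb (the function name and the edits vector are illustrative, not an actual call site):

  #include <vector>

  #include "db/version_builder.h"
  #include "db/version_edit.h"
  #include "db/version_set.h"

  namespace rocksdb {

  // Fold a sequence of version edits on top of a base version's file layout.
  void BuildNewVersion(const EnvOptions& env_options, TableCache* table_cache,
                       VersionStorageInfo* base_vstorage,
                       const std::vector<VersionEdit*>& edits,
                       VersionStorageInfo* new_vstorage) {
    VersionBuilder builder(env_options, table_cache, base_vstorage);
    for (VersionEdit* edit : edits) {
      builder.Apply(edit);         // record per-level added and deleted files
    }
    builder.SaveTo(new_vstorage);  // merge base + added files, skip deleted ones
    builder.LoadTableHandlers();   // optionally warm the table cache for new files
  }

  }  // namespace rocksdb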
+ const auto& base_files = base_vstorage_->LevelFiles(level); + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& added_files = *levels_[level].added_files; + vstorage->Reserve(level, base_files.size() + added_files.size()); + + for (const auto& added : added_files) { + // Add all smaller files listed in base_ + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); + base_iter != bpos; + ++base_iter) { + MaybeAddFile(vstorage, level, *base_iter); + } + + MaybeAddFile(vstorage, level, added); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(vstorage, level, *base_iter); + } + } + + CheckConsistency(vstorage); + } + + void LoadTableHandlers() { + assert(table_cache_ != nullptr); + for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + for (auto& file_meta : *(levels_[level].added_files)) { + assert(!file_meta->table_reader_handle); + table_cache_->FindTable( + env_options_, *(base_vstorage_->InternalComparator()), + file_meta->fd, &file_meta->table_reader_handle, false); + if (file_meta->table_reader_handle != nullptr) { + // Load table_reader + file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( + file_meta->table_reader_handle); + } + } + } + } + + void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { + // File is deleted: do nothing + } else { + vstorage->MaybeAddFile(level, f); + } + } +}; + +VersionBuilder::VersionBuilder(const EnvOptions& env_options, + TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : rep_(new Rep(env_options, table_cache, base_vstorage)) {} +VersionBuilder::~VersionBuilder() { delete rep_; } +void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { + rep_->CheckConsistency(vstorage); +} +void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, + uint64_t number, int level) { + rep_->CheckConsistencyForDeletes(edit, number, level); +} +void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } +void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { + rep_->SaveTo(vstorage); +} +void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); } +void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, + FileMetaData* f) { + rep_->MaybeAddFile(vstorage, level, f); +} + +} // namespace rocksdb diff --git a/db/version_builder.h b/db/version_builder.h index caeb34970..452604f17 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -37,4 +37,6 @@ class VersionBuilder { class Rep; Rep* rep_; }; + +extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b); } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 3c2c0d42e..4b068297f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -70,27 +70,6 @@ int FindFileInRange(const InternalKeyComparator& icmp, return right; } -bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->smallest_seqno != b->smallest_seqno) { - return a->smallest_seqno > b->smallest_seqno; - } - if (a->largest_seqno != b->largest_seqno) { - return a->largest_seqno > b->largest_seqno; - } - // Break ties by file number - return a->fd.GetNumber() > b->fd.GetNumber(); -} - -bool BySmallestKey(FileMetaData* a, FileMetaData* b, - const InternalKeyComparator* cmp) { - int r = cmp->Compare(a->smallest, b->smallest); - if (r != 0) { - return (r < 0); - } - // Break ties by file number - return 
(a->fd.GetNumber() < b->fd.GetNumber()); -} - // Class to help choose the next file to search for the particular key. // Searches and returns files level by level. // We can search level-by-level since entries never hop across @@ -1465,274 +1444,6 @@ struct VersionSet::ManifestWriter { : done(false), cv(mu), cfd(_cfd), edit(e) {} }; -class VersionBuilder::Rep { - private: - // Helper to sort files_ in v - // kLevel0 -- NewestFirstBySeqNo - // kLevelNon0 -- BySmallestKey - struct FileComparator { - enum SortMethod { - kLevel0 = 0, - kLevelNon0 = 1, - } sort_method; - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - switch (sort_method) { - case kLevel0: - return NewestFirstBySeqNo(f1, f2); - case kLevelNon0: - return BySmallestKey(f1, f2, internal_comparator); - } - assert(false); - return false; - } - }; - - typedef std::set FileSet; - struct LevelState { - std::set deleted_files; - FileSet* added_files; - }; - - const EnvOptions& env_options_; - TableCache* table_cache_; - VersionStorageInfo* base_vstorage_; - LevelState* levels_; - FileComparator level_zero_cmp_; - FileComparator level_nonzero_cmp_; - - public: - Rep(const EnvOptions& env_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage) - : env_options_(env_options), - table_cache_(table_cache), - base_vstorage_(base_vstorage) { - levels_ = new LevelState[base_vstorage_->NumberLevels()]; - level_zero_cmp_.sort_method = FileComparator::kLevel0; - level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = - base_vstorage_->InternalComparator(); - - levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_vstorage_->NumberLevels(); level++) { - levels_[level].added_files = new FileSet(level_nonzero_cmp_); - } - } - - ~Rep() { - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { - const FileSet* added = levels_[level].added_files; - std::vector to_unref; - to_unref.reserve(added->size()); - for (FileSet::const_iterator it = added->begin(); - it != added->end(); ++it) { - to_unref.push_back(*it); - } - delete added; - for (uint32_t i = 0; i < to_unref.size(); i++) { - FileMetaData* f = to_unref[i]; - f->refs--; - if (f->refs <= 0) { - if (f->table_reader_handle) { - assert(table_cache_ != nullptr); - table_cache_->ReleaseHandle(f->table_reader_handle); - f->table_reader_handle = nullptr; - } - delete f; - } - } - } - - delete[] levels_; - } - - void CheckConsistency(VersionStorageInfo* vstorage) { -#ifndef NDEBUG - // make sure the files are sorted correctly - for (int level = 0; level < vstorage->NumberLevels(); level++) { - auto& level_files = vstorage->LevelFiles(level); - for (size_t i = 1; i < level_files.size(); i++) { - auto f1 = level_files[i - 1]; - auto f2 = level_files[i]; - if (level == 0) { - assert(level_zero_cmp_(f1, f2)); - assert(f1->largest_seqno > f2->largest_seqno); - } else { - assert(level_nonzero_cmp_(f1, f2)); - - // Make sure there is no overlap in levels > 0 - if (vstorage->InternalComparator()->Compare(f1->largest, - f2->smallest) >= 0) { - fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", - (f1->largest).DebugString().c_str(), - (f2->smallest).DebugString().c_str()); - abort(); - } - } - } - } -#endif - } - - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level) { -#ifndef NDEBUG - // a file to be deleted better exist in the previous version - bool found = false; - for (int l = 0; !found && l < base_vstorage_->NumberLevels(); l++) { - const std::vector& base_files = - base_vstorage_->LevelFiles(l); - for (unsigned int i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - } - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - for (int l = level + 1; !found && l < base_vstorage_->NumberLevels(); - l++) { - const FileSet* added = levels_[l].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - } - - // maybe this file was added in a previous edit that was Applied - if (!found) { - const FileSet* added = levels_[level].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - } - assert(found); -#endif - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - CheckConsistency(base_vstorage_); - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); - for (const auto& del_file : del) { - const auto level = del_file.first; - const auto number = del_file.second; - levels_[level].deleted_files.insert(number); - CheckConsistencyForDeletes(edit, number, level); - } - - // Add new files - for (const auto& new_file : edit->GetNewFiles()) { - const int level = new_file.first; - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files->insert(f); - } - } - - // Save the current state in *v. - void SaveTo(VersionStorageInfo* vstorage) { - CheckConsistency(base_vstorage_); - CheckConsistency(vstorage); - - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { - const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; - // Merge the set of added files with the set of pre-existing files. - // Drop any deleted files. Store the result in *v. 
- const auto& base_files = base_vstorage_->LevelFiles(level); - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); - const auto& added_files = *levels_[level].added_files; - vstorage->Reserve(level, base_files.size() + added_files.size()); - - for (const auto& added : added_files) { - // Add all smaller files listed in base_ - for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; - ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); - } - - MaybeAddFile(vstorage, level, added); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); - } - } - - CheckConsistency(vstorage); - } - - void LoadTableHandlers() { - assert(table_cache_ != nullptr); - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { - for (auto& file_meta : *(levels_[level].added_files)) { - assert(!file_meta->table_reader_handle); - table_cache_->FindTable( - env_options_, *(base_vstorage_->InternalComparator()), - file_meta->fd, &file_meta->table_reader_handle, false); - if (file_meta->table_reader_handle != nullptr) { - // Load table_reader - file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( - file_meta->table_reader_handle); - } - } - } - } - - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // File is deleted: do nothing - } else { - vstorage->MaybeAddFile(level, f); - } - } -}; - -VersionBuilder::VersionBuilder(const EnvOptions& env_options, - TableCache* table_cache, - VersionStorageInfo* base_vstorage) - : rep_(new Rep(env_options, table_cache, base_vstorage)) {} -VersionBuilder::~VersionBuilder() { delete rep_; } -void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { - rep_->CheckConsistency(vstorage); -} -void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, - uint64_t number, int level) { - rep_->CheckConsistencyForDeletes(edit, number, level); -} -void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } -void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { - rep_->SaveTo(vstorage); -} -void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); } -void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, - FileMetaData* f) { - rep_->MaybeAddFile(vstorage, level, f); -} - VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteController* write_controller) From 27129c739fe5c6b9811049bf93be72fc0c87e254 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 1 Nov 2014 01:02:12 +0100 Subject: [PATCH 393/829] [RocksJava] KeyMayExist w/o ColumnFamilies --- java/org/rocksdb/RocksDB.java | 40 +++++++++++++++++++++ java/org/rocksdb/test/KeyMayExistTest.java | 31 +++++++++++++--- java/rocksjni/rocksjni.cc | 41 ++++++++++++++++++++-- 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 2a90c7370..f536765f8 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -419,6 +419,22 @@ public class RocksDB extends RocksObject { columnFamilyHandle.nativeHandle_); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. 
+ * + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(byte[] key, StringBuffer value){ + return keyMayExist(key, key.length, value); + } + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -438,6 +454,26 @@ public class RocksDB extends RocksObject { value); } + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ReadOptions readOptions, + byte[] key, StringBuffer value){ + return keyMayExist(readOptions.nativeHandle_, + key, key.length, value); + } + /** * If the key definitely does not exist in the database, then this method * returns false, else true. @@ -1086,8 +1122,12 @@ public class RocksDB extends RocksObject { byte[] value, int valueLen, long cfHandle) throws RocksDBException; protected native void write( long writeOptHandle, long batchHandle) throws RocksDBException; + protected native boolean keyMayExist(byte[] key, int keyLen, + StringBuffer stringBuffer); protected native boolean keyMayExist(byte[] key, int keyLen, long cfHandle, StringBuffer stringBuffer); + protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, + StringBuffer stringBuffer); protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, long cfHandle, StringBuffer stringBuffer); protected native void merge( diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index a4ecb53da..c83a70e52 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -4,10 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb.test; -import org.rocksdb.ColumnFamilyHandle; -import org.rocksdb.Options; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; +import org.rocksdb.*; import java.util.ArrayList; import java.util.List; @@ -34,13 +31,39 @@ public class KeyMayExistTest { assert(columnFamilyHandleList.size()==2); db.put("key".getBytes(), "value".getBytes()); + // Test without column family StringBuffer retValue = new StringBuffer(); + if (db.keyMayExist("key".getBytes(), retValue)) { + assert(retValue.toString().equals("value")); + } else { + assert(false); + } + // Test without column family but with readOptions + retValue = new StringBuffer(); + if (db.keyMayExist(new ReadOptions(), "key".getBytes(), + retValue)) { + assert(retValue.toString().equals("value")); + } else { + assert(false); + } + // Test with column family + retValue = new StringBuffer(); if (db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), retValue)) { assert(retValue.toString().equals("value")); } else { assert(false); } + // Test with column family and readOptions + retValue = new StringBuffer(); + if (db.keyMayExist(new ReadOptions(), + columnFamilyHandleList.get(0), "key".getBytes(), + retValue)) { + assert(retValue.toString().equals("value")); + } else { + assert(false); + } + // KeyMayExist in CF1 must return false assert(db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(), retValue) == false); System.out.println("Passed KeyMayExistTest"); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 50cd8a359..d1a8bb7be 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -390,8 +390,15 @@ jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, jboolean isCopy; jbyte* key = env->GetByteArrayElements(jkey, &isCopy); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - bool keyMaxExist = db->KeyMayExist(read_opt, cf_handle, key_slice, - &value, &value_found); + bool keyMayExist; + if (cf_handle != nullptr) { + keyMayExist = db->KeyMayExist(read_opt, cf_handle, key_slice, + &value, &value_found); + } else { + keyMayExist = db->KeyMayExist(read_opt, key_slice, + &value, &value_found); + } + if (value_found && !value.empty()) { jclass clazz = env->GetObjectClass(jvalue); jmethodID mid = env->GetMethodID(clazz, "append", @@ -400,7 +407,20 @@ jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, env->CallObjectMethod(jvalue, mid, new_value_str); } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - return static_cast(keyMaxExist); + return static_cast(keyMayExist); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: ([BILjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, + jobject jvalue) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), + nullptr, jkey, jkey_len, jvalue); } /* @@ -424,6 +444,21 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( return true; } +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: (J[BILjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jread_options_handle, + jbyteArray jkey, jint jkey_len, jobject jvalue) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto& read_options = *reinterpret_cast( + jread_options_handle); + 
return key_may_exist_helper(env, db, read_options, + nullptr, jkey, jkey_len, jvalue); +} + /* * Class: org_rocksdb_RocksDB * Method: keyMayExist From 45a612f990718634ae8f89500b53cb2c2c9ddc27 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 31 Oct 2014 19:22:42 -0700 Subject: [PATCH 394/829] Revert "Fix incorrect fixing of lint errors in ldb_cmd.cc" This reverts commit 8ddddd62d0bf7ac74cf64a024452587a007b6096. --- util/ldb_cmd.cc | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 1dfdd732d..c03c1b31a 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -510,7 +510,7 @@ void ManifestDumpCommand::Help(std::string* ret) { ret->append(" "); ret->append(ManifestDumpCommand::Name()); ret->append(" [--" + ARG_VERBOSE + "]"); - ret->append(" [--" + ARG_PATH + "=]"); + ret->append(" [--" + ARG_PATH + " = ]"); ret->append("\n"); } @@ -737,9 +737,9 @@ void InternalDumpCommand::Help(std::string* ret) { ret->append(InternalDumpCommand::Name()); ret->append(HelpRangeCmdArgs()); ret->append(" [--" + ARG_INPUT_KEY_HEX + "]"); - ret->append(" [--" + ARG_MAX_KEYS + "=]"); + ret->append(" [--" + ARG_MAX_KEYS + " = ]"); ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + "=]"); + ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); ret->append(" [--" + ARG_STATS + "]"); ret->append("\n"); } @@ -922,14 +922,14 @@ void DBDumperCommand::Help(std::string* ret) { ret->append(DBDumperCommand::Name()); ret->append(HelpRangeCmdArgs()); ret->append(" [--" + ARG_TTL + "]"); - ret->append(" [--" + ARG_MAX_KEYS + "=]"); + ret->append(" [--" + ARG_MAX_KEYS + " = ]"); ret->append(" [--" + ARG_TIMESTAMP + "]"); ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + "=]"); + ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); ret->append(" [--" + ARG_STATS + "]"); - ret->append(" [--" + ARG_TTL_BUCKET + "=]"); - ret->append(" [--" + ARG_TTL_START + "=:- is inclusive]"); - ret->append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret->append(" [--" + ARG_TTL_BUCKET + " = ]"); + ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); + ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); ret->append("\n"); } @@ -1096,8 +1096,8 @@ std::vector ReduceDBLevelsCommand::PrepareArgs( int new_levels, bool print_old_level) { std::vector ret; ret.push_back("reduce_levels"); - ret.push_back("--" + ARG_DB + "=" + db_path); - ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels)); + ret.push_back("--" + ARG_DB + " = " + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + " = " + std::to_string(new_levels)); if (print_old_level) { ret.push_back("--" + ARG_PRINT_OLD_LEVELS); } @@ -1107,7 +1107,7 @@ std::vector ReduceDBLevelsCommand::PrepareArgs( void ReduceDBLevelsCommand::Help(std::string* ret) { ret->append(" "); ret->append(ReduceDBLevelsCommand::Name()); - ret->append(" --" + ARG_NEW_LEVELS + "="); + ret->append(" --" + ARG_NEW_LEVELS + " = "); ret->append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); ret->append("\n"); } @@ -1255,10 +1255,10 @@ void ChangeCompactionStyleCommand::Help(std::string* ret) { ret->append(" "); ret->append(ChangeCompactionStyleCommand::Name()); ret->append( - " --" + ARG_OLD_COMPACTION_STYLE + "="); ret->append( - " --" + ARG_NEW_COMPACTION_STYLE + "="); ret->append("\n"); } @@ -1408,7 +1408,7 @@ WALDumperCommand::WALDumperCommand( void WALDumperCommand::Help(std::string* ret) { ret->append(" "); ret->append(WALDumperCommand::Name()); - 
ret->append(" --" + ARG_WAL_FILE + "="); + ret->append(" --" + ARG_WAL_FILE + " = "); ret->append(" [--" + ARG_PRINT_HEADER + "] "); ret->append(" [--" + ARG_PRINT_VALUE + "] "); ret->append("\n"); @@ -1660,9 +1660,9 @@ void ScanCommand::Help(std::string* ret) { ret->append(HelpRangeCmdArgs()); ret->append(" [--" + ARG_TTL + "]"); ret->append(" [--" + ARG_TIMESTAMP + "]"); - ret->append(" [--" + ARG_MAX_KEYS + "=q] "); - ret->append(" [--" + ARG_TTL_START + "=:- is inclusive]"); - ret->append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret->append(" [--" + ARG_MAX_KEYS + " = q] "); + ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); + ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); ret->append("\n"); } From 8e79ce68cef08aaa9dbc62d948d2fdefff2f315c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 31 Oct 2014 19:22:49 -0700 Subject: [PATCH 395/829] Revert "Fix lint errors and coding style of ldb related codes." This reverts commit bc9f36fd5e5f0eae69a5a1b7269bb2623cc0eb1f. --- tools/sst_dump.cc | 4 +- util/ldb_cmd.cc | 904 ++++++++++++++++------------------ util/ldb_cmd.h | 465 +++++++++-------- util/ldb_cmd_execute_result.h | 6 +- util/ldb_tool.cc | 36 +- 5 files changed, 674 insertions(+), 741 deletions(-) diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index e9fdf1c3f..6c496e8dd 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -275,8 +275,8 @@ static void print_help() { } namespace { -std::string HexToString(const std::string& str) { - std::string parsed; +string HexToString(const string& str) { + string parsed; if (str[0] != '0' || str[1] != 'x') { fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str()); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index c03c1b31a..618c10a35 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -4,20 +4,6 @@ // of patent rights can be found in the PATENTS file in the same directory. 
// #ifndef ROCKSDB_LITE -#include -#include -#include -#include -#include -#include -#include - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include - #include "util/ldb_cmd.h" #include "db/dbformat.h" @@ -31,36 +17,46 @@ #include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" +#include +#include +#include +#include +#include +#include + namespace rocksdb { -const std::string LDBCommand::ARG_DB = "db"; -const std::string LDBCommand::ARG_HEX = "hex"; -const std::string LDBCommand::ARG_KEY_HEX = "key_hex"; -const std::string LDBCommand::ARG_VALUE_HEX = "value_hex"; -const std::string LDBCommand::ARG_TTL = "ttl"; -const std::string LDBCommand::ARG_TTL_START = "start_time"; -const std::string LDBCommand::ARG_TTL_END = "end_time"; -const std::string LDBCommand::ARG_TIMESTAMP = "timestamp"; -const std::string LDBCommand::ARG_FROM = "from"; -const std::string LDBCommand::ARG_TO = "to"; -const std::string LDBCommand::ARG_MAX_KEYS = "max_keys"; -const std::string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; -const std::string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; -const std::string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; -const std::string LDBCommand::ARG_BLOCK_SIZE = "block_size"; -const std::string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; -const std::string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; -const std::string LDBCommand::ARG_FILE_SIZE = "file_size"; -const std::string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; +using namespace std; + +const string LDBCommand::ARG_DB = "db"; +const string LDBCommand::ARG_HEX = "hex"; +const string LDBCommand::ARG_KEY_HEX = "key_hex"; +const string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const string LDBCommand::ARG_TTL = "ttl"; +const string LDBCommand::ARG_TTL_START = "start_time"; +const string LDBCommand::ARG_TTL_END = "end_time"; +const string LDBCommand::ARG_TIMESTAMP = "timestamp"; +const string LDBCommand::ARG_FROM = "from"; +const string LDBCommand::ARG_TO = "to"; +const string LDBCommand::ARG_MAX_KEYS = "max_keys"; +const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; +const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; +const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; +const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; +const string LDBCommand::ARG_FILE_SIZE = "file_size"; +const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; const char* LDBCommand::DELIM = " ==> "; LDBCommand* LDBCommand::InitFromCmdLineArgs( - int argc, - char** argv, - const Options& options, - const LDBOptions& ldb_options) { - std::vector args; + int argc, + char** argv, + const Options& options, + const LDBOptions& ldb_options +) { + vector args; for (int i = 1; i < argc; i++) { args.push_back(argv[i]); } @@ -71,37 +67,37 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( * Parse the command-line arguments and create the appropriate LDBCommand2 * instance. * The command line arguments must be in the following format: - * ./ldb --db = PATH_TO_DB [--commonOpt1 = commonOpt1Val] .. - * COMMAND ... - * [-cmdSpecificOpt1 = cmdSpecificOpt1Val] .. + * ./ldb --db=PATH_TO_DB [--commonOpt1=commonOpt1Val] .. + * COMMAND ... [-cmdSpecificOpt1=cmdSpecificOpt1Val] .. * This is similar to the command line format used by HBaseClientTool. * Command name is not included in args. 
* Returns nullptr if the command-line cannot be parsed. */ LDBCommand* LDBCommand::InitFromCmdLineArgs( - const std::vector& args, - const Options& options, - const LDBOptions& ldb_options) { - // --x = y command line arguments are added as x->y std::map entries. - std::map option_map; + const vector& args, + const Options& options, + const LDBOptions& ldb_options +) { + // --x=y command line arguments are added as x->y map entries. + map option_map; // Command-line arguments of the form --hex end up in this array as hex - std::vector flags; + vector flags; // Everything other than option_map and flags. Represents commands - // and their parameters. For eg: put key1 value1 go into this std::vector. - std::vector cmdTokens; + // and their parameters. For eg: put key1 value1 go into this vector. + vector cmdTokens; - const std::string OPTION_PREFIX = "--"; + const string OPTION_PREFIX = "--"; for (const auto& arg : args) { - if (arg[0] == '-' && arg[1] == '-') { - std::vector splits = stringSplit(arg, '='); + if (arg[0] == '-' && arg[1] == '-'){ + vector splits = stringSplit(arg, '='); if (splits.size() == 2) { - std::string optionKey = splits[0].substr(OPTION_PREFIX.size()); + string optionKey = splits[0].substr(OPTION_PREFIX.size()); option_map[optionKey] = splits[1]; } else { - std::string optionKey = splits[0].substr(OPTION_PREFIX.size()); + string optionKey = splits[0].substr(OPTION_PREFIX.size()); flags.push_back(optionKey); } } else { @@ -114,10 +110,14 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( return nullptr; } - std::string cmd = cmdTokens[0]; - std::vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); + string cmd = cmdTokens[0]; + vector cmdParams(cmdTokens.begin()+1, cmdTokens.end()); LDBCommand* command = LDBCommand::SelectCommand( - cmd, cmdParams, option_map, flags); + cmd, + cmdParams, + option_map, + flags + ); if (command) { command->SetDBOptions(options); @@ -128,9 +128,11 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::SelectCommand( const std::string& cmd, - const std::vector& cmdParams, - const std::map& option_map, - const std::vector& flags) { + const vector& cmdParams, + const map& option_map, + const vector& flags + ) { + if (cmd == GetCommand::Name()) { return new GetCommand(cmdParams, option_map, flags); } else if (cmd == PutCommand::Name()) { @@ -177,21 +179,21 @@ LDBCommand* LDBCommand::SelectCommand( * value. If there is an error, the specified exec_state is also * updated. 
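 *
 * Example (added editorially, not part of the patch): with "--bloom_bits=14"
 * on the command line, option_map contains "bloom_bits" -> "14", so parsing
 * ARG_BLOOM_BITS succeeds, the value is set to 14 and true is returned; with
 * a non-numeric value such as "--bloom_bits=abc", stoi throws, the value is
 * left untouched, a FAILED LDBCommandExecuteResult is recorded in exec_state,
 * and false is returned.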
*/ -bool LDBCommand::ParseIntOption( - const std::map& options, - const std::string& option, int* value, - LDBCommandExecuteResult* exec_state) { - auto itr = option_map_.find(option); +bool LDBCommand::ParseIntOption(const map& options, + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { + + map::const_iterator itr = option_map_.find(option); if (itr != option_map_.end()) { try { - *value = stoi(itr->second); + value = stoi(itr->second); return true; - } catch(const std::invalid_argument&) { - *exec_state = LDBCommandExecuteResult::FAILED( - option + " has an invalid value."); - } catch(const std::out_of_range&) { - *exec_state = LDBCommandExecuteResult::FAILED( - option + " has a value out-of-range."); + } catch(const invalid_argument&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has an invalid value."); + } catch(const out_of_range&) { + exec_state = LDBCommandExecuteResult::FAILED(option + + " has a value out-of-range."); } } return false; @@ -202,9 +204,8 @@ bool LDBCommand::ParseIntOption( * Returns true if the option is found. * Returns false otherwise. */ -bool LDBCommand::ParseStringOption( - const std::map& options, - const std::string& option, std::string* value) { +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { auto itr = option_map_.find(option); if (itr != option_map_.end()) { *value = itr->second; @@ -218,12 +219,12 @@ Options LDBCommand::PrepareOptionsForOpenDB() { Options opt = options_; opt.create_if_missing = false; - std::map::const_iterator itr; + map::const_iterator itr; BlockBasedTableOptions table_options; bool use_table_options = false; int bits; - if (ParseIntOption(option_map_, ARG_BLOOM_BITS, &bits, &exec_state_)) { + if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { if (bits > 0) { use_table_options = true; table_options.filter_policy.reset(NewBloomFilterPolicy(bits)); @@ -234,7 +235,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int block_size; - if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, &block_size, &exec_state_)) { + if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { if (block_size > 0) { use_table_options = true; table_options.block_size = block_size; @@ -255,7 +256,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { itr = option_map_.find(ARG_COMPRESSION_TYPE); if (itr != option_map_.end()) { - std::string comp = itr->second; + string comp = itr->second; if (comp == "no") { opt.compression = kNoCompression; } else if (comp == "snappy") { @@ -276,8 +277,8 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int write_buffer_size; - if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, - &write_buffer_size, &exec_state_)) { + if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, + exec_state_)) { if (write_buffer_size > 0) { opt.write_buffer_size = write_buffer_size; } else { @@ -287,7 +288,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int file_size; - if (ParseIntOption(option_map_, ARG_FILE_SIZE, &file_size, &exec_state_)) { + if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { if (file_size > 0) { opt.target_file_size_base = file_size; } else { @@ -301,13 +302,13 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } int fix_prefix_len; - if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, - &fix_prefix_len, &exec_state_)) { + if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len, + exec_state_)) { if (fix_prefix_len > 0) { 
opt.prefix_extractor.reset( NewFixedPrefixTransform(static_cast(fix_prefix_len))); } else { - exec_state_ = + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_FIX_PREFIX_LEN + " must be > 0."); } } @@ -315,11 +316,10 @@ Options LDBCommand::PrepareOptionsForOpenDB() { return opt; } -bool LDBCommand::ParseKeyValue( - const std::string& line, std::string* key, std::string* value, - bool is_key_hex, bool is_value_hex) { +bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex) { size_t pos = line.find(DELIM); - if (pos != std::string::npos) { + if (pos != string::npos) { *key = line.substr(0, pos); *value = line.substr(pos + strlen(DELIM)); if (is_key_hex) { @@ -343,20 +343,20 @@ bool LDBCommand::ParseKeyValue( */ bool LDBCommand::ValidateCmdLineOptions() { - for (auto itr = option_map_.begin(); - itr != option_map_.end(); ++itr) { + for (map::const_iterator itr = option_map_.begin(); + itr != option_map_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), - valid_cmd_line_options_.end(), itr->first) == + valid_cmd_line_options_.end(), itr->first) == valid_cmd_line_options_.end()) { fprintf(stderr, "Invalid command-line option %s\n", itr->first.c_str()); return false; } } - for (std::vector::const_iterator itr = flags_.begin(); + for (vector::const_iterator itr = flags_.begin(); itr != flags_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), - valid_cmd_line_options_.end(), *itr) == + valid_cmd_line_options_.end(), *itr) == valid_cmd_line_options_.end()) { fprintf(stderr, "Invalid command-line flag %s\n", itr->c_str()); return false; @@ -371,15 +371,14 @@ bool LDBCommand::ValidateCmdLineOptions() { return true; } -CompactorCommand::CompactorCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +CompactorCommand::CompactorCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_TTL})), null_from_(true), null_to_(true) { - auto itr = options.find(ARG_FROM); + + map::const_iterator itr = options.find(ARG_FROM); if (itr != options.end()) { null_from_ = false; from_ = itr->second; @@ -401,11 +400,11 @@ CompactorCommand::CompactorCommand( } } -void CompactorCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(CompactorCommand::Name()); - ret->append(HelpRangeCmdArgs()); - ret->append("\n"); +void CompactorCommand::Help(string& ret) { + ret.append(" "); + ret.append(CompactorCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); } void CompactorCommand::DoCommand() { @@ -426,14 +425,12 @@ void CompactorCommand::DoCommand() { delete end; } -const std::string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; -const std::string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; -const std::string DBLoaderCommand::ARG_COMPACT = "compact"; +const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; +const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; +const string DBLoaderCommand::ARG_COMPACT = "compact"; -DBLoaderCommand::DBLoaderCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +DBLoaderCommand::DBLoaderCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO, ARG_CREATE_IF_MISSING, @@ -448,14 +445,14 @@ DBLoaderCommand::DBLoaderCommand( compact_ = 
IsFlagPresent(flags, ARG_COMPACT); } -void DBLoaderCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(DBLoaderCommand::Name()); - ret->append(" [--" + ARG_CREATE_IF_MISSING + "]"); - ret->append(" [--" + ARG_DISABLE_WAL + "]"); - ret->append(" [--" + ARG_BULK_LOAD + "]"); - ret->append(" [--" + ARG_COMPACT + "]"); - ret->append("\n"); +void DBLoaderCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBLoaderCommand::Name()); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_DISABLE_WAL + "]"); + ret.append(" [--" + ARG_BULK_LOAD + "]"); + ret.append(" [--" + ARG_COMPACT + "]"); + ret.append("\n"); } Options DBLoaderCommand::PrepareOptionsForOpenDB() { @@ -478,10 +475,10 @@ void DBLoaderCommand::DoCommand() { } int bad_lines = 0; - std::string line; - while (getline(std::cin, line, '\n')) { - std::string key; - std::string value; + string line; + while (getline(cin, line, '\n')) { + string key; + string value; if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { db_->Put(write_options, Slice(key), Slice(value)); } else if (0 == line.find("Keys in range:")) { @@ -494,7 +491,7 @@ void DBLoaderCommand::DoCommand() { } if (bad_lines > 0) { - std::cout << "Warning: " << bad_lines << " bad lines ignored." << std::endl; + cout << "Warning: " << bad_lines << " bad lines ignored." << endl; } if (compact_) { db_->CompactRange(nullptr, nullptr); @@ -503,28 +500,27 @@ void DBLoaderCommand::DoCommand() { // ---------------------------------------------------------------------------- -const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose"; -const std::string ManifestDumpCommand::ARG_PATH = "path"; +const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; +const string ManifestDumpCommand::ARG_PATH = "path"; -void ManifestDumpCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ManifestDumpCommand::Name()); - ret->append(" [--" + ARG_VERBOSE + "]"); - ret->append(" [--" + ARG_PATH + " = ]"); - ret->append("\n"); +void ManifestDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(ManifestDumpCommand::Name()); + ret.append(" [--" + ARG_VERBOSE + "]"); + ret.append(" [--" + ARG_PATH + "=]"); + ret.append("\n"); } -ManifestDumpCommand::ManifestDumpCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +ManifestDumpCommand::ManifestDumpCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), verbose_(false), - path_("") { + path_("") +{ verbose_ = IsFlagPresent(flags, ARG_VERBOSE); - auto itr = options.find(ARG_PATH); + map::const_iterator itr = options.find(ARG_PATH); if (itr != options.end()) { path_ = itr->second; if (path_.empty()) { @@ -601,17 +597,16 @@ void ManifestDumpCommand::DoCommand() { // ---------------------------------------------------------------------------- -void ListColumnFamiliesCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ListColumnFamiliesCommand::Name()); - ret->append(" full_path_to_db_directory "); - ret->append("\n"); +void ListColumnFamiliesCommand::Help(string& ret) { + ret.append(" "); + ret.append(ListColumnFamiliesCommand::Name()); + ret.append(" full_path_to_db_directory "); + ret.append("\n"); } ListColumnFamiliesCommand::ListColumnFamiliesCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) + const vector& params, const map& options, + const vector& flags) : 
LDBCommand(options, flags, false, {}) { if (params.size() != 1) { @@ -623,7 +618,7 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand( } void ListColumnFamiliesCommand::DoCommand() { - std::vector column_families; + vector column_families; Status s = DB::ListColumnFamilies(DBOptions(), dbname_, &column_families); if (!s.ok()) { printf("Error in processing db %s %s\n", dbname_.c_str(), @@ -646,56 +641,54 @@ void ListColumnFamiliesCommand::DoCommand() { namespace { -std::string ReadableTime(int unixtime) { +string ReadableTime(int unixtime) { char time_buffer [80]; time_t rawtime = unixtime; struct tm * timeinfo = localtime(&rawtime); strftime(time_buffer, 80, "%c", timeinfo); - return std::string(time_buffer); + return string(time_buffer); } // This function only called when it's the sane case of >1 buckets in time-range // Also called only when timekv falls between ttl_start and ttl_end provided -void IncBucketCounts(std::vector* bucket_counts, int ttl_start, +void IncBucketCounts(vector& bucket_counts, int ttl_start, int time_range, int bucket_size, int timekv, int num_buckets) { assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && timekv < (ttl_start + time_range) && num_buckets > 1); int bucket = (timekv - ttl_start) / bucket_size; - (*bucket_counts)[bucket]++; + bucket_counts[bucket]++; } -void PrintBucketCounts( - const std::vector& bucket_counts, int ttl_start, - int ttl_end, int bucket_size, int num_buckets) { +void PrintBucketCounts(const vector& bucket_counts, int ttl_start, + int ttl_end, int bucket_size, int num_buckets) { int time_point = ttl_start; - for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { - fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n", + for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + fprintf(stdout, "Keys in range %s to %s : %lu\n", ReadableTime(time_point).c_str(), ReadableTime(time_point + bucket_size).c_str(), - bucket_counts[i]); + (unsigned long)bucket_counts[i]); } - fprintf(stdout, "Keys in range %s to %s : %" PRIu64 "\n", + fprintf(stdout, "Keys in range %s to %s : %lu\n", ReadableTime(time_point).c_str(), ReadableTime(ttl_end).c_str(), - bucket_counts[num_buckets - 1]); + (unsigned long)bucket_counts[num_buckets - 1]); } } // namespace -const std::string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; -const std::string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; -const std::string InternalDumpCommand::ARG_STATS = "stats"; -const std::string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_COUNT_DELIM = "count_delim"; +const string InternalDumpCommand::ARG_STATS = "stats"; +const string InternalDumpCommand::ARG_INPUT_KEY_HEX = "input_key_hex"; -InternalDumpCommand::InternalDumpCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : LDBCommand(options, flags, true, - BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, - ARG_FROM, ARG_TO, ARG_MAX_KEYS, - ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, - ARG_INPUT_KEY_HEX})), + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS, + ARG_INPUT_KEY_HEX})), has_from_(false), has_to_(false), max_keys_(-1), @@ -708,14 +701,15 @@ InternalDumpCommand::InternalDumpCommand( has_from_ 
= ParseStringOption(options, ARG_FROM, &from_); has_to_ = ParseStringOption(options, ARG_TO, &to_); - ParseIntOption(options, ARG_MAX_KEYS, &max_keys_, &exec_state_); - auto itr = options.find(ARG_COUNT_DELIM); + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + map::const_iterator itr = options.find(ARG_COUNT_DELIM); if (itr != options.end()) { delim_ = itr->second; count_delim_ = true; + // fprintf(stdout,"delim = %c\n",delim_[0]); } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_ = "."; + delim_="."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -732,16 +726,16 @@ InternalDumpCommand::InternalDumpCommand( } } -void InternalDumpCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(InternalDumpCommand::Name()); - ret->append(HelpRangeCmdArgs()); - ret->append(" [--" + ARG_INPUT_KEY_HEX + "]"); - ret->append(" [--" + ARG_MAX_KEYS + " = ]"); - ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); - ret->append(" [--" + ARG_STATS + "]"); - ret->append("\n"); +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_INPUT_KEY_HEX + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append("\n"); } void InternalDumpCommand::DoCommand() { @@ -750,7 +744,7 @@ void InternalDumpCommand::DoCommand() { } if (print_stats_) { - std::string stats; + string stats; if (db_->GetProperty("rocksdb.stats", &stats)) { fprintf(stdout, "%s\n", stats.c_str()); } @@ -762,10 +756,10 @@ void InternalDumpCommand::DoCommand() { exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); return; } - std::string rtype1, rtype2, row, val; + string rtype1,rtype2,row,val; rtype2 = ""; - uint64_t c = 0; - uint64_t s1 = 0, s2 = 0; + uint64_t c=0; + uint64_t s1=0,s2=0; // Setup internal key iterator Arena arena; ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); @@ -782,7 +776,7 @@ void InternalDumpCommand::DoCommand() { iter->SeekToFirst(); } - uint64_t count = 0; + long long count = 0; for (; iter->Valid(); iter->Next()) { ParsedInternalKey ikey; if (!ParseInternalKey(iter->key(), &ikey)) { @@ -801,69 +795,59 @@ void InternalDumpCommand::DoCommand() { int k; if (count_delim_) { rtype1 = ""; - s1 = 0; + s1=0; row = iter->key().ToString(); val = iter->value().ToString(); - for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) { + for(k=0;row[k]!='\x01' && row[k]!='\0';k++) s1++; - } - for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) { + for(k=0;val[k]!='\x01' && val[k]!='\0';k++) s1++; - } - for (int j = 0; - row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01'; - j++) { - rtype1+= row[j]; - } - if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { - fprintf(stdout, "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", - rtype2.c_str(), c, s2); - c = 1; - s2 = s1; + for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long)c,(long long)s2); + c=1; + s2=s1; rtype2 = rtype1; } else { c++; - s2+= s1; - rtype2 = rtype1; + s2+=s1; + rtype2=rtype1; } } if (!count_only_ && !count_delim_) { - std::string key = ikey.DebugString(is_key_hex_); - std::string value = iter->value().ToString(is_value_hex_); - 
std::cout << key << " = > " << value << "\n"; + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + std::cout << key << " => " << value << "\n"; } // Terminate if maximum number of keys have been dumped - if (max_keys_ > 0 && count >= static_cast(max_keys_)) { - break; - } + if (max_keys_ > 0 && count >= max_keys_) break; } - if (count_delim_) { - fprintf(stdout, - "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", - rtype2.c_str(), c, s2); + if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n", rtype2.c_str(), + (long long)c,(long long)s2); } else - fprintf(stdout, "Internal keys in range: %" PRIu64 "\n", count); + fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); } -const std::string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; -const std::string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; -const std::string DBDumperCommand::ARG_STATS = "stats"; -const std::string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; +const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; +const string DBDumperCommand::ARG_COUNT_DELIM = "count_delim"; +const string DBDumperCommand::ARG_STATS = "stats"; +const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; -DBDumperCommand::DBDumperCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +DBDumperCommand::DBDumperCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, true, - BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, - ARG_VALUE_HEX, ARG_FROM, ARG_TO, - ARG_MAX_KEYS, ARG_COUNT_ONLY, - ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, - ARG_TTL_END, ARG_TTL_BUCKET, - ARG_TIMESTAMP})), + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_FROM, ARG_TO, + ARG_MAX_KEYS, ARG_COUNT_ONLY, + ARG_COUNT_DELIM, ARG_STATS, ARG_TTL_START, + ARG_TTL_END, ARG_TTL_BUCKET, + ARG_TIMESTAMP})), null_from_(true), null_to_(true), max_keys_(-1), @@ -871,7 +855,7 @@ DBDumperCommand::DBDumperCommand( count_delim_(false), print_stats_(false) { - auto itr = options.find(ARG_FROM); + map::const_iterator itr = options.find(ARG_FROM); if (itr != options.end()) { null_from_ = false; from_ = itr->second; @@ -887,10 +871,10 @@ DBDumperCommand::DBDumperCommand( if (itr != options.end()) { try { max_keys_ = stoi(itr->second); - } catch(const std::invalid_argument&) { + } catch(const invalid_argument&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has an invalid value"); - } catch(const std::out_of_range&) { + } catch(const out_of_range&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has a value out-of-range"); } @@ -901,7 +885,7 @@ DBDumperCommand::DBDumperCommand( count_delim_ = true; } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_ = "."; + delim_="."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -917,20 +901,20 @@ DBDumperCommand::DBDumperCommand( } } -void DBDumperCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(DBDumperCommand::Name()); - ret->append(HelpRangeCmdArgs()); - ret->append(" [--" + ARG_TTL + "]"); - ret->append(" [--" + ARG_MAX_KEYS + " = ]"); - ret->append(" [--" + ARG_TIMESTAMP + "]"); - ret->append(" [--" + ARG_COUNT_ONLY + "]"); - ret->append(" [--" + ARG_COUNT_DELIM + " = ]"); - ret->append(" [--" + ARG_STATS + "]"); - ret->append(" [--" + ARG_TTL_BUCKET + " = ]"); - ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); - ret->append(" [--" + ARG_TTL_END + " = :- is 
exclusive]"); - ret->append("\n"); +void DBDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBDumperCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_COUNT_DELIM + "=]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append(" [--" + ARG_TTL_BUCKET + "=]"); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); } void DBDumperCommand::DoCommand() { @@ -940,7 +924,7 @@ void DBDumperCommand::DoCommand() { // Parse command line args uint64_t count = 0; if (print_stats_) { - std::string stats; + string stats; if (db_->GetProperty("rocksdb.stats", &stats)) { fprintf(stdout, "%s\n", stats.c_str()); } @@ -962,11 +946,11 @@ void DBDumperCommand::DoCommand() { int max_keys = max_keys_; int ttl_start; - if (!ParseIntOption(option_map_, ARG_TTL_START, &ttl_start, &exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time } int ttl_end; - if (!ParseIntOption(option_map_, ARG_TTL_END, &ttl_end, &exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature } if (ttl_end < ttl_start) { @@ -976,21 +960,20 @@ void DBDumperCommand::DoCommand() { } int time_range = ttl_end - ttl_start; int bucket_size; - if (!ParseIntOption( - option_map_, ARG_TTL_BUCKET, &bucket_size, &exec_state_) || + if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || bucket_size <= 0) { bucket_size = time_range; // Will have just 1 bucket by default } //cretaing variables for row count of each type - std::string rtype1, rtype2, row, val; + string rtype1,rtype2,row,val; rtype2 = ""; - uint64_t c = 0; - uint64_t s1 = 0, s2 = 0; + uint64_t c=0; + uint64_t s1=0,s2=0; - // At this point, bucket_size = 0 = > time_range = 0 + // At this point, bucket_size=0 => time_range=0 uint64_t num_buckets = (bucket_size >= time_range) ? 
1 : ((time_range + bucket_size - 1) / bucket_size); - std::vector bucket_counts(num_buckets, 0); + vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); @@ -1016,7 +999,7 @@ void DBDumperCommand::DoCommand() { --max_keys; } if (is_db_ttl_ && num_buckets > 1) { - IncBucketCounts(&bucket_counts, ttl_start, time_range, bucket_size, + IncBucketCounts(bucket_counts, ttl_start, time_range, bucket_size, rawtime, num_buckets); } ++count; @@ -1025,28 +1008,29 @@ void DBDumperCommand::DoCommand() { row = iter->key().ToString(); val = iter->value().ToString(); s1 = row.size()+val.size(); - for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) { - rtype1 += row[j]; - } - if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { - fprintf(stdout, - "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", - rtype2.c_str(), c, s2); - c = 1; - s2 = s1; + for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) + rtype1+=row[j]; + if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); + c=1; + s2=s1; rtype2 = rtype1; } else { - c++; - s2 += s1; - rtype2 = rtype1; + c++; + s2+=s1; + rtype2=rtype1; } + } + + if (!count_only_ && !count_delim_) { if (is_db_ttl_ && timestamp_) { fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); } - std::string str = PrintKeyValue(iter->key().ToString(), + string str = PrintKeyValue(iter->key().ToString(), iter->value().ToString(), is_key_hex_, is_value_hex_); fprintf(stdout, "%s\n", str.c_str()); @@ -1056,25 +1040,21 @@ void DBDumperCommand::DoCommand() { if (num_buckets > 1 && is_db_ttl_) { PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, num_buckets); - } else if (count_delim_) { - fprintf(stdout, "%s = > count:%" PRIu64 "\tsize:%" PRIu64 "\n", - rtype2.c_str(), c, s2); + } else if(count_delim_) { + fprintf(stdout,"%s => count:%lld\tsize:%lld\n",rtype2.c_str(), + (long long )c,(long long)s2); } else { - fprintf(stdout, "Keys in range: %" PRIu64 "\n", count); + fprintf(stdout, "Keys in range: %lld\n", (long long) count); } // Clean up delete iter; } -const std::string - ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; -const std::string - ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; +const string ReduceDBLevelsCommand::ARG_NEW_LEVELS = "new_levels"; +const string ReduceDBLevelsCommand::ARG_PRINT_OLD_LEVELS = "print_old_levels"; -ReduceDBLevelsCommand::ReduceDBLevelsCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +ReduceDBLevelsCommand::ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})), old_levels_(1 << 16), @@ -1082,34 +1062,33 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand( print_old_levels_(false) { - ParseIntOption(option_map_, ARG_NEW_LEVELS, &new_levels_, &exec_state_); + ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); - if (new_levels_ <= 0) { + if(new_levels_ <= 0) { exec_state_ = LDBCommandExecuteResult::FAILED( " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); } } -std::vector ReduceDBLevelsCommand::PrepareArgs( - const std::string& db_path, +vector ReduceDBLevelsCommand::PrepareArgs(const 
string& db_path, int new_levels, bool print_old_level) { - std::vector ret; + vector ret; ret.push_back("reduce_levels"); - ret.push_back("--" + ARG_DB + " = " + db_path); - ret.push_back("--" + ARG_NEW_LEVELS + " = " + std::to_string(new_levels)); - if (print_old_level) { + ret.push_back("--" + ARG_DB + "=" + db_path); + ret.push_back("--" + ARG_NEW_LEVELS + "=" + to_string(new_levels)); + if(print_old_level) { ret.push_back("--" + ARG_PRINT_OLD_LEVELS); } return ret; } -void ReduceDBLevelsCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ReduceDBLevelsCommand::Name()); - ret->append(" --" + ARG_NEW_LEVELS + " = "); - ret->append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); - ret->append("\n"); +void ReduceDBLevelsCommand::Help(string& ret) { + ret.append(" "); + ret.append(ReduceDBLevelsCommand::Name()); + ret.append(" --" + ARG_NEW_LEVELS + "="); + ret.append(" [--" + ARG_PRINT_OLD_LEVELS + "]"); + ret.append("\n"); } Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { @@ -1172,8 +1151,7 @@ void ReduceDBLevelsCommand::DoCommand() { } if (print_old_levels_) { - fprintf(stdout, "The old number of levels in use is %d\n", - old_level_num); + fprintf(stdout, "The old number of levels in use is %d\n", old_level_num); } if (old_level_num <= new_levels_) { @@ -1192,31 +1170,29 @@ void ReduceDBLevelsCommand::DoCommand() { CloseDB(); EnvOptions soptions; - st = VersionSet::ReduceNumberOfLevels( - db_path_, &opt, soptions, new_levels_); + st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); return; } } -const std::string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = +const string ChangeCompactionStyleCommand::ARG_OLD_COMPACTION_STYLE = "old_compaction_style"; -const std::string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = +const string ChangeCompactionStyleCommand::ARG_NEW_COMPACTION_STYLE = "new_compaction_style"; ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : + const vector& params, const map& options, + const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})), old_compaction_style_(-1), new_compaction_style_(-1) { - ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, - &old_compaction_style_, &exec_state_); + ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, + exec_state_); if (old_compaction_style_ != kCompactionStyleLevel && old_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::FAILED( @@ -1225,8 +1201,8 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( return; } - ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, - &new_compaction_style_, &exec_state_); + ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, + exec_state_); if (new_compaction_style_ != kCompactionStyleLevel && new_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::FAILED( @@ -1251,16 +1227,14 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( } } -void ChangeCompactionStyleCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ChangeCompactionStyleCommand::Name()); - ret->append( - " --" + ARG_OLD_COMPACTION_STYLE + " = "); - ret->append( - " --" + ARG_NEW_COMPACTION_STYLE + " = "); - ret->append("\n"); +void 
ChangeCompactionStyleCommand::Help(string& ret) { + ret.append(" "); + ret.append(ChangeCompactionStyleCommand::Name()); + ret.append(" --" + ARG_OLD_COMPACTION_STYLE + "="); + ret.append(" --" + ARG_NEW_COMPACTION_STYLE + "="); + ret.append("\n"); } Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { @@ -1288,9 +1262,9 @@ void ChangeCompactionStyleCommand::DoCommand() { db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), &property); - // format print std::string + // format print string char buf[100]; - snprintf(buf, sizeof(buf), "%s%s", (i ? ", " : ""), property.c_str()); + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); files_per_level += buf; } fprintf(stdout, "files per level before compaction: %s\n", @@ -1308,9 +1282,9 @@ void ChangeCompactionStyleCommand::DoCommand() { db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), &property); - // format print std::string + // format print string char buf[100]; - snprintf(buf, sizeof(buf), "%s%s", (i ? ", " : ""), property.c_str()); + snprintf(buf, sizeof(buf), "%s%s", (i ? "," : ""), property.c_str()); files_per_level += buf; num_files = atoi(property.c_str()); @@ -1318,15 +1292,15 @@ void ChangeCompactionStyleCommand::DoCommand() { // level 0 should have only 1 file if (i == 0 && num_files != 1) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level 0 after compaction is " + std::to_string(num_files) + - ", not 1.\n"); + "level 0 after compaction is " + std::to_string(num_files) + + ", not 1.\n"); return; } // other levels should have no file if (i > 0 && num_files != 0) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level " + std::to_string(i) + " after compaction is " + - std::to_string(num_files) + ", not 0.\n"); + "level " + std::to_string(i) + " after compaction is " + + std::to_string(num_files) + ", not 0.\n"); return; } } @@ -1337,15 +1311,14 @@ void ChangeCompactionStyleCommand::DoCommand() { class InMemoryHandler : public WriteBatch::Handler { public: - InMemoryHandler(std::stringstream& row, bool print_values) - : Handler(), row_(row) { + InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { print_values_ = print_values; } void commonPutMerge(const Slice& key, const Slice& value) { - std::string k = LDBCommand::StringToHex(key.ToString()); + string k = LDBCommand::StringToHex(key.ToString()); if (print_values_) { - std::string v = LDBCommand::StringToHex(value.ToString()); + string v = LDBCommand::StringToHex(value.ToString()); row_ << k << " : "; row_ << v << " "; } else { @@ -1364,25 +1337,23 @@ class InMemoryHandler : public WriteBatch::Handler { } virtual void Delete(const Slice& key) { - row_ << ", DELETE : "; + row_ <<",DELETE : "; row_ << LDBCommand::StringToHex(key.ToString()) << " "; } virtual ~InMemoryHandler() { }; private: - std::stringstream & row_; + stringstream & row_; bool print_values_; }; -const std::string WALDumperCommand::ARG_WAL_FILE = "walfile"; -const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; -const std::string WALDumperCommand::ARG_PRINT_HEADER = "header"; +const string WALDumperCommand::ARG_WAL_FILE = "walfile"; +const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; +const string WALDumperCommand::ARG_PRINT_HEADER = "header"; -WALDumperCommand::WALDumperCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +WALDumperCommand::WALDumperCommand(const vector& params, + const map& options, const 
vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions( {ARG_WAL_FILE, ARG_PRINT_HEADER, ARG_PRINT_VALUE})), @@ -1390,8 +1361,7 @@ WALDumperCommand::WALDumperCommand( wal_file_.clear(); - std::map::const_iterator itr = - options.find(ARG_WAL_FILE); + map::const_iterator itr = options.find(ARG_WAL_FILE); if (itr != options.end()) { wal_file_ = itr->second; } @@ -1405,19 +1375,19 @@ WALDumperCommand::WALDumperCommand( } } -void WALDumperCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(WALDumperCommand::Name()); - ret->append(" --" + ARG_WAL_FILE + " = "); - ret->append(" [--" + ARG_PRINT_HEADER + "] "); - ret->append(" [--" + ARG_PRINT_VALUE + "] "); - ret->append("\n"); +void WALDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(WALDumperCommand::Name()); + ret.append(" --" + ARG_WAL_FILE + "="); + ret.append(" [--" + ARG_PRINT_HEADER + "] "); + ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append("\n"); } void WALDumperCommand::DoCommand() { struct StdErrReporter : public log::Reader::Reporter { virtual void Corruption(size_t bytes, const Status& s) { - std::cerr << "Corruption detected in log file " << s.ToString() << "\n"; + cerr<<"Corruption detected in log file "<NewSequentialFile(wal_file_, &file, soptions); if (!status.ok()) { - exec_state_ = LDBCommandExecuteResult::FAILED( - "Failed to open WAL file " + status.ToString()); + exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); } else { StdErrReporter reporter; log::Reader reader(move(file), &reporter, true, 0); - std::string scratch; + string scratch; WriteBatch batch; Slice record; - std::stringstream row; + stringstream row; if (print_header_) { - std::cout << "Sequence, Count, ByteSize, Physical Offset, Key(s)"; + cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; if (print_values_) { - std::cout << " : value "; + cout << " : value "; } - std::cout << "\n"; + cout << "\n"; } while(reader.ReadRecord(&record, &scratch)) { row.str(""); @@ -1449,24 +1419,22 @@ void WALDumperCommand::DoCommand() { record.size(), Status::Corruption("log record too small")); } else { WriteBatchInternal::SetContents(&batch, record); - row << WriteBatchInternal::Sequence(&batch) << ", "; - row << WriteBatchInternal::Count(&batch) << ", "; - row << WriteBatchInternal::ByteSize(&batch) << ", "; - row << reader.LastRecordOffset() << ", "; + row<& params, - const std::map& options, - const std::vector& flags) : +GetCommand::GetCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { @@ -1483,16 +1451,16 @@ GetCommand::GetCommand( } } -void GetCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(GetCommand::Name()); - ret->append(" "); - ret->append(" [--" + ARG_TTL + "]"); - ret->append("\n"); +void GetCommand::Help(string& ret) { + ret.append(" "); + ret.append(GetCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); } void GetCommand::DoCommand() { - std::string value; + string value; Status st = db_->Get(ReadOptions(), key_, &value); if (st.ok()) { fprintf(stdout, "%s\n", @@ -1503,10 +1471,8 @@ void GetCommand::DoCommand() { } -ApproxSizeCommand::ApproxSizeCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +ApproxSizeCommand::ApproxSizeCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, 
true, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) { @@ -1533,11 +1499,11 @@ ApproxSizeCommand::ApproxSizeCommand( } } -void ApproxSizeCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ApproxSizeCommand::Name()); - ret->append(HelpRangeCmdArgs()); - ret->append("\n"); +void ApproxSizeCommand::Help(string& ret) { + ret.append(" "); + ret.append(ApproxSizeCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append("\n"); } void ApproxSizeCommand::DoCommand() { @@ -1556,45 +1522,43 @@ void ApproxSizeCommand::DoCommand() { } -BatchPutCommand::BatchPutCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : - LDBCommand(options, flags, false, - BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, - ARG_CREATE_IF_MISSING})) { +BatchPutCommand::BatchPutCommand(const vector& params, + const map& options, const vector& flags) : + LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_CREATE_IF_MISSING})) { if (params.size() < 2) { exec_state_ = LDBCommandExecuteResult::FAILED( - "At least one std::pair must be specified batchput."); + "At least one pair must be specified batchput."); } else if (params.size() % 2 != 0) { exec_state_ = LDBCommandExecuteResult::FAILED( "Equal number of s and s must be specified for batchput."); } else { for (size_t i = 0; i < params.size(); i += 2) { - std::string key = params.at(i); - std::string value = params.at(i+1); - key_values_.push_back(std::pair( + string key = params.at(i); + string value = params.at(i+1); + key_values_.push_back(pair( is_key_hex_ ? HexToString(key) : key, is_value_hex_ ? HexToString(value) : value)); } } } -void BatchPutCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(BatchPutCommand::Name()); - ret->append(" [ ] [..]"); - ret->append(" [--" + ARG_TTL + "]"); - ret->append("\n"); +void BatchPutCommand::Help(string& ret) { + ret.append(" "); + ret.append(BatchPutCommand::Name()); + ret.append(" [ ] [..]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); } void BatchPutCommand::DoCommand() { WriteBatch batch; - for (std::vector>::const_iterator itr + for (vector>::const_iterator itr = key_values_.begin(); itr != key_values_.end(); ++itr) { - batch.Put(itr->first, itr->second); + batch.Put(itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); if (st.ok()) { @@ -1611,10 +1575,8 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { } -ScanCommand::ScanCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +ScanCommand::ScanCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, @@ -1623,7 +1585,7 @@ ScanCommand::ScanCommand( end_key_specified_(false), max_keys_scanned_(-1) { - auto itr = options.find(ARG_FROM); + map::const_iterator itr = options.find(ARG_FROM); if (itr != options.end()) { start_key_ = itr->second; if (is_key_hex_) { @@ -1644,26 +1606,26 @@ ScanCommand::ScanCommand( if (itr != options.end()) { try { max_keys_scanned_ = stoi(itr->second); - } catch(const std::invalid_argument&) { + } catch(const invalid_argument&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + " has an invalid value"); - } catch(const std::out_of_range&) { + } catch(const out_of_range&) { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_MAX_KEYS + 
" has a value out-of-range"); } } } -void ScanCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(ScanCommand::Name()); - ret->append(HelpRangeCmdArgs()); - ret->append(" [--" + ARG_TTL + "]"); - ret->append(" [--" + ARG_TIMESTAMP + "]"); - ret->append(" [--" + ARG_MAX_KEYS + " = q] "); - ret->append(" [--" + ARG_TTL_START + " = :- is inclusive]"); - ret->append(" [--" + ARG_TTL_END + " = :- is exclusive]"); - ret->append("\n"); +void ScanCommand::Help(string& ret) { + ret.append(" "); + ret.append(ScanCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append(" [--" + ARG_TIMESTAMP + "]"); + ret.append(" [--" + ARG_MAX_KEYS + "=q] "); + ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); + ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append("\n"); } void ScanCommand::DoCommand() { @@ -1676,11 +1638,11 @@ void ScanCommand::DoCommand() { it->SeekToFirst(); } int ttl_start; - if (!ParseIntOption(option_map_, ARG_TTL_START, &ttl_start, &exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_START, ttl_start, exec_state_)) { ttl_start = DBWithTTLImpl::kMinTimestamp; // TTL introduction time } int ttl_end; - if (!ParseIntOption(option_map_, ARG_TTL_END, &ttl_end, &exec_state_)) { + if (!ParseIntOption(option_map_, ARG_TTL_END, ttl_end, exec_state_)) { ttl_end = DBWithTTLImpl::kMaxTimestamp; // Max time allowed by TTL feature } if (ttl_end < ttl_start) { @@ -1693,9 +1655,9 @@ void ScanCommand::DoCommand() { ReadableTime(ttl_start).c_str(), ReadableTime(ttl_end).c_str()); } for ( ; - it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); - it->Next()) { - std::string key = ldb_options_.key_formatter->Format(it->key()); + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { + string key = ldb_options_.key_formatter->Format(it->key()); if (is_db_ttl_) { TtlIterator* it_ttl = dynamic_cast(it); assert(it_ttl); @@ -1707,10 +1669,11 @@ void ScanCommand::DoCommand() { fprintf(stdout, "%s ", ReadableTime(rawtime).c_str()); } } - std::string value = it->value().ToString(); + string value = it->value().ToString(); fprintf(stdout, "%s : %s\n", (is_key_hex_ ? "0x" + it->key().ToString(true) : key).c_str(), - (is_value_hex_ ? StringToHex(value) : value).c_str()); + (is_value_hex_ ? 
StringToHex(value) : value).c_str() + ); num_keys_scanned++; if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { break; @@ -1723,9 +1686,8 @@ void ScanCommand::DoCommand() { } -DeleteCommand::DeleteCommand(const std::vector& params, - const std::map& options, - const std::vector& flags) : +DeleteCommand::DeleteCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { @@ -1740,10 +1702,10 @@ DeleteCommand::DeleteCommand(const std::vector& params, } } -void DeleteCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(DeleteCommand::Name() + " "); - ret->append("\n"); +void DeleteCommand::Help(string& ret) { + ret.append(" "); + ret.append(DeleteCommand::Name() + " "); + ret.append("\n"); } void DeleteCommand::DoCommand() { @@ -1756,10 +1718,8 @@ void DeleteCommand::DoCommand() { } -PutCommand::PutCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +PutCommand::PutCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) { @@ -1781,12 +1741,12 @@ PutCommand::PutCommand( } } -void PutCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(PutCommand::Name()); - ret->append(" "); - ret->append(" [--" + ARG_TTL + "]"); - ret->append("\n"); +void PutCommand::Help(string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); } void PutCommand::DoCommand() { @@ -1810,43 +1770,43 @@ const char* DBQuerierCommand::GET_CMD = "get"; const char* DBQuerierCommand::PUT_CMD = "put"; const char* DBQuerierCommand::DELETE_CMD = "delete"; -DBQuerierCommand::DBQuerierCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +DBQuerierCommand::DBQuerierCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { } -void DBQuerierCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(DBQuerierCommand::Name()); - ret->append(" [--" + ARG_TTL + "]"); - ret->append("\n"); - ret->append(" Starts a REPL shell. Type help for list of available " +void DBQuerierCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBQuerierCommand::Name()); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); + ret.append(" Starts a REPL shell. 
Type help for list of available " "commands."); - ret->append("\n"); + ret.append("\n"); } void DBQuerierCommand::DoCommand() { if (!db_) { return; } + ReadOptions read_options; WriteOptions write_options; - std::string line; - std::string key; - std::string value; - while (getline(std::cin, line, '\n')) { - // Parse line into std::vector - std::vector tokens; + string line; + string key; + string value; + while (getline(cin, line, '\n')) { + + // Parse line into vector + vector tokens; size_t pos = 0; while (true) { size_t pos2 = line.find(' ', pos); - if (pos2 == std::string::npos) { + if (pos2 == string::npos) { break; } tokens.push_back(line.substr(pos, pos2-pos)); @@ -1854,7 +1814,7 @@ void DBQuerierCommand::DoCommand() { } tokens.push_back(line.substr(pos)); - const std::string& cmd = tokens[0]; + const string& cmd = tokens[0]; if (cmd == HELP_CMD) { fprintf(stdout, @@ -1885,18 +1845,16 @@ void DBQuerierCommand::DoCommand() { } } -CheckConsistencyCommand::CheckConsistencyCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags) : +CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags) : LDBCommand(options, flags, false, BuildCmdLineOptions({})) { } -void CheckConsistencyCommand::Help(std::string* ret) { - ret->append(" "); - ret->append(CheckConsistencyCommand::Name()); - ret->append("\n"); +void CheckConsistencyCommand::Help(string& ret) { + ret.append(" "); + ret.append(CheckConsistencyCommand::Name()); + ret.append("\n"); } void CheckConsistencyCommand::DoCommand() { diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index b42d779c3..9ffe0eabc 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -4,15 +4,12 @@ // of patent rights can be found in the PATENTS file in the same directory. 
// #pragma once -#include -#include #include #include #include +#include #include -#include -#include -#include +#include #include "db/version_set.h" #include "rocksdb/env.h" @@ -26,34 +23,39 @@ #include "util/string_util.h" #include "utilities/ttl/db_ttl_impl.h" +using std::string; +using std::map; +using std::vector; +using std::ostringstream; + namespace rocksdb { class LDBCommand { - public: +public: // Command-line arguments - static const std::string ARG_DB; - static const std::string ARG_HEX; - static const std::string ARG_KEY_HEX; - static const std::string ARG_VALUE_HEX; - static const std::string ARG_TTL; - static const std::string ARG_TTL_START; - static const std::string ARG_TTL_END; - static const std::string ARG_TIMESTAMP; - static const std::string ARG_FROM; - static const std::string ARG_TO; - static const std::string ARG_MAX_KEYS; - static const std::string ARG_BLOOM_BITS; - static const std::string ARG_FIX_PREFIX_LEN; - static const std::string ARG_COMPRESSION_TYPE; - static const std::string ARG_BLOCK_SIZE; - static const std::string ARG_AUTO_COMPACTION; - static const std::string ARG_WRITE_BUFFER_SIZE; - static const std::string ARG_FILE_SIZE; - static const std::string ARG_CREATE_IF_MISSING; + static const string ARG_DB; + static const string ARG_HEX; + static const string ARG_KEY_HEX; + static const string ARG_VALUE_HEX; + static const string ARG_TTL; + static const string ARG_TTL_START; + static const string ARG_TTL_END; + static const string ARG_TIMESTAMP; + static const string ARG_FROM; + static const string ARG_TO; + static const string ARG_MAX_KEYS; + static const string ARG_BLOOM_BITS; + static const string ARG_FIX_PREFIX_LEN; + static const string ARG_COMPRESSION_TYPE; + static const string ARG_BLOCK_SIZE; + static const string ARG_AUTO_COMPACTION; + static const string ARG_WRITE_BUFFER_SIZE; + static const string ARG_FILE_SIZE; + static const string ARG_CREATE_IF_MISSING; static LDBCommand* InitFromCmdLineArgs( - const std::vector& args, + const vector& args, const Options& options, const LDBOptions& ldb_options ); @@ -121,8 +123,8 @@ class LDBCommand { exec_state_.Reset(); } - static std::string HexToString(const std::string& str) { - std::string parsed; + static string HexToString(const string& str) { + string parsed; if (str[0] != '0' || str[1] != 'x') { fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", str.c_str()); @@ -138,8 +140,8 @@ class LDBCommand { return parsed; } - static std::string StringToHex(const std::string& str) { - std::string result = "0x"; + static string StringToHex(const string& str) { + string result = "0x"; char buf[10]; for (size_t i = 0; i < str.length(); i++) { snprintf(buf, 10, "%02X", (unsigned char)str[i]); @@ -153,7 +155,7 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; - std::string db_path_; + string db_path_; DB* db_; DBWithTTL* db_ttl_; @@ -178,24 +180,21 @@ protected: /** * Map of options passed on the command-line. */ - const std::map option_map_; + const map option_map_; /** * Flags passed on the command-line. 
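 * A minimal sketch of the distinction, assuming the usual ldb invocation
 * style (the exact parsing lives in InitFromCmdLineArgs):
 *
 *   ./ldb --db=/tmp/testdb scan --hex
 *
 * Roughly, "--db=/tmp/testdb" ends up in option_map_ as the pair
 * {"db", "/tmp/testdb"}, while the bare switch "--hex" ends up in this
 * flags_ vector.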
*/ - const std::vector flags_; + const vector flags_; /** List of command-line options valid for this command */ - const std::vector valid_cmd_line_options_; + const vector valid_cmd_line_options_; - bool ParseKeyValue(const std::string& line, - std::string* key, std::string* value, - bool is_key_hex, bool is_value_hex); + bool ParseKeyValue(const string& line, string* key, string* value, + bool is_key_hex, bool is_value_hex); - LDBCommand(const std::map& options, - const std::vector& flags, - bool is_read_only, - const std::vector& valid_cmd_line_options) : + LDBCommand(const map& options, const vector& flags, + bool is_read_only, const vector& valid_cmd_line_options) : db_(nullptr), is_read_only_(is_read_only), is_key_hex_(false), @@ -206,7 +205,7 @@ protected: flags_(flags), valid_cmd_line_options_(valid_cmd_line_options) { - auto itr = options.find(ARG_DB); + map::const_iterator itr = options.find(ARG_DB); if (itr != options.end()) { db_path_ = itr->second; } @@ -237,7 +236,7 @@ protected: st = DB::Open(opt, db_path_, &db_); } if (!st.ok()) { - std::string msg = st.ToString(); + string msg = st.ToString(); exec_state_ = LDBCommandExecuteResult::FAILED(msg); } @@ -251,33 +250,29 @@ protected: } } - static std::string PrintKeyValue( - const std::string& key, const std::string& value, - bool is_key_hex, bool is_value_hex) { - std::string result; + static string PrintKeyValue(const string& key, const string& value, + bool is_key_hex, bool is_value_hex) { + string result; result.append(is_key_hex ? StringToHex(key) : key); result.append(DELIM); result.append(is_value_hex ? StringToHex(value) : value); return result; } - static std::string PrintKeyValue( - const std::string& key, const std::string& value, - bool is_hex) { + static string PrintKeyValue(const string& key, const string& value, + bool is_hex) { return PrintKeyValue(key, value, is_hex, is_hex); } /** - * Return true if the specified flag is present in the specified - * flags vector + * Return true if the specified flag is present in the specified flags vector */ - static bool IsFlagPresent( - const std::vector& flags, const std::string& flag) { + static bool IsFlagPresent(const vector& flags, const string& flag) { return (std::find(flags.begin(), flags.end(), flag) != flags.end()); } - static std::string HelpRangeCmdArgs() { - std::ostringstream str_stream; + static string HelpRangeCmdArgs() { + ostringstream str_stream; str_stream << " "; str_stream << "[--" << ARG_FROM << "] "; str_stream << "[--" << ARG_TO << "] "; @@ -289,35 +284,32 @@ protected: * used by this command. It includes the common options and the ones * passed in. 
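 * For example (a sketch based on the common list initialised below), a
 * command that only needs TTL and hex handling might call
 *
 *   BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX});
 *
 * and the returned vector would start with ARG_DB, ARG_BLOOM_BITS and the
 * other common options, followed by the names passed in.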
*/ - std::vector BuildCmdLineOptions( - std::vector options) { - std::vector ret = { - ARG_DB, ARG_BLOOM_BITS, - ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, - ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, - ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; + vector BuildCmdLineOptions(vector options) { + vector ret = {ARG_DB, ARG_BLOOM_BITS, + ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, + ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, + ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; ret.insert(ret.end(), options.begin(), options.end()); return ret; } - bool ParseIntOption(const std::map& options, - const std::string& option, - int* value, LDBCommandExecuteResult* exec_state); + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); - bool ParseStringOption(const std::map& options, - const std::string& option, std::string* value); + bool ParseStringOption(const map& options, + const string& option, string* value); Options options_; LDBOptions ldb_options_; - private: +private: /** * Interpret command line options and flags to determine if the key * should be input/output in hex. */ - bool IsKeyHex(const std::map& options, - const std::vector& flags) { + bool IsKeyHex(const map& options, + const vector& flags) { return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_KEY_HEX) || ParseBooleanOption(options, ARG_HEX, false) || @@ -328,8 +320,8 @@ protected: * Interpret command line options and flags to determine if the value * should be input/output in hex. */ - bool IsValueHex(const std::map& options, - const std::vector& flags) { + bool IsValueHex(const map& options, + const vector& flags) { return (IsFlagPresent(flags, ARG_HEX) || IsFlagPresent(flags, ARG_VALUE_HEX) || ParseBooleanOption(options, ARG_HEX, false) || @@ -342,13 +334,12 @@ protected: * Throws an exception if the value of the option is not * "true" or "false" (case insensitive). */ - bool ParseBooleanOption( - const std::map& options, - const std::string& option, bool default_val) { + bool ParseBooleanOption(const map& options, + const string& option, bool default_val) { - auto itr = options.find(option); + map::const_iterator itr = options.find(option); if (itr != options.end()) { - std::string option_val = itr->second; + string option_val = itr->second; return StringToBool(itr->second); } return default_val; @@ -359,7 +350,7 @@ protected: * val must be either true or false (case insensitive). * Otherwise an exception is thrown. 
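 * A few illustrative cases, assuming the behaviour described above:
 *
 *   StringToBool("true");   // returns true
 *   StringToBool("FALSE");  // returns false (input is lower-cased first)
 *   StringToBool("yes");    // neither "true" nor "false" -> throws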
*/ - bool StringToBool(std::string val) { + bool StringToBool(string val) { std::transform(val.begin(), val.end(), val.begin(), ::tolower); if (val == "true") { return true; @@ -371,165 +362,161 @@ protected: } static LDBCommand* SelectCommand( - const std::string& cmd, - const std::vector& cmdParams, - const std::map& option_map, - const std::vector& flags + const string& cmd, + const vector& cmdParams, + const map& option_map, + const vector& flags ); }; class CompactorCommand: public LDBCommand { - public: - static std::string Name() { return "compact"; } +public: + static string Name() { return "compact"; } - CompactorCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + CompactorCommand(const vector& params, + const map& options, const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); - private: +private: bool null_from_; - std::string from_; + string from_; bool null_to_; - std::string to_; + string to_; }; class DBDumperCommand: public LDBCommand { - public: - static std::string Name() { return "dump"; } +public: + static string Name() { return "dump"; } - DBDumperCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + DBDumperCommand(const vector& params, + const map& options, const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); - private: +private: bool null_from_; - std::string from_; + string from_; bool null_to_; - std::string to_; - uint64_t max_keys_; - std::string delim_; + string to_; + int max_keys_; + string delim_; bool count_only_; bool count_delim_; bool print_stats_; - static const std::string ARG_COUNT_ONLY; - static const std::string ARG_COUNT_DELIM; - static const std::string ARG_STATS; - static const std::string ARG_TTL_BUCKET; + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_TTL_BUCKET; }; class InternalDumpCommand: public LDBCommand { - public: - static std::string Name() { return "idump"; } +public: + static string Name() { return "idump"; } - InternalDumpCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); - private: +private: bool has_from_; - std::string from_; + string from_; bool has_to_; - std::string to_; + string to_; int max_keys_; - std::string delim_; + string delim_; bool count_only_; bool count_delim_; bool print_stats_; bool is_input_key_hex_; - static const std::string ARG_DELIM; - static const std::string ARG_COUNT_ONLY; - static const std::string ARG_COUNT_DELIM; - static const std::string ARG_STATS; - static const std::string ARG_INPUT_KEY_HEX; + static const string ARG_DELIM; + static const string ARG_COUNT_ONLY; + static const string ARG_COUNT_DELIM; + static const string ARG_STATS; + static const string ARG_INPUT_KEY_HEX; }; class DBLoaderCommand: public LDBCommand { - public: - static std::string Name() { return "load"; } +public: + static string Name() { return "load"; } - DBLoaderCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + DBLoaderCommand(string& db_name, vector& args); - static void Help(std::string* ret); + DBLoaderCommand(const vector& params, + const map& options, const vector& 
flags); + + static void Help(string& ret); virtual void DoCommand(); virtual Options PrepareOptionsForOpenDB(); - private: +private: bool create_if_missing_; bool disable_wal_; bool bulk_load_; bool compact_; - static const std::string ARG_DISABLE_WAL; - static const std::string ARG_BULK_LOAD; - static const std::string ARG_COMPACT; + static const string ARG_DISABLE_WAL; + static const string ARG_BULK_LOAD; + static const string ARG_COMPACT; }; class ManifestDumpCommand: public LDBCommand { - public: - static std::string Name() { return "manifest_dump"; } +public: + static string Name() { return "manifest_dump"; } - ManifestDumpCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + ManifestDumpCommand(const vector& params, + const map& options, const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); virtual bool NoDBOpen() { return true; } - private: +private: bool verbose_; - std::string path_; + string path_; - static const std::string ARG_VERBOSE; - static const std::string ARG_PATH; + static const string ARG_VERBOSE; + static const string ARG_PATH; }; class ListColumnFamiliesCommand : public LDBCommand { public: - static std::string Name() { return "list_column_families"; } + static string Name() { return "list_column_families"; } - ListColumnFamiliesCommand( - const std::vector& params, - const std::map& options, - const std::vector& flags); + ListColumnFamiliesCommand(const vector& params, + const map& options, + const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); virtual bool NoDBOpen() { return true; } private: - std::string dbname_; + string dbname_; }; class ReduceDBLevelsCommand : public LDBCommand { - public: - static std::string Name() { return "reduce_levels"; } +public: + static string Name() { return "reduce_levels"; } - ReduceDBLevelsCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + ReduceDBLevelsCommand(const vector& params, + const map& options, const vector& flags); virtual Options PrepareOptionsForOpenDB(); @@ -539,179 +526,169 @@ class ReduceDBLevelsCommand : public LDBCommand { return true; } - static void Help(std::string* msg); + static void Help(string& msg); - static std::vector PrepareArgs( - const std::string& db_path, - int new_levels, + static vector PrepareArgs(const string& db_path, int new_levels, bool print_old_level = false); - private: +private: int old_levels_; int new_levels_; bool print_old_levels_; - static const std::string ARG_NEW_LEVELS; - static const std::string ARG_PRINT_OLD_LEVELS; + static const string ARG_NEW_LEVELS; + static const string ARG_PRINT_OLD_LEVELS; Status GetOldNumOfLevels(Options& opt, int* levels); }; class ChangeCompactionStyleCommand : public LDBCommand { - public: - static std::string Name() { return "change_compaction_style"; } +public: + static string Name() { return "change_compaction_style"; } - ChangeCompactionStyleCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + ChangeCompactionStyleCommand(const vector& params, + const map& options, const vector& flags); virtual Options PrepareOptionsForOpenDB(); virtual void DoCommand(); - static void Help(std::string* msg); + static void Help(string& msg); - private: +private: int old_compaction_style_; int new_compaction_style_; - static const std::string ARG_OLD_COMPACTION_STYLE; - static const std::string 
ARG_NEW_COMPACTION_STYLE; + static const string ARG_OLD_COMPACTION_STYLE; + static const string ARG_NEW_COMPACTION_STYLE; }; class WALDumperCommand : public LDBCommand { - public: - static std::string Name() { return "dump_wal"; } +public: + static string Name() { return "dump_wal"; } - WALDumperCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + WALDumperCommand(const vector& params, + const map& options, const vector& flags); virtual bool NoDBOpen() { return true; } - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); - private: +private: bool print_header_; - std::string wal_file_; + string wal_file_; bool print_values_; - static const std::string ARG_WAL_FILE; - static const std::string ARG_PRINT_HEADER; - static const std::string ARG_PRINT_VALUE; + static const string ARG_WAL_FILE; + static const string ARG_PRINT_HEADER; + static const string ARG_PRINT_VALUE; }; class GetCommand : public LDBCommand { - public: - static std::string Name() { return "get"; } +public: + static string Name() { return "get"; } - GetCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + GetCommand(const vector& params, const map& options, + const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); - private: - std::string key_; +private: + string key_; }; class ApproxSizeCommand : public LDBCommand { - public: - static std::string Name() { return "approxsize"; } +public: + static string Name() { return "approxsize"; } - ApproxSizeCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + ApproxSizeCommand(const vector& params, + const map& options, const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); - private: - std::string start_key_; - std::string end_key_; +private: + string start_key_; + string end_key_; }; class BatchPutCommand : public LDBCommand { - public: - static std::string Name() { return "batchput"; } +public: + static string Name() { return "batchput"; } - BatchPutCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + BatchPutCommand(const vector& params, + const map& options, const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); virtual Options PrepareOptionsForOpenDB(); - private: +private: /** * The key-values to be inserted. 
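 * As a sketch, assuming batchput takes alternating key/value arguments,
 * an invocation such as
 *
 *   ./ldb --db=/tmp/testdb batchput k1 v1 k2 v2
 *
 * would leave key_values_ holding {("k1", "v1"), ("k2", "v2")}, which
 * DoCommand then writes out in one batched write.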
*/ - std::vector> key_values_; + vector> key_values_; }; class ScanCommand : public LDBCommand { - public: - static std::string Name() { return "scan"; } +public: + static string Name() { return "scan"; } - ScanCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + ScanCommand(const vector& params, const map& options, + const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); - private: - std::string start_key_; - std::string end_key_; +private: + string start_key_; + string end_key_; bool start_key_specified_; bool end_key_specified_; int max_keys_scanned_; }; class DeleteCommand : public LDBCommand { - public: - static std::string Name() { return "delete"; } +public: + static string Name() { return "delete"; } - DeleteCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + DeleteCommand(const vector& params, + const map& options, const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); - private: - std::string key_; +private: + string key_; }; class PutCommand : public LDBCommand { - public: - static std::string Name() { return "put"; } +public: + static string Name() { return "put"; } - PutCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + PutCommand(const vector& params, const map& options, + const vector& flags); virtual void DoCommand(); - static void Help(std::string* ret); + static void Help(string& ret); virtual Options PrepareOptionsForOpenDB(); - private: - std::string key_; - std::string value_; +private: + string key_; + string value_; }; /** @@ -719,18 +696,17 @@ class PutCommand : public LDBCommand { * get/put/delete. 
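 * An example session, assuming the command names wired up in DoCommand
 * above (get/put/delete plus help) and showing a "> " prompt purely for
 * illustration:
 *
 *   ./ldb --db=/tmp/testdb query
 *   > put mykey myvalue
 *   > get mykey
 *   myvalue
 *   > delete mykey
 *
 * Input lines are split on single spaces, so keys and values are expected
 * not to contain spaces.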
*/ class DBQuerierCommand: public LDBCommand { - public: - static std::string Name() { return "query"; } +public: + static string Name() { return "query"; } - DBQuerierCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + DBQuerierCommand(const vector& params, + const map& options, const vector& flags); - static void Help(std::string* ret); + static void Help(string& ret); virtual void DoCommand(); - private: +private: static const char* HELP_CMD; static const char* GET_CMD; static const char* PUT_CMD; @@ -738,12 +714,11 @@ class DBQuerierCommand: public LDBCommand { }; class CheckConsistencyCommand : public LDBCommand { - public: - static std::string Name() { return "checkconsistency"; } +public: + static string Name() { return "checkconsistency"; } - CheckConsistencyCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + CheckConsistencyCommand(const vector& params, + const map& options, const vector& flags); virtual void DoCommand(); @@ -751,7 +726,7 @@ class CheckConsistencyCommand : public LDBCommand { return true; } - static void Help(std::string* ret); + static void Help(string& ret); }; } // namespace rocksdb diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h index 48a4b495c..b8e6c4634 100644 --- a/util/ldb_cmd_execute_result.h +++ b/util/ldb_cmd_execute_result.h @@ -15,7 +15,7 @@ public: LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} - LDBCommandExecuteResult(State state, const std::string& msg) : + LDBCommandExecuteResult(State state, std::string& msg) : state_(state), message_(msg) {} std::string ToString() { @@ -52,11 +52,11 @@ public: return state_ == EXEC_FAILED; } - static LDBCommandExecuteResult SUCCEED(const std::string& msg) { + static LDBCommandExecuteResult SUCCEED(std::string msg) { return LDBCommandExecuteResult(EXEC_SUCCEED, msg); } - static LDBCommandExecuteResult FAILED(const std::string& msg) { + static LDBCommandExecuteResult FAILED(std::string msg) { return LDBCommandExecuteResult(EXEC_FAILED, msg); } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index 9824c0210..bb6c8ffca 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -24,7 +24,7 @@ class LDBCommandRunner { public: static void PrintHelp(const char* exec_name) { - std::string ret; + string ret; ret.append("ldb - LevelDB Tool"); ret.append("\n\n"); @@ -59,26 +59,26 @@ public: ret.append("\n\n"); ret.append("Data Access Commands:\n"); - PutCommand::Help(&ret); - GetCommand::Help(&ret); - BatchPutCommand::Help(&ret); - ScanCommand::Help(&ret); - DeleteCommand::Help(&ret); - DBQuerierCommand::Help(&ret); - ApproxSizeCommand::Help(&ret); - CheckConsistencyCommand::Help(&ret); + PutCommand::Help(ret); + GetCommand::Help(ret); + BatchPutCommand::Help(ret); + ScanCommand::Help(ret); + DeleteCommand::Help(ret); + DBQuerierCommand::Help(ret); + ApproxSizeCommand::Help(ret); + CheckConsistencyCommand::Help(ret); ret.append("\n\n"); ret.append("Admin Commands:\n"); - WALDumperCommand::Help(&ret); - CompactorCommand::Help(&ret); - ReduceDBLevelsCommand::Help(&ret); - ChangeCompactionStyleCommand::Help(&ret); - DBDumperCommand::Help(&ret); - DBLoaderCommand::Help(&ret); - ManifestDumpCommand::Help(&ret); - ListColumnFamiliesCommand::Help(&ret); - InternalDumpCommand::Help(&ret); + WALDumperCommand::Help(ret); + CompactorCommand::Help(ret); + ReduceDBLevelsCommand::Help(ret); + ChangeCompactionStyleCommand::Help(ret); + DBDumperCommand::Help(ret); + DBLoaderCommand::Help(ret); + 
ManifestDumpCommand::Help(ret); + ListColumnFamiliesCommand::Help(ret); + InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); } From 9fd65e566794f03058faefb65c99b61b08f3c0db Mon Sep 17 00:00:00 2001 From: baotiao Date: Mon, 3 Nov 2014 04:42:28 +0800 Subject: [PATCH 396/829] add make clean in examples makefile --- examples/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/Makefile b/examples/Makefile index 2567fdf86..97a4b2850 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,5 +1,7 @@ include ../build_config.mk +.PHONY: main clean + all: simple_example column_families_example simple_example: simple_example.cc @@ -7,3 +9,6 @@ simple_example: simple_example.cc column_families_example: column_families_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +clean: simple_example column_families_example + rm -rf ./simple_example ./column_families_example From b060d30065a89d803fdd3cf7be8ea5713e90e99e Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 2 Nov 2014 23:48:40 +0100 Subject: [PATCH 397/829] [RocksJava] Build fix after options refactoring --- java/Makefile | 2 +- java/rocksjni/options.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/java/Makefile b/java/Makefile index 04eac63dd..b56ddbb44 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.ColumnFamilyOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator 
org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 0f4c19232..181a8d317 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -12,8 +12,9 @@ #include #include "include/org_rocksdb_Options.h" -#include "include/org_rocksdb_DBOptions.h" -#include "include/org_rocksdb_ColumnFamilyOptions.h" +//TODO(fyrz) to be commented in with options refactoring pull requests +//#include "include/org_rocksdb_DBOptions.h" +//#include "include/org_rocksdb_ColumnFamilyOptions.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" From 94e31ac2273e2e6a446b5bfc5f5c1334f0eb6416 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 22:38:08 +0100 Subject: [PATCH 398/829] [RocksJava] Extend Options with DBOptions implementation [RocksJava] Included DBOptionsTest and refactored OptionsTest Summary: Options refactoring - Split Part2 Test Plan: make rocksdbjava make jtest Reviewers: yhchiang, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28017 --- java/Makefile | 3 +- java/org/rocksdb/DBOptions.java | 569 +++++++++++++++++++++++ java/org/rocksdb/Options.java | 7 +- java/org/rocksdb/test/DBOptionsTest.java | 228 +++++++++ java/org/rocksdb/test/OptionsTest.java | 200 +------- java/rocksjni/options.cc | 7 +- java/rocksjni/portal.h | 65 +++ 7 files changed, 874 insertions(+), 205 deletions(-) create mode 100644 java/org/rocksdb/DBOptions.java create mode 100644 java/org/rocksdb/test/DBOptionsTest.java diff --git a/java/Makefile b/java/Makefile index b56ddbb44..a6d3c95f3 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig 
org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -40,6 +40,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BlockBasedTableConfigTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DBOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java new file mode 100644 index 000000000..6ab276755 --- /dev/null +++ b/java/org/rocksdb/DBOptions.java @@ -0,0 +1,569 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * DBOptions to control the behavior of a database. It will be used + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). + * + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. + */ +public class DBOptions extends RocksObject implements DBOptionsInterface { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct DBOptions. + * + * This constructor will create (by allocating a block of memory) + * an {@code rocksdb::DBOptions} in the c++ side. 
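 * A minimal configuration sketch (each setter below returns {@code this},
 * so calls can be chained; opening a database with a DBOptions instance
 * additionally needs column-family handling, which is not shown here):
 *
 *   DBOptions opt = new DBOptions()
 *       .setCreateIfMissing(true)
 *       .setMaxBackgroundCompactions(4)
 *       .setMaxOpenFiles(512);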
+ */ + public DBOptions() { + super(); + numShardBits_ = DEFAULT_NUM_SHARD_BITS; + newDBOptions(); + } + + @Override + public DBOptions setCreateIfMissing(boolean flag) { + assert(isInitialized()); + setCreateIfMissing(nativeHandle_, flag); + return this; + } + + @Override + public boolean createIfMissing() { + assert(isInitialized()); + return createIfMissing(nativeHandle_); + } + + @Override + public DBOptions setCreateMissingColumnFamilies(boolean flag) { + assert(isInitialized()); + setCreateMissingColumnFamilies(nativeHandle_, flag); + return this; + } + + @Override + public boolean createMissingColumnFamilies() { + assert(isInitialized()); + return createMissingColumnFamilies(nativeHandle_); + } + + @Override + public DBOptions setErrorIfExists(boolean errorIfExists) { + assert(isInitialized()); + setErrorIfExists(nativeHandle_, errorIfExists); + return this; + } + + @Override + public boolean errorIfExists() { + assert(isInitialized()); + return errorIfExists(nativeHandle_); + } + + @Override + public DBOptions setParanoidChecks(boolean paranoidChecks) { + assert(isInitialized()); + setParanoidChecks(nativeHandle_, paranoidChecks); + return this; + } + + @Override + public boolean paranoidChecks() { + assert(isInitialized()); + return paranoidChecks(nativeHandle_); + } + + @Override + public DBOptions setRateLimiterConfig(RateLimiterConfig config) { + rateLimiterConfig_ = config; + setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); + return this; + } + + @Override + public DBOptions setMaxOpenFiles(int maxOpenFiles) { + assert(isInitialized()); + setMaxOpenFiles(nativeHandle_, maxOpenFiles); + return this; + } + + @Override + public int maxOpenFiles() { + assert(isInitialized()); + return maxOpenFiles(nativeHandle_); + } + + @Override + public DBOptions setMaxTotalWalSize(long maxTotalWalSize) { + assert(isInitialized()); + setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); + return this; + } + + @Override + public long maxTotalWalSize() { + assert(isInitialized()); + return maxTotalWalSize(nativeHandle_); + } + + @Override + public DBOptions createStatistics() { + assert(isInitialized()); + createStatistics(nativeHandle_); + return this; + } + + @Override + public Statistics statisticsPtr() { + assert(isInitialized()); + + long statsPtr = statisticsPtr(nativeHandle_); + if(statsPtr == 0) { + createStatistics(); + statsPtr = statisticsPtr(nativeHandle_); + } + + return new Statistics(statsPtr); + } + + @Override + public DBOptions setDisableDataSync(boolean disableDataSync) { + assert(isInitialized()); + setDisableDataSync(nativeHandle_, disableDataSync); + return this; + } + + @Override + public boolean disableDataSync() { + assert(isInitialized()); + return disableDataSync(nativeHandle_); + } + + @Override + public DBOptions setUseFsync(boolean useFsync) { + assert(isInitialized()); + setUseFsync(nativeHandle_, useFsync); + return this; + } + + @Override + public boolean useFsync() { + assert(isInitialized()); + return useFsync(nativeHandle_); + } + + @Override + public DBOptions setDbLogDir(String dbLogDir) { + assert(isInitialized()); + setDbLogDir(nativeHandle_, dbLogDir); + return this; + } + + @Override + public String dbLogDir() { + assert(isInitialized()); + return dbLogDir(nativeHandle_); + } + + @Override + public DBOptions setWalDir(String walDir) { + assert(isInitialized()); + setWalDir(nativeHandle_, walDir); + return this; + } + + @Override + public String walDir() { + assert(isInitialized()); + return walDir(nativeHandle_); + } + + @Override + public 
DBOptions setDeleteObsoleteFilesPeriodMicros(long micros) { + assert(isInitialized()); + setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); + return this; + } + + @Override + public long deleteObsoleteFilesPeriodMicros() { + assert(isInitialized()); + return deleteObsoleteFilesPeriodMicros(nativeHandle_); + } + + @Override + public DBOptions setMaxBackgroundCompactions(int maxBackgroundCompactions) { + assert(isInitialized()); + setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); + return this; + } + + @Override + public int maxBackgroundCompactions() { + assert(isInitialized()); + return maxBackgroundCompactions(nativeHandle_); + } + + @Override + public DBOptions setMaxBackgroundFlushes(int maxBackgroundFlushes) { + assert(isInitialized()); + setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); + return this; + } + + @Override + public int maxBackgroundFlushes() { + assert(isInitialized()); + return maxBackgroundFlushes(nativeHandle_); + } + + @Override + public DBOptions setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException { + assert(isInitialized()); + setMaxLogFileSize(nativeHandle_, maxLogFileSize); + return this; + } + + @Override + public long maxLogFileSize() { + assert(isInitialized()); + return maxLogFileSize(nativeHandle_); + } + + @Override + public DBOptions setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException{ + assert(isInitialized()); + setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); + return this; + } + + @Override + public long logFileTimeToRoll() { + assert(isInitialized()); + return logFileTimeToRoll(nativeHandle_); + } + + @Override + public DBOptions setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException{ + assert(isInitialized()); + setKeepLogFileNum(nativeHandle_, keepLogFileNum); + return this; + } + + @Override + public long keepLogFileNum() { + assert(isInitialized()); + return keepLogFileNum(nativeHandle_); + } + + @Override + public DBOptions setMaxManifestFileSize(long maxManifestFileSize) { + assert(isInitialized()); + setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); + return this; + } + + @Override + public long maxManifestFileSize() { + assert(isInitialized()); + return maxManifestFileSize(nativeHandle_); + } + + @Override + public DBOptions setTableCacheNumshardbits(int tableCacheNumshardbits) { + assert(isInitialized()); + setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); + return this; + } + + @Override + public int tableCacheNumshardbits() { + assert(isInitialized()); + return tableCacheNumshardbits(nativeHandle_); + } + + @Override + public DBOptions setTableCacheRemoveScanCountLimit(int limit) { + assert(isInitialized()); + setTableCacheRemoveScanCountLimit(nativeHandle_, limit); + return this; + } + + @Override + public int tableCacheRemoveScanCountLimit() { + assert(isInitialized()); + return tableCacheRemoveScanCountLimit(nativeHandle_); + } + + @Override + public DBOptions setWalTtlSeconds(long walTtlSeconds) { + assert(isInitialized()); + setWalTtlSeconds(nativeHandle_, walTtlSeconds); + return this; + } + + @Override + public long walTtlSeconds() { + assert(isInitialized()); + return walTtlSeconds(nativeHandle_); + } + + @Override + public DBOptions setWalSizeLimitMB(long sizeLimitMB) { + assert(isInitialized()); + setWalSizeLimitMB(nativeHandle_, sizeLimitMB); + return this; + } + + @Override + public long walSizeLimitMB() { + assert(isInitialized()); + return walSizeLimitMB(nativeHandle_); + } + + @Override + public DBOptions 
setManifestPreallocationSize(long size) + throws RocksDBException { + assert(isInitialized()); + setManifestPreallocationSize(nativeHandle_, size); + return this; + } + + @Override + public long manifestPreallocationSize() { + assert(isInitialized()); + return manifestPreallocationSize(nativeHandle_); + } + + @Override + public DBOptions setAllowOsBuffer(boolean allowOsBuffer) { + assert(isInitialized()); + setAllowOsBuffer(nativeHandle_, allowOsBuffer); + return this; + } + + @Override + public boolean allowOsBuffer() { + assert(isInitialized()); + return allowOsBuffer(nativeHandle_); + } + + @Override + public DBOptions setAllowMmapReads(boolean allowMmapReads) { + assert(isInitialized()); + setAllowMmapReads(nativeHandle_, allowMmapReads); + return this; + } + + @Override + public boolean allowMmapReads() { + assert(isInitialized()); + return allowMmapReads(nativeHandle_); + } + + @Override + public DBOptions setAllowMmapWrites(boolean allowMmapWrites) { + assert(isInitialized()); + setAllowMmapWrites(nativeHandle_, allowMmapWrites); + return this; + } + + @Override + public boolean allowMmapWrites() { + assert(isInitialized()); + return allowMmapWrites(nativeHandle_); + } + + @Override + public DBOptions setIsFdCloseOnExec(boolean isFdCloseOnExec) { + assert(isInitialized()); + setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); + return this; + } + + @Override + public boolean isFdCloseOnExec() { + assert(isInitialized()); + return isFdCloseOnExec(nativeHandle_); + } + + @Override + public DBOptions setSkipLogErrorOnRecovery(boolean skip) { + assert(isInitialized()); + setSkipLogErrorOnRecovery(nativeHandle_, skip); + return this; + } + + @Override + public boolean skipLogErrorOnRecovery() { + assert(isInitialized()); + return skipLogErrorOnRecovery(nativeHandle_); + } + + @Override + public DBOptions setStatsDumpPeriodSec(int statsDumpPeriodSec) { + assert(isInitialized()); + setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); + return this; + } + + @Override + public int statsDumpPeriodSec() { + assert(isInitialized()); + return statsDumpPeriodSec(nativeHandle_); + } + + @Override + public DBOptions setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { + assert(isInitialized()); + setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); + return this; + } + + @Override + public boolean adviseRandomOnOpen() { + return adviseRandomOnOpen(nativeHandle_); + } + + @Override + public DBOptions setUseAdaptiveMutex(boolean useAdaptiveMutex) { + assert(isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + @Override + public DBOptions setBytesPerSync(long bytesPerSync) { + assert(isInitialized()); + setBytesPerSync(nativeHandle_, bytesPerSync); + return this; + } + + @Override + public long bytesPerSync() { + return bytesPerSync(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. 
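 * A typical lifecycle sketch (not specific to this class; dispose() is
 * inherited from RocksObject):
 *
 *   DBOptions opt = new DBOptions();
 *   try {
 *     // configure and use opt
 *   } finally {
 *     opt.dispose();  // releases the native handle via disposeInternal()
 *   }
 *
 * Otherwise the native memory is only reclaimed when the Java object is
 * garbage collected, as noted in the class-level comment.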
+ */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + static final int DEFAULT_NUM_SHARD_BITS = -1; + + private native void newDBOptions(); + private native void disposeInternal(long handle); + + private native void setCreateIfMissing(long handle, boolean flag); + private native boolean createIfMissing(long handle); + private native void setCreateMissingColumnFamilies( + long handle, boolean flag); + private native boolean createMissingColumnFamilies(long handle); + private native void setErrorIfExists(long handle, boolean errorIfExists); + private native boolean errorIfExists(long handle); + private native void setParanoidChecks( + long handle, boolean paranoidChecks); + private native boolean paranoidChecks(long handle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); + private native void setMaxOpenFiles(long handle, int maxOpenFiles); + private native int maxOpenFiles(long handle); + private native void setMaxTotalWalSize(long handle, + long maxTotalWalSize); + private native long maxTotalWalSize(long handle); + private native void createStatistics(long optHandle); + private native long statisticsPtr(long optHandle); + private native void setDisableDataSync(long handle, boolean disableDataSync); + private native boolean disableDataSync(long handle); + private native boolean useFsync(long handle); + private native void setUseFsync(long handle, boolean useFsync); + private native void setDbLogDir(long handle, String dbLogDir); + private native String dbLogDir(long handle); + private native void setWalDir(long handle, String walDir); + private native String walDir(long handle); + private native void setDeleteObsoleteFilesPeriodMicros( + long handle, long micros); + private native long deleteObsoleteFilesPeriodMicros(long handle); + private native void setMaxBackgroundCompactions( + long handle, int maxBackgroundCompactions); + private native int maxBackgroundCompactions(long handle); + private native void setMaxBackgroundFlushes( + long handle, int maxBackgroundFlushes); + private native int maxBackgroundFlushes(long handle); + private native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws RocksDBException; + private native long maxLogFileSize(long handle); + private native void setLogFileTimeToRoll( + long handle, long logFileTimeToRoll) throws RocksDBException; + private native long logFileTimeToRoll(long handle); + private native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws RocksDBException; + private native long keepLogFileNum(long handle); + private native void setMaxManifestFileSize( + long handle, long maxManifestFileSize); + private native long maxManifestFileSize(long handle); + private native void setTableCacheNumshardbits( + long handle, int tableCacheNumshardbits); + private native int tableCacheNumshardbits(long handle); + private native void setTableCacheRemoveScanCountLimit( + long handle, int limit); + private native int tableCacheRemoveScanCountLimit(long handle); + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + private native long walTtlSeconds(long handle); + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private native long walSizeLimitMB(long handle); + private native void setManifestPreallocationSize( + long handle, long size) throws RocksDBException; + private native long manifestPreallocationSize(long handle); + private native void setAllowOsBuffer( + long handle, boolean 
allowOsBuffer); + private native boolean allowOsBuffer(long handle); + private native void setAllowMmapReads( + long handle, boolean allowMmapReads); + private native boolean allowMmapReads(long handle); + private native void setAllowMmapWrites( + long handle, boolean allowMmapWrites); + private native boolean allowMmapWrites(long handle); + private native void setIsFdCloseOnExec( + long handle, boolean isFdCloseOnExec); + private native boolean isFdCloseOnExec(long handle); + private native void setSkipLogErrorOnRecovery( + long handle, boolean skip); + private native boolean skipLogErrorOnRecovery(long handle); + private native void setStatsDumpPeriodSec( + long handle, int statsDumpPeriodSec); + private native int statsDumpPeriodSec(long handle); + private native void setAdviseRandomOnOpen( + long handle, boolean adviseRandomOnOpen); + private native boolean adviseRandomOnOpen(long handle); + private native void setUseAdaptiveMutex( + long handle, boolean useAdaptiveMutex); + private native boolean useAdaptiveMutex(long handle); + private native void setBytesPerSync( + long handle, long bytesPerSync); + private native long bytesPerSync(long handle); + + int numShardBits_; + RateLimiterConfig rateLimiterConfig_; +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 16db5e166..7ad1e1bf2 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -60,13 +60,14 @@ public class Options extends RocksObject } /** - * Set appropriate parameters for bulk loading. + *

<p>Set appropriate parameters for bulk loading. * The reason that this is a function that returns "this" instead of a * constructor is to enable chaining of multiple similar calls in the future. + * </p> * - * All data will be in level 0 without any automatic compaction. + * <p>All data will be in level 0 without any automatic compaction. * It's recommended to manually call CompactRange(NULL, NULL) before reading - * from the database, because otherwise the read can be very slow. + * from the database, because otherwise the read can be very slow.</p>
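 * A rough usage sketch; the enclosing method's name sits outside this hunk,
 * so prepareForBulkLoad() below is assumed rather than quoted from it:
 *
 *   Options opt = new Options().prepareForBulkLoad();
 *   RocksDB db = RocksDB.open(opt, dbPath);
 *   // ... load data ...
 *   // compact manually before serving reads, as recommended above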

                * * @return the instance of the current Options. */ diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/org/rocksdb/test/DBOptionsTest.java new file mode 100644 index 000000000..0cd2468ea --- /dev/null +++ b/java/org/rocksdb/test/DBOptionsTest.java @@ -0,0 +1,228 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.DBOptions; +import org.rocksdb.DBOptionsInterface; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import java.util.Random; + +public class DBOptionsTest { + static { + RocksDB.loadLibrary(); + } + + public static void testDBOptions(DBOptionsInterface opt) { + Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + { // CreateIfMissing test + boolean boolValue = rand.nextBoolean(); + opt.setCreateIfMissing(boolValue); + assert(opt.createIfMissing() == boolValue); + } + + { // CreateMissingColumnFamilies test + boolean boolValue = rand.nextBoolean(); + opt.setCreateMissingColumnFamilies(boolValue); + assert(opt.createMissingColumnFamilies() == boolValue); + } + + { // ErrorIfExists test + boolean boolValue = rand.nextBoolean(); + opt.setErrorIfExists(boolValue); + assert(opt.errorIfExists() == boolValue); + } + + { // ParanoidChecks test + boolean boolValue = rand.nextBoolean(); + opt.setParanoidChecks(boolValue); + assert(opt.paranoidChecks() == boolValue); + } + + { + // MaxTotalWalSize test + long longValue = rand.nextLong(); + opt.setMaxTotalWalSize(longValue); + assert(opt.maxTotalWalSize() == longValue); + } + + { // MaxOpenFiles test + int intValue = rand.nextInt(); + opt.setMaxOpenFiles(intValue); + assert(opt.maxOpenFiles() == intValue); + } + + { // DisableDataSync test + boolean boolValue = rand.nextBoolean(); + opt.setDisableDataSync(boolValue); + assert(opt.disableDataSync() == boolValue); + } + + { // UseFsync test + boolean boolValue = rand.nextBoolean(); + opt.setUseFsync(boolValue); + assert(opt.useFsync() == boolValue); + } + + { // DbLogDir test + String str = "path/to/DbLogDir"; + opt.setDbLogDir(str); + assert(opt.dbLogDir().equals(str)); + } + + { // WalDir test + String str = "path/to/WalDir"; + opt.setWalDir(str); + assert(opt.walDir().equals(str)); + } + + { // DeleteObsoleteFilesPeriodMicros test + long longValue = rand.nextLong(); + opt.setDeleteObsoleteFilesPeriodMicros(longValue); + assert(opt.deleteObsoleteFilesPeriodMicros() == longValue); + } + + { // MaxBackgroundCompactions test + int intValue = rand.nextInt(); + opt.setMaxBackgroundCompactions(intValue); + assert(opt.maxBackgroundCompactions() == intValue); + } + + { // MaxBackgroundFlushes test + int intValue = rand.nextInt(); + opt.setMaxBackgroundFlushes(intValue); + assert(opt.maxBackgroundFlushes() == intValue); + } + + { // MaxLogFileSize test + try { + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assert(opt.maxLogFileSize() == longValue); + } catch (RocksDBException e) { + System.out.println(e.getMessage()); + assert(false); + } + } + + { // LogFileTimeToRoll test + try { + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assert(opt.logFileTimeToRoll() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // KeepLogFileNum test + try { + long longValue = 
rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assert(opt.keepLogFileNum() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // MaxManifestFileSize test + long longValue = rand.nextLong(); + opt.setMaxManifestFileSize(longValue); + assert(opt.maxManifestFileSize() == longValue); + } + + { // TableCacheNumshardbits test + int intValue = rand.nextInt(); + opt.setTableCacheNumshardbits(intValue); + assert(opt.tableCacheNumshardbits() == intValue); + } + + { // TableCacheRemoveScanCountLimit test + int intValue = rand.nextInt(); + opt.setTableCacheRemoveScanCountLimit(intValue); + assert(opt.tableCacheRemoveScanCountLimit() == intValue); + } + + { // WalTtlSeconds test + long longValue = rand.nextLong(); + opt.setWalTtlSeconds(longValue); + assert(opt.walTtlSeconds() == longValue); + } + + { // ManifestPreallocationSize test + try { + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assert(opt.manifestPreallocationSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // AllowOsBuffer test + boolean boolValue = rand.nextBoolean(); + opt.setAllowOsBuffer(boolValue); + assert(opt.allowOsBuffer() == boolValue); + } + + { // AllowMmapReads test + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapReads(boolValue); + assert(opt.allowMmapReads() == boolValue); + } + + { // AllowMmapWrites test + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapWrites(boolValue); + assert(opt.allowMmapWrites() == boolValue); + } + + { // IsFdCloseOnExec test + boolean boolValue = rand.nextBoolean(); + opt.setIsFdCloseOnExec(boolValue); + assert(opt.isFdCloseOnExec() == boolValue); + } + + { // SkipLogErrorOnRecovery test + boolean boolValue = rand.nextBoolean(); + opt.setSkipLogErrorOnRecovery(boolValue); + assert(opt.skipLogErrorOnRecovery() == boolValue); + } + + { // StatsDumpPeriodSec test + int intValue = rand.nextInt(); + opt.setStatsDumpPeriodSec(intValue); + assert(opt.statsDumpPeriodSec() == intValue); + } + + { // AdviseRandomOnOpen test + boolean boolValue = rand.nextBoolean(); + opt.setAdviseRandomOnOpen(boolValue); + assert(opt.adviseRandomOnOpen() == boolValue); + } + + { // UseAdaptiveMutex test + boolean boolValue = rand.nextBoolean(); + opt.setUseAdaptiveMutex(boolValue); + assert(opt.useAdaptiveMutex() == boolValue); + } + + { // BytesPerSync test + long longValue = rand.nextLong(); + opt.setBytesPerSync(longValue); + assert(opt.bytesPerSync() == longValue); + } + } + + public static void main(String[] args) { + DBOptions opt = new DBOptions(); + testDBOptions(opt); + opt.dispose(); + System.out.println("Passed DBOptionsTest"); + } +} diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index ef88e3503..ea8da6c66 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -6,10 +6,11 @@ package org.rocksdb.test; import java.util.Random; + +import org.rocksdb.DBOptions; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.Options; -import org.rocksdb.test.PlatformRandomHelper; public class OptionsTest { @@ -20,203 +21,8 @@ public class OptionsTest { Options opt = new Options(); Random rand = PlatformRandomHelper. 
getPlatformSpecificRandomFactory(); - { // CreateIfMissing test - boolean boolValue = rand.nextBoolean(); - opt.setCreateIfMissing(boolValue); - assert(opt.createIfMissing() == boolValue); - } - - { // CreateMissingColumnFamilies test - boolean boolValue = rand.nextBoolean(); - opt.setCreateMissingColumnFamilies(boolValue); - assert(opt.createMissingColumnFamilies() == boolValue); - } - - { // ErrorIfExists test - boolean boolValue = rand.nextBoolean(); - opt.setErrorIfExists(boolValue); - assert(opt.errorIfExists() == boolValue); - } - - { // ParanoidChecks test - boolean boolValue = rand.nextBoolean(); - opt.setParanoidChecks(boolValue); - assert(opt.paranoidChecks() == boolValue); - } - - { - // MaxTotalWalSize test - long longValue = rand.nextLong(); - opt.setMaxTotalWalSize(longValue); - assert(opt.maxTotalWalSize() == longValue); - } - - { // MaxOpenFiles test - int intValue = rand.nextInt(); - opt.setMaxOpenFiles(intValue); - assert(opt.maxOpenFiles() == intValue); - } - - { // DisableDataSync test - boolean boolValue = rand.nextBoolean(); - opt.setDisableDataSync(boolValue); - assert(opt.disableDataSync() == boolValue); - } - - { // UseFsync test - boolean boolValue = rand.nextBoolean(); - opt.setUseFsync(boolValue); - assert(opt.useFsync() == boolValue); - } - - { // DbLogDir test - String str = "path/to/DbLogDir"; - opt.setDbLogDir(str); - assert(opt.dbLogDir().equals(str)); - } - { // WalDir test - String str = "path/to/WalDir"; - opt.setWalDir(str); - assert(opt.walDir().equals(str)); - } - - { // DeleteObsoleteFilesPeriodMicros test - long longValue = rand.nextLong(); - opt.setDeleteObsoleteFilesPeriodMicros(longValue); - assert(opt.deleteObsoleteFilesPeriodMicros() == longValue); - } - - { // MaxBackgroundCompactions test - int intValue = rand.nextInt(); - opt.setMaxBackgroundCompactions(intValue); - assert(opt.maxBackgroundCompactions() == intValue); - } - - { // MaxBackgroundFlushes test - int intValue = rand.nextInt(); - opt.setMaxBackgroundFlushes(intValue); - assert(opt.maxBackgroundFlushes() == intValue); - } - - { // MaxLogFileSize test - try { - long longValue = rand.nextLong(); - opt.setMaxLogFileSize(longValue); - assert(opt.maxLogFileSize() == longValue); - } catch (RocksDBException e) { - System.out.println(e.getMessage()); - assert(false); - } - } - - { // LogFileTimeToRoll test - try { - long longValue = rand.nextLong(); - opt.setLogFileTimeToRoll(longValue); - assert(opt.logFileTimeToRoll() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // KeepLogFileNum test - try { - long longValue = rand.nextLong(); - opt.setKeepLogFileNum(longValue); - assert(opt.keepLogFileNum() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // MaxManifestFileSize test - long longValue = rand.nextLong(); - opt.setMaxManifestFileSize(longValue); - assert(opt.maxManifestFileSize() == longValue); - } - - { // TableCacheNumshardbits test - int intValue = rand.nextInt(); - opt.setTableCacheNumshardbits(intValue); - assert(opt.tableCacheNumshardbits() == intValue); - } - - { // TableCacheRemoveScanCountLimit test - int intValue = rand.nextInt(); - opt.setTableCacheRemoveScanCountLimit(intValue); - assert(opt.tableCacheRemoveScanCountLimit() == intValue); - } - - { // WalTtlSeconds test - long longValue = rand.nextLong(); - opt.setWalTtlSeconds(longValue); - assert(opt.walTtlSeconds() == longValue); - } - - { // ManifestPreallocationSize test - try { - long longValue = rand.nextLong(); - opt.setManifestPreallocationSize(longValue); - 
assert(opt.manifestPreallocationSize() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // AllowOsBuffer test - boolean boolValue = rand.nextBoolean(); - opt.setAllowOsBuffer(boolValue); - assert(opt.allowOsBuffer() == boolValue); - } - - { // AllowMmapReads test - boolean boolValue = rand.nextBoolean(); - opt.setAllowMmapReads(boolValue); - assert(opt.allowMmapReads() == boolValue); - } - - { // AllowMmapWrites test - boolean boolValue = rand.nextBoolean(); - opt.setAllowMmapWrites(boolValue); - assert(opt.allowMmapWrites() == boolValue); - } - - { // IsFdCloseOnExec test - boolean boolValue = rand.nextBoolean(); - opt.setIsFdCloseOnExec(boolValue); - assert(opt.isFdCloseOnExec() == boolValue); - } - - { // SkipLogErrorOnRecovery test - boolean boolValue = rand.nextBoolean(); - opt.setSkipLogErrorOnRecovery(boolValue); - assert(opt.skipLogErrorOnRecovery() == boolValue); - } - - { // StatsDumpPeriodSec test - int intValue = rand.nextInt(); - opt.setStatsDumpPeriodSec(intValue); - assert(opt.statsDumpPeriodSec() == intValue); - } - - { // AdviseRandomOnOpen test - boolean boolValue = rand.nextBoolean(); - opt.setAdviseRandomOnOpen(boolValue); - assert(opt.adviseRandomOnOpen() == boolValue); - } - - { // UseAdaptiveMutex test - boolean boolValue = rand.nextBoolean(); - opt.setUseAdaptiveMutex(boolValue); - assert(opt.useAdaptiveMutex() == boolValue); - } - - { // BytesPerSync test - long longValue = rand.nextLong(); - opt.setBytesPerSync(longValue); - assert(opt.bytesPerSync() == longValue); - } + DBOptionsTest.testDBOptions(opt); { // WriteBufferSize test try { diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 181a8d317..ee0255d80 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -13,7 +13,7 @@ #include "include/org_rocksdb_Options.h" //TODO(fyrz) to be commented in with options refactoring pull requests -//#include "include/org_rocksdb_DBOptions.h" +#include "include/org_rocksdb_DBOptions.h" //#include "include/org_rocksdb_ColumnFamilyOptions.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" @@ -2714,9 +2714,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinPartialMergeOperands( */ void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, jobject jobj) { - // TODO(fyrz) needs to be enabled back when DBOptions are available - // rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); - // rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); + rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); + rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 5a56fe639..03c15cb24 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -126,6 +126,71 @@ class OptionsJni { } }; +class DBOptionsJni { + public: + // Get the java class id of org.rocksdb.DBOptions. 
+ static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/DBOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.DBOptions + // that stores the pointer to rocksdb::DBOptions + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::DBOptions + static rocksdb::DBOptions* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::DBOptions pointer to the java side. + static void setHandle(JNIEnv* env, jobject jobj, rocksdb::DBOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + +class ColumnFamilyOptionsJni { + public: + // Get the java class id of org.rocksdb.ColumnFamilyOptions. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.DBOptions + // that stores the pointer to rocksdb::ColumnFamilyOptions + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::ColumnFamilyOptions + static rocksdb::ColumnFamilyOptions* getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::ColumnFamilyOptions pointer to the java side. + static void setHandle(JNIEnv* env, jobject jobj, + rocksdb::ColumnFamilyOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + class WriteOptionsJni { public: // Get the java class id of org.rocksdb.WriteOptions. From 30ca3752bade53604a316c51ebd1d10fed629bc6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 3 Nov 2014 14:00:45 -0800 Subject: [PATCH 399/829] Revamp our build tools Summary: This diff is revamping our build tools: 1) Use third-party2 instead of third-party 2) consolidate clang and gcc scripts together, lots of duplication there 3) remove hdfs libs, we never compile rocksdb with them clang compilation doesn't work yet. It doesn't work in master either. I plan to fix it soon, but I just spent 2 hours trying to make it work and failed. I'll ask experts. 
Test Plan: compiles with gcc Reviewers: ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28047 --- build_tools/build_detect_platform | 12 +--- build_tools/fbcode.clang31.sh | 74 --------------------- build_tools/fbcode.gcc471.sh | 70 -------------------- build_tools/fbcode.gcc481.sh | 86 ------------------------ build_tools/fbcode_config.sh | 106 ++++++++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 241 deletions(-) delete mode 100644 build_tools/fbcode.clang31.sh delete mode 100644 build_tools/fbcode.gcc471.sh delete mode 100644 build_tools/fbcode.gcc481.sh create mode 100644 build_tools/fbcode_config.sh diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 29d94f01d..ec243f2be 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -48,17 +48,7 @@ COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" - if [ -z "$USE_CLANG" ]; then - CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ - $(rpm -q --whatprovides redhat-release)` - if [ "$CENTOS_VERSION" = "6" ]; then - source "$PWD/build_tools/fbcode.gcc481.sh" - else - source "$PWD/build_tools/fbcode.gcc471.sh" - fi - else - source "$PWD/build_tools/fbcode.clang31.sh" - fi + source "$PWD/build_tools/fbcode_config.sh" fi # Delete existing output, if it exists diff --git a/build_tools/fbcode.clang31.sh b/build_tools/fbcode.clang31.sh deleted file mode 100644 index 25a2ca72f..000000000 --- a/build_tools/fbcode.clang31.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile leveldb using -# fbcode settings. 
It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f -TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1" -TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3 -GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1 - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES" -CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib " -CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include " -CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CXXFLAGS="$CFLAGS -nostdinc++" - -CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" - -EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB 
$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" -EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2" -EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh deleted file mode 100644 index b5d886730..000000000 --- a/build_tools/fbcode.gcc471.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile leveldb using -# fbcode settings. It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f -TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1" -TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3 - -# location of libhdfs libraries -if test "$USE_HDFS"; then - JAVA_HOME="/usr/local/jdk-6u22-64" - JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1" - HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " - HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " - HDFSLIB+=" -ldl -lverify -ljava -ljvm " -fi - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc" -CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" -CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2" - -EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB 
$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" - -VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh deleted file mode 100644 index 386ad509b..000000000 --- a/build_tools/fbcode.gcc481.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile rocksdb using -# fbcode settings. It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc -CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)` -if [ "$CENTOS_VERSION" = "6" ]; then - TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native" -else - TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -fi -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17" - -# location of libhdfs libraries -if test "$USE_HDFS"; then - JAVA_HOME="/usr/local/jdk-6u22-64" - JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17" - HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " - HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " - HDFSLIB+=" -ldl -lverify -ljava -ljvm " -fi - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a" - -LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b -LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" -LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a" - -# location of jemalloc -JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/" -JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a" - -# location of numa -NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc" 
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" -CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" - -EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" - -VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh new file mode 100644 index 000000000..99215108f --- /dev/null +++ b/build_tools/fbcode_config.sh @@ -0,0 +1,106 @@ +#!/bin/sh +# +# Set environment variables so that we can compile rocksdb using +# fbcode settings. It uses the latest g++ compiler and also +# uses jemalloc + +# location of libgcc +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" +LIBGCC_INCLUDE="$LIBGCC_BASE/include" +LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" + +# location of glibc +GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa +GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" +GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" + +# location of snappy headers and libraries +SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include" +SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a" + +# location of zlib headers and libraries +ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include" +ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" + +# location of bzip headers and libraries +BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" + +LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b +LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" +LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" + +# location of gflags headers and libraries +GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" + +# location of jemalloc 
+JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include" +JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a" + +# location of numa +NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 +NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" +NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" + +# location of libunwind +LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc +LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" + +# use Intel SSE support for checksum calculations +export USE_SSE=" -msse -msse4.2 " + +BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" +AR="$BINUTILS/ar" + +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" + +GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc" +STDLIBS="-L $GCC_BASE/lib64" + +if [ -z "$USE_CLANG" ]; then + # gcc + CC="$GCC_BASE/bin/gcc" + CXX="$GCC_BASE/bin/g++" + + CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" + CFLAGS+=" -I $LIBGCC_INCLUDE -I $GLIBC_INCLUDE" + CFLAGS+=" $DEPS_INCLUDE" + CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" + CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" +else + # clang + CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" + CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include" + CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang" + CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++" + + KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" + + CFLAGS="-B$BINUTILS -nostdinc -nostdlib" + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux " + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $CLANG_INCLUDE" + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual" + CFLAGS+=" $DEPS_INCLUDE" + CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" + CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" + CXXFLAGS="$CFLAGS -nostdinc++" +fi + +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" +EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" + +PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" + +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" + +VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0 +VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" + +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE From 636e57b52dc699fee435d627470d757c09f80501 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 3 Nov 2014 14:53:00 -0800 Subject: [PATCH 
400/829] Fix coverage script --- coverage/coverage_test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 08dbd05a5..4d8052c9e 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -11,8 +11,8 @@ fi ROOT=".." # Fetch right version of gcov if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then - source $ROOT/build_tools/fbcode.gcc471.sh - GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov + source $ROOT/build_tools/fbcode_config.sh + GCOV=$GCC_BASE/bin/gcov else GCOV=$(which gcov) fi From 09899f0b51977366b92dc2e71cd77aa5fa6e9836 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 3 Nov 2014 14:11:33 -0800 Subject: [PATCH 401/829] DB::Open() to automatically increase thread pool size if it is smaller than max number of parallel compactions or flushes Summary: With the patch, thread pool size will be automatically increased if DB's options ask for more parallelism of compactions or flushes. Too many users have been confused by the API. Change it to make it harder for users to make mistakes Test Plan: Add two unit tests to cover the function. Reviewers: yhchiang, rven, igor, MarkCallaghan, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27555 --- HISTORY.md | 2 +- db/db_impl.cc | 4 ++++ db/db_test.cc | 41 +++++++++++++++++++++++++++++++++++++++++ hdfs/env_hdfs.h | 6 +++++- include/rocksdb/env.h | 10 ++++++++++ util/env_posix.cc | 19 +++++++++++++++++-- util/env_test.cc | 38 ++++++++++++++++++++++++++++++++++++-- 7 files changed, 114 insertions(+), 6 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b72bce080..7c0b5a9b8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,7 +5,7 @@ ### Public API changes * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts - +* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. ## 3.6.0 (10/7/2014) ### Disk format changes diff --git a/db/db_impl.cc b/db/db_impl.cc index 2fbd40637..98caf98b5 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -118,6 +118,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.info_log = nullptr; } } + result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions, + Env::Priority::LOW); + result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes, + Env::Priority::HIGH); if (!result.rate_limiter) { if (result.bytes_per_sync == 0) { diff --git a/db/db_test.cc b/db/db_test.cc index 4807ef121..cd6cd5862 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -412,6 +412,8 @@ class DBTest { mem_env_(!getenv("MEM_ENV") ? nullptr : new MockEnv(Env::Default())), env_(new SpecialEnv(mem_env_ ? 
mem_env_ : Env::Default())) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); dbname_ = test::TmpDir(env_) + "/db_test"; auto options = CurrentOptions(); ASSERT_OK(DestroyDB(dbname_, options)); @@ -8193,6 +8195,45 @@ TEST(DBTest, TableOptionsSanitizeTest) { ASSERT_OK(TryReopen(options)); } +TEST(DBTest, SanitizeNumThreads) { + for (int attempt = 0; attempt < 2; attempt++) { + const size_t kTotalTasks = 8; + SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + + Options options = CurrentOptions(); + if (attempt == 0) { + options.max_background_compactions = 3; + options.max_background_flushes = 2; + } + options.create_if_missing = true; + DestroyAndReopen(options); + + for (size_t i = 0; i < kTotalTasks; i++) { + // Insert 5 tasks to low priority queue and 5 tasks to high priority queue + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_tasks[i], + (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); + } + + // Wait 100 milliseconds for they are scheduled. + env_->SleepForMicroseconds(100000); + + // pool size 3, total task 4. Queue size should be 1. + ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); + // pool size 2, total task 4. Queue size should be 2. + ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ("def", Get("abc")); + Flush(); + ASSERT_EQ("def", Get("abc")); + } +} + TEST(DBTest, DBIteratorBoundTest) { Options options = CurrentOptions(); options.env = env_; diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index 5e7de77d3..82f317f73 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -145,6 +145,10 @@ class HdfsEnv : public Env { posixEnv->SetBackgroundThreads(number, pri); } + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override { + posixEnv->IncBackgroundThreadsIfNeeded(number, pri); + } + virtual std::string TimeToString(uint64_t number) { return posixEnv->TimeToString(number); } @@ -319,7 +323,7 @@ class HdfsEnv : public Env { std::string* outputpath) {return notsup;} virtual void SetBackgroundThreads(int number, Priority pri = LOW) {} - + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) {} virtual std::string TimeToString(uint64_t number) { return "";} }; } diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 70244bb31..e002fede1 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -272,6 +272,11 @@ class Env { // default number: 1 virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + // Enlarge number of background worker threads of a specific thread pool + // for this environment if it is smaller than specified. 'LOW' is the default + // pool. + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; + // Lower IO priority for threads from the specified pool. 
virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {} @@ -782,6 +787,11 @@ class EnvWrapper : public Env { void SetBackgroundThreads(int num, Priority pri) { return target_->SetBackgroundThreads(num, pri); } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) { + return target_->IncBackgroundThreadsIfNeeded(num, pri); + } + void LowerThreadPoolIOPriority(Priority pool = LOW) override { target_->LowerThreadPoolIOPriority(pool); } diff --git a/util/env_posix.cc b/util/env_posix.cc index e44ebc83e..b9987088c 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1422,6 +1422,12 @@ class PosixEnv : public Env { thread_pools_[pri].SetBackgroundThreads(num); } + // Allow increasing the number of worker threads. + virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); + } + virtual void LowerThreadPoolIOPriority(Priority pool = LOW) override { assert(pool >= Priority::LOW && pool <= Priority::HIGH); #ifdef OS_LINUX @@ -1642,13 +1648,14 @@ class PosixEnv : public Env { PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_)); } - void SetBackgroundThreads(int num) { + void SetBackgroundThreadsInternal(int num, bool allow_reduce) { PthreadCall("lock", pthread_mutex_lock(&mu_)); if (exit_all_threads_) { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); return; } - if (num != total_threads_limit_) { + if (num > total_threads_limit_ || + (num < total_threads_limit_ && allow_reduce)) { total_threads_limit_ = num; WakeUpAllThreads(); StartBGThreads(); @@ -1657,6 +1664,14 @@ class PosixEnv : public Env { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + void IncBackgroundThreadsIfNeeded(int num) { + SetBackgroundThreadsInternal(num, false); + } + + void SetBackgroundThreads(int num) { + SetBackgroundThreadsInternal(num, true); + } + void StartBGThreads() { // Start background thread if necessary while ((int)bgthreads_.size() < total_threads_limit_) { diff --git a/util/env_test.cc b/util/env_test.cc index 3bb4fb68c..54e52069a 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -144,7 +144,7 @@ TEST(EnvPosixTest, TwoPools) { std::cout << "Pool " << pool_name_ << ": " << num_running_ << " running threads.\n"; // make sure we don't have more than pool_size_ jobs running. - ASSERT_LE(num_running_, pool_size_); + ASSERT_LE(num_running_, pool_size_.load()); } // sleep for 1 sec @@ -162,11 +162,16 @@ TEST(EnvPosixTest, TwoPools) { return num_finished_; } + void Reset(int pool_size) { + pool_size_.store(pool_size); + num_finished_ = 0; + } + private: port::Mutex mu_; int num_running_; int num_finished_; - int pool_size_; + std::atomic pool_size_; std::string pool_name_; }; @@ -205,6 +210,35 @@ TEST(EnvPosixTest, TwoPools) { ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // call IncBackgroundThreadsIfNeeded to two pools. One increasing and + // the other decreasing + env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW); + env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH); + high_pool_job.Reset(kHighPoolSize + 1); + low_pool_job.Reset(kLowPoolSize); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + // Wait a short while for the jobs to be dispatched. 
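// Illustrative usage sketch, not part of this patch: with this change a caller
// only sets max_background_compactions / max_background_flushes, and DB::Open()
// (via SanitizeOptions()) calls Env::IncBackgroundThreadsIfNeeded(), growing the
// LOW/HIGH pools from their default of one thread each. Assumes a writable path
// "/tmp/rocksdb_pool_demo"; error handling kept minimal.
#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.max_background_compactions = 4;  // wants a LOW pool of >= 4 threads
  options.max_background_flushes = 2;      // wants a HIGH pool of >= 2 threads

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_pool_demo", &db);
  assert(s.ok());

  // The pools were enlarged as a side effect of Open(). The call never shrinks
  // a pool, so an earlier, larger SetBackgroundThreads() setting is preserved;
  // shrinking still requires an explicit SetBackgroundThreads().
  options.env->IncBackgroundThreadsIfNeeded(
      2, rocksdb::Env::Priority::LOW);  // no-op: LOW pool already has 4 threads

  delete db;
  return 0;
}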
+ Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen()); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)), + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } + + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); } TEST(EnvPosixTest, DecreaseNumBgThreads) { From ac6afaf9ef5425a081216e9f74407e2aefd1f3c7 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 3 Nov 2014 17:45:55 -0800 Subject: [PATCH 402/829] Enforce naming convention of getters in version_set.h Summary: Enforce the accessier naming convention in functions in version_set.h Test Plan: make all check Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28143 --- db/column_family.cc | 4 +- db/compaction_picker.cc | 10 +- db/compaction_picker_test.cc | 4 +- db/db_filesnapshot.cc | 4 +- db/db_impl.cc | 14 +-- db/db_impl_debug.cc | 2 +- db/db_test.cc | 2 +- db/forward_iterator.cc | 8 +- db/internal_stats.cc | 2 +- db/version_builder.cc | 103 ++++++++++---------- db/version_builder_test.cc | 6 +- db/version_set.cc | 54 +++++----- db/version_set.h | 20 ++-- utilities/compacted_db/compacted_db_impl.cc | 6 +- 14 files changed, 117 insertions(+), 122 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index c5c4e35e5..9e74df583 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -324,8 +324,8 @@ void ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { auto* vstorage = current_->storage_info(); - const double score = vstorage->MaxCompactionScore(); - const int max_level = vstorage->MaxCompactionScoreLevel(); + const double score = vstorage->max_compaction_score(); + const int max_level = vstorage->max_compaction_score_level(); auto write_controller = column_family_set_->write_controller_; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 096f0d77d..e2694bcd0 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -328,7 +328,7 @@ Compaction* CompactionPicker::CompactRange( } assert(output_path_id < static_cast(ioptions_.db_paths.size())); Compaction* c = new Compaction( - vstorage->NumberLevels(), input_level, output_level, + vstorage->num_levels(), input_level, output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), mutable_cf_options.MaxGrandParentOverlapBytes(input_level), output_path_id, GetCompressionType(ioptions_, output_level)); @@ -457,7 +457,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( assert(level >= 0); assert(level + 1 < NumberLevels()); - c = new Compaction(vstorage->NumberLevels(), level, level + 1, + c = new Compaction(vstorage->num_levels(), level, level + 1, mutable_cf_options.MaxFileSizeForLevel(level + 1), mutable_cf_options.MaxGrandParentOverlapBytes(level), 0, GetCompressionType(ioptions_, level + 1)); @@ -778,7 +778,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( uint32_t path_id = GetPathId(ioptions_, estimated_total_size); Compaction* c = new Compaction( - vstorage->NumberLevels(), kLevel0, kLevel0, + vstorage->num_levels(), kLevel0, kLevel0, 
mutable_cf_options.MaxFileSizeForLevel(kLevel0), LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel0, enable_compression)); c->score_ = score; @@ -898,7 +898,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( // create a compaction request // We always compact all the files, so always compress. Compaction* c = - new Compaction(vstorage->NumberLevels(), kLevel, kLevel, + new Compaction(vstorage->num_levels(), kLevel, kLevel, mutable_cf_options.MaxFileSizeForLevel(kLevel), LLONG_MAX, path_id, GetCompressionType(ioptions_, kLevel)); c->score_ = score; @@ -918,7 +918,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - assert(vstorage->NumberLevels() == 1); + assert(vstorage->num_levels() == 1); const int kLevel0 = 0; const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = 0; diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index f094fbafb..a041b20c4 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -50,7 +50,7 @@ class CompactionPickerTest { } ~CompactionPickerTest() { - for (int i = 0; i < vstorage.NumberLevels(); i++) { + for (int i = 0; i < vstorage.num_levels(); i++) { for (auto* f : vstorage.LevelFiles(i)) { delete f; } @@ -61,7 +61,7 @@ class CompactionPickerTest { const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { - assert(level < vstorage.NumberLevels()); + assert(level < vstorage.num_levels()); FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 48819e766..dcf54c8c6 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -122,10 +122,10 @@ Status DBImpl::GetLiveFiles(std::vector& ret, } ret.push_back(CurrentFileName("")); - ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber())); + ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); // find length of manifest file while holding the mutex lock - *manifest_file_size = versions_->ManifestFileSize(); + *manifest_file_size = versions_->manifest_file_size(); mutex_.Unlock(); return Status::OK(); diff --git a/db/db_impl.cc b/db/db_impl.cc index 98caf98b5..6e51c483d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -425,11 +425,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, versions_->GetObsoleteFiles(&job_context->sst_delete_files); // store the current filenum, lognum, etc - job_context->manifest_file_number = versions_->ManifestFileNumber(); + job_context->manifest_file_number = versions_->manifest_file_number(); job_context->pending_manifest_file_number = - versions_->PendingManifestFileNumber(); + versions_->pending_manifest_file_number(); job_context->log_number = versions_->MinLogNumber(); - job_context->prev_log_number = versions_->PrevLogNumber(); + job_context->prev_log_number = versions_->prev_log_number(); if (!doing_the_full_scan && !job_context->HaveSomethingToDelete()) { // avoid filling up sst_live if we're sure that we @@ -730,11 +730,11 @@ Status DBImpl::Recover( // descriptor (new log files may have been added by the previous // incarnation without registering them in the descriptor). 
// - // Note that PrevLogNumber() is no longer used, but we pay + // Note that prev_log_number() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. const uint64_t min_log = versions_->MinLogNumber(); - const uint64_t prev_log = versions_->PrevLogNumber(); + const uint64_t prev_log = versions_->prev_log_number(); std::vector filenames; s = env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { @@ -2729,7 +2729,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, // Attempt to switch to a new memtable and trigger flush of old. // Do this without holding the dbmutex lock. - assert(versions_->PrevLogNumber() == 0); + assert(versions_->prev_log_number() == 0); bool creating_new_log = !log_empty_; uint64_t new_log_number = creating_new_log ? versions_->NewFileNumber() : logfile_number_; @@ -3269,7 +3269,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { auto* vstorage = cfd->current()->storage_info(); - for (int i = 1; i < vstorage->NumberLevels(); ++i) { + for (int i = 1; i < vstorage->num_levels(); ++i) { int num_files = vstorage->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument( diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 283f9393f..ea8f5e13b 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -67,7 +67,7 @@ void DBImpl::TEST_GetFilesMetaData( } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { - return versions_->ManifestFileNumber(); + return versions_->manifest_file_number(); } Status DBImpl::TEST_CompactRange(int level, const Slice* begin, diff --git a/db/db_test.cc b/db/db_test.cc index cd6cd5862..24bad640b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5776,7 +5776,7 @@ TEST(DBTest, DropWrites) { env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); for (int i = 0; i < 5; i++) { - for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { + for (int level = 0; level < dbfull()->NumberLevels() - 1; level++) { dbfull()->TEST_CompactRange(level, nullptr, nullptr); } } diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 154af1147..635678160 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -249,7 +249,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, int32_t search_left_bound = 0; int32_t search_right_bound = FileIndexer::kLevelMaxIndex; - for (int32_t level = 1; level < vstorage->NumberLevels(); ++level) { + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { const std::vector& level_files = vstorage->LevelFiles(level); if (level_files.empty()) { @@ -259,7 +259,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } assert(level_iters_[level - 1] != nullptr); uint32_t f_idx = 0; - const auto& indexer = vstorage->GetIndexer(); + const auto& indexer = vstorage->file_indexer(); if (!seek_to_first) { if (search_left_bound == search_right_bound) { f_idx = search_left_bound; @@ -437,8 +437,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd)); } - level_iters_.reserve(vstorage->NumberLevels() - 1); - for (int32_t level = 1; level < vstorage->NumberLevels(); ++level) { + level_iters_.reserve(vstorage->num_levels() - 1); + for (int32_t 
level = 1; level < vstorage->num_levels(); ++level) { const auto& level_files = vstorage->LevelFiles(level); if (level_files.empty()) { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index a59da4317..cda75e0c8 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -371,7 +371,7 @@ void InternalStats::DumpCFStats(std::string* value) { int num_levels_to_check = (cfd_->options()->compaction_style != kCompactionStyleUniversal && cfd_->options()->compaction_style != kCompactionStyleFIFO) - ? vstorage->NumberLevels() - 1 + ? vstorage->num_levels() - 1 : 1; // Compaction scores are sorted base on its value. Restore them to the diff --git a/db/version_builder.cc b/db/version_builder.cc index 61205704f..cf2d21ea8 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -54,10 +54,7 @@ class VersionBuilder::Rep { // kLevel0 -- NewestFirstBySeqNo // kLevelNon0 -- BySmallestKey struct FileComparator { - enum SortMethod { - kLevel0 = 0, - kLevelNon0 = 1, - } sort_method; + enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; const InternalKeyComparator* internal_comparator; bool operator()(FileMetaData* f1, FileMetaData* f2) const { @@ -91,25 +88,25 @@ class VersionBuilder::Rep { : env_options_(env_options), table_cache_(table_cache), base_vstorage_(base_vstorage) { - levels_ = new LevelState[base_vstorage_->NumberLevels()]; + levels_ = new LevelState[base_vstorage_->num_levels()]; level_zero_cmp_.sort_method = FileComparator::kLevel0; level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; level_nonzero_cmp_.internal_comparator = base_vstorage_->InternalComparator(); levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_vstorage_->NumberLevels(); level++) { - levels_[level].added_files = new FileSet(level_nonzero_cmp_); + for (int level = 1; level < base_vstorage_->num_levels(); level++) { + levels_[level].added_files = new FileSet(level_nonzero_cmp_); } } ~Rep() { - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + for (int level = 0; level < base_vstorage_->num_levels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); - for (FileSet::const_iterator it = added->begin(); - it != added->end(); ++it) { + for (FileSet::const_iterator it = added->begin(); it != added->end(); + ++it) { to_unref.push_back(*it); } delete added; @@ -133,7 +130,7 @@ class VersionBuilder::Rep { void CheckConsistency(VersionStorageInfo* vstorage) { #ifndef NDEBUG // make sure the files are sorted correctly - for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (int level = 0; level < vstorage->num_levels(); level++) { auto& level_files = vstorage->LevelFiles(level); for (size_t i = 1; i < level_files.size(); i++) { auto f1 = level_files[i - 1]; @@ -161,51 +158,50 @@ class VersionBuilder::Rep { void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, int level) { #ifndef NDEBUG - // a file to be deleted better exist in the previous version - bool found = false; - for (int l = 0; !found && l < base_vstorage_->NumberLevels(); l++) { - const std::vector& base_files = - base_vstorage_->LevelFiles(l); - for (unsigned int i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) { + const std::vector& base_files 
= + base_vstorage_->LevelFiles(l); + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->fd.GetNumber() == number) { + found = true; + break; } } - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - for (int l = level + 1; !found && l < base_vstorage_->NumberLevels(); - l++) { - const FileSet* added = levels_[l].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) { + const FileSet* added = levels_[l].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->fd.GetNumber() == number) { + found = true; + break; } } + } - // maybe this file was added in a previous edit that was Applied - if (!found) { - const FileSet* added = levels_[level].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + // maybe this file was added in a previous edit that was Applied + if (!found) { + const FileSet* added = levels_[level].added_files; + for (FileSet::const_iterator added_iter = added->begin(); + added_iter != added->end(); ++added_iter) { + FileMetaData* f = *added_iter; + if (f->fd.GetNumber() == number) { + found = true; + break; } } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - } - assert(found); + } + if (!found) { + fprintf(stderr, "not found %" PRIu64 "\n", number); + } + assert(found); #endif } @@ -238,7 +234,7 @@ class VersionBuilder::Rep { CheckConsistency(base_vstorage_); CheckConsistency(vstorage); - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + for (int level = 0; level < base_vstorage_->num_levels(); level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
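The next hunk only re-wraps the merge loop in SaveTo(); its logic is unchanged. For readers skimming the re-indentation, here is a minimal standalone sketch of what that loop computes, under deliberate simplifications (plain ints stand in for FileMetaData and a set of numbers stands in for deleted_files; this is not code from the patch):

#include <algorithm>
#include <set>
#include <vector>

// Merge a sorted base level with sorted newly-added files, dropping anything
// marked deleted -- the same shape as VersionBuilder::Rep::SaveTo() plus
// MaybeAddFile().
std::vector<int> MergeLevel(const std::vector<int>& base,   // sorted
                            const std::vector<int>& added,  // sorted
                            const std::set<int>& deleted) {
  std::vector<int> result;
  auto base_iter = base.begin();
  auto emit = [&](int f) {
    if (deleted.count(f) == 0) {  // MaybeAddFile(): skip deleted files
      result.push_back(f);
    }
  };
  for (int a : added) {
    // Add all smaller files listed in base_
    for (auto bpos = std::upper_bound(base_iter, base.end(), a);
         base_iter != bpos; ++base_iter) {
      emit(*base_iter);
    }
    emit(a);  // then the added file itself
  }
  for (; base_iter != base.end(); ++base_iter) {
    emit(*base_iter);  // add remaining base files
  }
  return result;
}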
@@ -251,8 +247,7 @@ class VersionBuilder::Rep { for (const auto& added : added_files) { // Add all smaller files listed in base_ for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; - ++base_iter) { + base_iter != bpos; ++base_iter) { MaybeAddFile(vstorage, level, *base_iter); } @@ -270,7 +265,7 @@ class VersionBuilder::Rep { void LoadTableHandlers() { assert(table_cache_ != nullptr); - for (int level = 0; level < base_vstorage_->NumberLevels(); level++) { + for (int level = 0; level < base_vstorage_->num_levels(); level++) { for (auto& file_meta : *(levels_[level].added_files)) { assert(!file_meta->table_reader_handle); table_cache_->FindTable( @@ -280,10 +275,10 @@ class VersionBuilder::Rep { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( file_meta->table_reader_handle); + } } } } - } void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index e11f78eb1..fcf32ce60 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -37,7 +37,7 @@ class VersionBuilderTest { } ~VersionBuilderTest() { - for (int i = 0; i < vstorage.NumberLevels(); i++) { + for (int i = 0; i < vstorage.num_levels(); i++) { for (auto* f : vstorage.LevelFiles(i)) { if (--f->refs == 0) { delete f; @@ -55,7 +55,7 @@ class VersionBuilderTest { const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { - assert(level < vstorage.NumberLevels()); + assert(level < vstorage.num_levels()); FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = GetInternalKey(smallest, smallest_seq); @@ -109,7 +109,7 @@ TEST(VersionBuilderTest, ApplyAndSaveTo) { ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); - for (int i = 0; i < new_vstorage.NumberLevels(); i++) { + for (int i = 0; i < new_vstorage.num_levels(); i++) { for (auto* f : new_vstorage.LevelFiles(i)) { if (--f->refs == 0) { delete f; diff --git a/db/version_set.cc b/db/version_set.cc index 4b068297f..c4e20140f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -499,7 +499,7 @@ class BaseReferencedVersionBuilder { public: explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) : version_builder_(new VersionBuilder( - cfd->current()->version_set()->GetEnvOptions(), cfd->table_cache(), + cfd->current()->version_set()->env_options(), cfd->table_cache(), cfd->current()->storage_info())), version_(cfd->current()) { version_->Ref(); @@ -508,7 +508,7 @@ class BaseReferencedVersionBuilder { delete version_builder_; version_->Unref(); } - VersionBuilder* GetVersionBuilder() { return version_builder_; } + VersionBuilder* version_builder() { return version_builder_; } private: VersionBuilder* version_builder_; @@ -635,7 +635,7 @@ void Version::AddIterators(const ReadOptions& read_options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. 
- for (int level = 1; level < storage_info_.NumberLevels(); level++) { + for (int level = 1; level < storage_info_.num_levels(); level++) { if (storage_info_.level_files_brief_[level].num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( @@ -892,7 +892,7 @@ void VersionStorageInfo::ComputeCompensatedSizes() { int VersionStorageInfo::MaxInputLevel() const { if (compaction_style_ == kCompactionStyleLevel) { - return NumberLevels() - 2; + return num_levels() - 2; } return 0; } @@ -960,8 +960,8 @@ void VersionStorageInfo::ComputeCompactionScore( // sort all the levels based on their score. Higher scores get listed // first. Use bubble sort because the number of entries are small. - for (int i = 0; i < NumberLevels() - 2; i++) { - for (int j = i + 1; j < NumberLevels() - 1; j++) { + for (int i = 0; i < num_levels() - 2; i++) { + for (int j = i + 1; j < num_levels() - 1; j++) { if (compaction_score_[i] < compaction_score_[j]) { double score = compaction_score_[i]; int level = compaction_level_[i]; @@ -992,7 +992,7 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } // anonymous namespace void VersionStorageInfo::MaybeAddFile(int level, FileMetaData* f) { - assert(level < NumberLevels()); + assert(level < num_levels()); auto* level_files = &files_[level]; // Must not overlap assert(level <= 0 || level_files->empty() || @@ -1021,7 +1021,7 @@ void VersionStorageInfo::UpdateFilesBySize() { return; } // No need to sort the highest level because it is never compacted. - for (int level = 0; level < NumberLevels() - 1; level++) { + for (int level = 0; level < num_levels() - 1; level++) { const std::vector& files = files_[level]; auto& files_by_size = files_by_size_[level]; assert(files_by_size.size() == 0); @@ -1335,14 +1335,14 @@ bool VersionStorageInfo::HasOverlappingUserKey( uint64_t VersionStorageInfo::NumLevelBytes(int level) const { assert(level >= 0); - assert(level < NumberLevels()); + assert(level < num_levels()); return TotalFileSize(files_[level]); } const char* VersionStorageInfo::LevelSummary( LevelSummaryStorage* scratch) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < num_levels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); if (ret < 0 || ret >= sz) break; @@ -1382,7 +1382,7 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { + for (int level = 1; level < num_levels() - 1; level++) { for (const auto& f : files_[level]) { GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); const uint64_t sum = TotalFileSize(overlaps); @@ -1395,7 +1395,7 @@ int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { } void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < storage_info_.NumberLevels(); level++) { + for (int level = 0; level < storage_info_.num_levels(); level++) { const std::vector& files = storage_info_.files_[level]; for (const auto& file : files) { live->push_back(file->fd); @@ -1544,7 +1544,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } else { v = new Version(column_family_data, this, current_version_number_++); builder_guard.reset(new 
BaseReferencedVersionBuilder(column_family_data)); - auto* builder = builder_guard->GetVersionBuilder(); + auto* builder = builder_guard->version_builder(); for (const auto& writer : manifest_writers_) { if (writer->edit->IsColumnFamilyManipulation() || writer->cfd->GetID() != column_family_data->GetID()) { @@ -1586,7 +1586,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, { std::vector size_being_compacted; if (!edit->IsColumnFamilyManipulation()) { - size_being_compacted.resize(v->storage_info()->NumberLevels() - 1); + size_being_compacted.resize(v->storage_info()->num_levels() - 1); // calculate the amount of data being compacted at every level column_family_data->compaction_picker()->SizeBeingCompacted( size_being_compacted); @@ -1598,7 +1598,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. - builder_guard->GetVersionBuilder()->LoadTableHandlers(); + builder_guard->version_builder()->LoadTableHandlers(); } // This is fine because everything inside of this block is serialized -- @@ -1947,7 +1947,7 @@ Status VersionSet::Recover( cfd = column_family_set_->GetColumnFamily(edit.column_family_); // this should never happen since cf_in_builders is true assert(cfd != nullptr); - if (edit.max_level_ >= cfd->current()->storage_info()->NumberLevels()) { + if (edit.max_level_ >= cfd->current()->storage_info()->num_levels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -1958,7 +1958,7 @@ Status VersionSet::Recover( // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->GetVersionBuilder()->Apply(&edit); + builder->second->version_builder()->Apply(&edit); } if (cfd != nullptr) { @@ -2038,7 +2038,7 @@ Status VersionSet::Recover( for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->GetVersionBuilder(); + auto* builder = builders_iter->second->version_builder(); if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. 
@@ -2051,7 +2051,7 @@ Status VersionSet::Recover( // Install recovered version std::vector size_being_compacted( - v->storage_info()->NumberLevels() - 1); + v->storage_info()->num_levels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); AppendVersion(cfd, v); @@ -2184,7 +2184,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Version* current_version = versions.GetColumnFamilySet()->GetDefault()->current(); auto* vstorage = current_version->storage_info(); - int current_levels = vstorage->NumberLevels(); + int current_levels = vstorage->num_levels(); if (current_levels <= new_levels) { return Status::OK(); @@ -2335,7 +2335,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->GetVersionBuilder()->Apply(&edit); + builder->second->version_builder()->Apply(&edit); } if (cfd != nullptr && edit.has_log_number_) { @@ -2382,12 +2382,12 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second->GetVersionBuilder(); + auto builder = builders_iter->second->version_builder(); Version* v = new Version(cfd, this, current_version_number_++); builder->SaveTo(v->storage_info()); std::vector size_being_compacted( - v->storage_info()->NumberLevels() - 1); + v->storage_info()->num_levels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); delete builder; @@ -2521,7 +2521,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (int level = 0; level < vstorage->num_levels(); level++) { const std::vector& files = vstorage->LevelFiles(level); for (size_t i = 0; i < files.size(); i++) { if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= @@ -2562,7 +2562,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (int level = 0; level < vstorage->num_levels(); level++) { total_files += vstorage->LevelFiles(level).size(); } } @@ -2576,7 +2576,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (int level = 0; level < vstorage->num_levels(); level++) { for (const auto& f : vstorage->LevelFiles(level)) { live_list->push_back(f->fd); } @@ -2686,7 +2686,7 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); const auto* vstorage = version->storage_info(); - for (int level = 0; level < vstorage->NumberLevels(); level++) { + for (int level = 0; level < vstorage->num_levels(); level++) { for (const auto& file : vstorage->LevelFiles(level)) { if 
(file->fd.GetNumber() == number) { *meta = file; diff --git a/db/version_set.h b/db/version_set.h index ae3d53cd2..0ae6f1cfd 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -132,10 +132,10 @@ class VersionStorageInfo { bool NeedsCompaction() const; // Returns the maxmimum compaction score for levels 1 to max - double MaxCompactionScore() const { return max_compaction_score_; } + double max_compaction_score() const { return max_compaction_score_; } // See field declaration - int MaxCompactionScoreLevel() const { return max_compaction_score_level_; } + int max_compaction_score_level() const { return max_compaction_score_level_; } // Return level number that has idx'th highest score int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } @@ -183,10 +183,10 @@ class VersionStorageInfo { const Slice& smallest_user_key, const Slice& largest_user_key); - int NumberLevels() const { return num_levels_; } + int num_levels() const { return num_levels_; } // REQUIRES: This version has been saved (see VersionSet::SaveTo) - int NumNonEmptyLevels() const { + int num_non_empty_levels() const { assert(finalized_); return num_non_empty_levels_; } @@ -228,7 +228,7 @@ class VersionStorageInfo { } // REQUIRES: This version has been saved (see VersionSet::SaveTo) - const FileIndexer& GetIndexer() const { + const FileIndexer& file_indexer() const { assert(finalized_); return file_indexer_; } @@ -524,9 +524,9 @@ class VersionSet { #endif // ROCKSDB_LITE // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } + uint64_t manifest_file_number() const { return manifest_file_number_; } - uint64_t PendingManifestFileNumber() const { + uint64_t pending_manifest_file_number() const { return pending_manifest_file_number_; } @@ -558,7 +558,7 @@ class VersionSet { // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } + uint64_t prev_log_number() const { return prev_log_number_; } // Returns the minimum log number such that all // log numbers less than or equal to it can be deleted @@ -584,7 +584,7 @@ class VersionSet { uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); // Return the size of the current manifest file - uint64_t ManifestFileSize() const { return manifest_file_size_; } + uint64_t manifest_file_size() const { return manifest_file_size_; } // verify that the files that we started with for a compaction // still exist in the current version and in the same original level. 
@@ -600,7 +600,7 @@ class VersionSet { void GetObsoleteFiles(std::vector* files); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } - const EnvOptions& GetEnvOptions() { return env_options_; } + const EnvOptions& env_options() { return env_options_; } private: struct ManifestWriter; diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 3a417de2b..335dae77b 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -111,20 +111,20 @@ Status CompactedDBImpl::Init(const Options& options) { return Status::NotSupported("L0 contain more than 1 file"); } if (l0.num_files == 1) { - if (vstorage->NumNonEmptyLevels() > 1) { + if (vstorage->num_non_empty_levels() > 1) { return Status::NotSupported("Both L0 and other level contain files"); } files_ = l0; return Status::OK(); } - for (int i = 1; i < vstorage->NumNonEmptyLevels() - 1; ++i) { + for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { if (vstorage->LevelFilesBrief(i).num_files > 0) { return Status::NotSupported("Other levels also contain files"); } } - int level = vstorage->NumNonEmptyLevels() - 1; + int level = vstorage->num_non_empty_levels() - 1; if (vstorage->LevelFilesBrief(level).num_files > 0) { files_ = vstorage->LevelFilesBrief(level); return Status::OK(); From 469d474ba0424d18da77135bdf60ee789193890c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 4 Nov 2014 10:28:08 -0800 Subject: [PATCH 403/829] Apply InfoLogLevel to the logs in db/db_impl.cc Summary: Apply InfoLogLevel to the logs in db/db_impl.cc Test Plan: db_test db_bench Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: leveldb, MarkCallaghan, dhruba Differential Revision: https://reviews.facebook.net/D28233 --- db/db_impl.cc | 127 +++++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 47 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 6e51c483d..4946c007f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -303,7 +303,8 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(db_options_.info_log, "Creating manifest 1 \n"); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( @@ -331,7 +332,8 @@ void DBImpl::MaybeIgnoreError(Status* s) const { if (s->ok() || db_options_.paranoid_checks) { // No change needed } else { - Log(db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(InfoLogLevel::WARN_LEVEL, + db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } @@ -347,7 +349,7 @@ const Status DBImpl::CreateArchivalDirectory() { void DBImpl::PrintStatistics() { auto dbstats = db_options_.statistics.get(); if (dbstats) { - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } @@ -383,8 +385,10 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(db_options_.info_log, "------- DUMPING STATS -------"); - Log(db_options_.info_log, "%s", stats.c_str()); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "------- DUMPING STATS -------"); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "%s", stats.c_str()); PrintStatistics(); } @@ -604,7 +608,8 @@ void DBImpl::PurgeObsoleteFiles(const 
JobContext& state) { #ifdef ROCKSDB_LITE Status s = env_->DeleteFile(fname); - Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); #else // not ROCKSDB_LITE if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 || @@ -612,7 +617,8 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { wal_manager_.ArchiveWALFile(fname, number); } else { Status s = env_->DeleteFile(fname); - Log(db_options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } #endif // ROCKSDB_LITE @@ -627,11 +633,13 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { std::string& to_delete = old_info_log_files.at(i); std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? dbname_ : db_options_.db_log_dir) + "/" + to_delete; - Log(db_options_.info_log, "Delete info log file %s\n", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(db_options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(InfoLogLevel::ERROR_LEVEL, + db_options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } @@ -798,7 +806,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, Status* status; // nullptr if db_options_.paranoid_checks==false or // db_options_.skip_log_error_on_recovery==true virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", + Log(InfoLogLevel::WARN_LEVEL, + info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); if (this->status != nullptr && this->status->ok()) *this->status = s; @@ -850,7 +859,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // large sequence numbers). 
log::Reader reader(std::move(file), &reporter, true /*checksum*/, 0 /*initial_offset*/); - Log(db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); // Read all the records and add to a memtable std::string scratch; @@ -981,7 +991,9 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mem->GetFirstSequenceNumber(); - Log(db_options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": started", cfd->GetName().c_str(), meta.fd.GetNumber()); { @@ -995,8 +1007,9 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, mutex_.Lock(); } - Log(db_options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); } @@ -1133,7 +1146,8 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { auto* cfd = reinterpret_cast(column_family)->cfd(); if (options_map.empty()) { - Log(db_options_.info_log, "SetOptions() on column family [%s], empty input", + Log(InfoLogLevel::WARN_LEVEL, + db_options_.info_log, "SetOptions() on column family [%s], empty input", cfd->GetName().c_str()); return false; } @@ -1148,18 +1162,21 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, } } - Log(db_options_.info_log, "SetOptions() on column family [%s], inputs:", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); for (const auto& o : options_map) { - Log(db_options_.info_log, "%s: %s\n", o.first.c_str(), o.second.c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "%s: %s\n", o.first.c_str(), o.second.c_str()); } if (succeed) { - Log(db_options_.info_log, "[%s] SetOptions succeeded", + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "[%s] SetOptions succeeded", cfd->GetName().c_str()); new_options.Dump(db_options_.info_log.get()); } else { - Log(db_options_.info_log, "[%s] SetOptions failed", - cfd->GetName().c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "[%s] SetOptions failed", cfd->GetName().c_str()); } return succeed; } @@ -1195,7 +1212,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(db_options_.info_log, "ReFitLevel: another thread is refitting"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[ReFitLevel] another thread is refitting"); delete new_superversion; return Status::NotSupported("another thread is refitting"); } @@ -1204,8 +1222,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(db_options_.info_log, - "RefitLevel: waiting for background threads to stop: %d %d", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[RefitLevel] waiting for background threads to stop: %d %d", 
bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); } @@ -1222,7 +1240,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status; if (to_level < level) { - Log(db_options_.info_log, "[%s] Before refitting:\n%s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Before refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; @@ -1233,7 +1252,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(db_options_.info_log, "[%s] Apply version edit:\n%s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); status = versions_->LogAndApply(cfd, @@ -1242,11 +1262,13 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { cfd, new_superversion, mutable_cf_options); new_superversion = nullptr; - Log(db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(db_options_.info_log, "[%s] After refitting:\n%s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1340,14 +1362,15 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(db_options_.info_log, "[%s] Manual compaction starting", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Manual compaction starting", cfd->GetName().c_str()); // We don't check bg_error_ here, because if we get the error in compaction, @@ -1539,7 +1562,7 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(db_options_.info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); @@ -1607,7 +1630,7 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(db_options_.info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); @@ -2223,10 +2246,12 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, assert(cfd != nullptr); delete InstallSuperVersion(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(db_options_.info_log, "Created column family [%s] (ID %u)", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Created column family [%s] (ID %u)", column_family_name.c_str(), (unsigned)cfd->GetID()); } else { - Log(db_options_.info_log, "Creating column family [%s] FAILED -- %s", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Creating column family 
[%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } return s; @@ -2265,10 +2290,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * mutable_cf_options->max_write_buffer_number; - Log(db_options_.info_log, "Dropped column family with id %u\n", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Dropped column family with id %u\n", cfd->GetID()); } else { - Log(db_options_.info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } @@ -2530,7 +2556,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { total_log_size_ > max_total_wal_size) { uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); @@ -2757,7 +2783,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, new_superversion = new SuperVersion(); } } - Log(db_options_.info_log, + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 "\n", cfd->GetName().c_str(), new_log_number); mutex_.Lock(); @@ -2960,7 +2986,8 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -2968,13 +2995,15 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(db_options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - Log(db_options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -2989,15 +3018,15 @@ Status DBImpl::DeleteFile(std::string name) { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { - Log(db_options_.info_log, "DeleteFile %s failed. File not found\n", - name.c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "DeleteFile %s failed. File not found\n", name.c_str()); return Status::InvalidArgument("File not found"); } assert(level < cfd->NumberLevels()); // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(db_options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "DeleteFile %s Skipped. 
File about to be compacted\n", name.c_str()); return Status::OK(); } @@ -3008,7 +3037,7 @@ Status DBImpl::DeleteFile(std::string name) { auto* vstoreage = cfd->current()->storage_info(); for (int i = level + 1; i < cfd->NumberLevels(); i++) { if (vstoreage->NumLevelFiles(i) != 0) { - Log(db_options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); } @@ -3016,6 +3045,9 @@ Status DBImpl::DeleteFile(std::string name) { // if level == 0, it has to be the oldest file if (level == 0 && vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "DeleteFile %s failed ---" + " target file in level 0 must be the oldest."); return Status::InvalidArgument("File in level 0, but not oldest"); } edit.SetColumnFamily(cfd->GetID()); @@ -3402,10 +3434,11 @@ void DumpRocksDBBuildVersion(Logger * log) { #if !defined(IOS_CROSS_COMPILE) // if we compile with Xcode, we don't run build_detect_vesion, so we don't // generate util/build_version.cc - Log(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, + Log(InfoLogLevel::INFO_LEVEL, log, + "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH); - Log(log, "Git sha %s", rocksdb_build_git_sha); - Log(log, "Compile time %s %s", + Log(InfoLogLevel::INFO_LEVEL, log, "Git sha %s", rocksdb_build_git_sha); + Log(InfoLogLevel::INFO_LEVEL, log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); #endif } From 2a019f1d03ed5eaf1f90d21c560c1b7b3ed8b4b3 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 4 Nov 2014 10:34:18 -0800 Subject: [PATCH 404/829] Apply InfoLogLevel to the logs in db/wal_manager.cc Summary: Apply InfoLogLevel to the logs in db/wal_manager.cc Test Plan: db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28239 --- db/wal_manager.cc | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 9b86a0f97..0889df301 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -67,7 +67,8 @@ Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { uint64_t latest_archived_log_number = 0; if (!files.empty()) { latest_archived_log_number = files.back()->LogNumber(); - Log(db_options_.info_log, "Latest Archived log: %" PRIu64, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Latest Archived log: %" PRIu64, latest_archived_log_number); } @@ -80,8 +81,8 @@ Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { // same log in both db dir and archived dir. Simply // ignore the one in db dir. Note that, if we read // archived dir first, we would have missed the log file. 
- Log(db_options_.info_log, "%s already moved to archive", - log->PathName().c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "%s already moved to archive", log->PathName().c_str()); } } @@ -130,8 +131,8 @@ void WalManager::PurgeObsoleteWALFiles() { int64_t current_time; Status s = env_->GetCurrentTime(¤t_time); if (!s.ok()) { - Log(db_options_.info_log, "Can't get current time: %s", - s.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Can't get current time: %s", s.ToString().c_str()); assert(false); return; } @@ -150,8 +151,8 @@ void WalManager::PurgeObsoleteWALFiles() { std::vector files; s = env_->GetChildren(archival_dir, &files); if (!s.ok()) { - Log(db_options_.info_log, "Can't get archive files: %s", - s.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Can't get archive files: %s", s.ToString().c_str()); assert(false); return; } @@ -168,14 +169,16 @@ void WalManager::PurgeObsoleteWALFiles() { uint64_t file_m_time; s = env_->GetFileModificationTime(file_path, &file_m_time); if (!s.ok()) { - Log(db_options_.info_log, "Can't get file mod time: %s: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Can't get file mod time: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -190,7 +193,8 @@ void WalManager::PurgeObsoleteWALFiles() { uint64_t file_size; s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { - Log(db_options_.info_log, "Can't get file size: %s: %s", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Unable to get file size: %s: %s", file_path.c_str(), s.ToString().c_str()); return; } else { @@ -200,7 +204,8 @@ void WalManager::PurgeObsoleteWALFiles() { } else { s = env_->DeleteFile(file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -228,7 +233,7 @@ void WalManager::PurgeObsoleteWALFiles() { GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); if (files_del_num > archived_logs.size()) { - Log(db_options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Trying to delete more archived log files than " "exist. 
Deleting all"); files_del_num = archived_logs.size(); @@ -238,7 +243,8 @@ void WalManager::PurgeObsoleteWALFiles() { std::string const file_path = archived_logs[i]->PathName(); s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); if (!s.ok()) { - Log(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); continue; } else { @@ -255,7 +261,8 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { Status s = env_->RenameFile(fname, archived_log_name); // The sync point below is used in (DBTest,TransactionLogIteratorRace) TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); - Log(db_options_.info_log, "Move log file %s to %s -- %s\n", fname.c_str(), + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Move log file %s to %s -- %s\n", fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); } @@ -347,7 +354,10 @@ Status WalManager::ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence) { if (type != kAliveLogFile && type != kArchivedLogFile) { - return Status::NotSupported("File Type Not Known " + std::to_string(type)); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[WalManger] Unknown file type %s", std::to_string(type).c_str()); + return Status::NotSupported( + "File Type Not Known " + std::to_string(type)); } { MutexLock l(&read_first_record_cache_mutex_); @@ -393,7 +403,8 @@ Status WalManager::ReadFirstLine(const std::string& fname, Status* status; bool ignore_error; // true if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", + Log(InfoLogLevel::WARN_LEVEL, info_log, + "[WalManager] %s%s: dropping %d bytes; %s", (this->ignore_error ? 
"(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); if (this->status->ok()) { From d8e119663539491b10f75e0f0c39abfc296e185e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 4 Nov 2014 10:34:33 -0800 Subject: [PATCH 405/829] Apply InfoLogLevel to the logs in db/version_set.cc Summary: Apply InfoLogLevel to the logs in db/version_set.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D27879 --- db/version_set.cc | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index c4e20140f..3f7985028 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -804,7 +804,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { Status s = GetTableProperties(&tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { - Log(vset_->db_options_->info_log, + Log(InfoLogLevel::ERROR_LEVEL, vset_->db_options_->info_log, "Unable to load table properties for file %" PRIu64 " --- %s\n", file_meta->fd.GetNumber(), s.ToString().c_str()); return false; @@ -1605,7 +1605,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // only one thread can be here at the same time if (new_descriptor_log) { // create manifest file - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; s = env_->NewWritableFile( @@ -1683,7 +1683,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, db_options_->disableDataSync ? nullptr : db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care @@ -1733,12 +1733,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(db_options_->info_log, "Error in committing version %lu to [%s]", + Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, + "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); descriptor_log_.reset(); @@ -1830,7 +1831,8 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(db_options_->info_log, "Recovering from manifest file: %s\n", + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "Recovering from manifest file: %s\n", manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; @@ -1964,7 +1966,7 @@ Status VersionSet::Recover( if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { - Log(db_options_->info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { @@ -2062,7 +2064,7 @@ Status VersionSet::Recover( 
last_sequence_ = last_sequence; prev_log_number_ = prev_log_number; - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," @@ -2074,7 +2076,7 @@ Status VersionSet::Recover( column_family_set_->GetMaxColumnFamily()); for (auto cfd : *column_family_set_) { - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Column family [%s] (ID %u), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } @@ -2493,12 +2495,14 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, const std::string& record) const { std::string fname = DescriptorFileName(dbname_, manifest_file_number); - Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; Status s = env_->NewSequentialFile(fname, &file, env_options_); if (!s.ok()) { - Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - Log(db_options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: %s\n", s.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "ManifestContains: is unable to reopen the manifest file %s", fname.c_str()); return false; @@ -2513,7 +2517,8 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, break; } } - Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: result = %d\n", result ? 1 : 0); return result; } @@ -2635,7 +2640,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { Version* version = c->column_family_data()->current(); const VersionStorageInfo* vstorage = version->storage_info(); if (c->input_version() != version) { - Log(db_options_->info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, "[%s] VerifyCompactionFileConsistency version mismatch", c->column_family_data()->GetName().c_str()); } From 8810850dd40fedff9e975f6188f1c9b4fceae234 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 4 Nov 2014 11:07:11 -0800 Subject: [PATCH 406/829] Apply InfoLogLevel to the logs in db/compaction_job.cc Summary: Apply InfoLogLevel to the logs in db/compaction_job.cc Test Plan: db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: leveldb, MarkCallaghan, dhruba Differential Revision: https://reviews.facebook.net/D28275 --- db/compaction_job.cc | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 5a1a315ff..5db087b3c 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -318,7 +318,8 @@ Status CompactionJob::Run() { if (!ParseInternalKey(key, &ikey)) { // log error - Log(db_options_.info_log, "[%s] Failed to parse key: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "[%s] Failed to parse key: %s", cfd->GetName().c_str(), key.ToString().c_str()); continue; } else { @@ -957,10 +958,10 @@ Status CompactionJob::FinishCompactionOutputFile(Iterator* input) { s = iter->status(); delete iter; if (s.ok()) { - Log(db_options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 - " keys, %" PRIu64 " bytes", - cfd->GetName().c_str(), output_number, current_entries, - current_bytes); + 
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), + output_number, current_entries, current_bytes); } } return s; @@ -974,7 +975,8 @@ Status CompactionJob::InstallCompactionResults() { // This ensures that a concurrent compaction did not erroneously // pick the same files to compact_. if (!versions_->VerifyCompactionFileConsistency(compact_->compaction)) { - Log(db_options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[%s] Compaction %d@%d + %d@%d files aborted", compact_->compaction->column_family_data()->GetName().c_str(), compact_->compaction->num_input_files(0), compact_->compaction->level(), compact_->compaction->num_input_files(1), @@ -982,13 +984,14 @@ Status CompactionJob::InstallCompactionResults() { return Status::Corruption("Compaction input files inconsistent"); } - LogToBuffer(log_buffer_, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", - compact_->compaction->column_family_data()->GetName().c_str(), - compact_->compaction->num_input_files(0), - compact_->compaction->level(), - compact_->compaction->num_input_files(1), - compact_->compaction->output_level(), - static_cast(compact_->total_bytes)); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Compacted %d@%d + %d@%d files => %" PRIu64 " bytes", + compact_->compaction->column_family_data()->GetName().c_str(), + compact_->compaction->num_input_files(0), + compact_->compaction->level(), + compact_->compaction->num_input_files(1), + compact_->compaction->output_level(), + compact_->total_bytes); // Add compaction outputs compact_->compaction->AddInputDeletions(compact_->compaction->edit()); @@ -1023,9 +1026,11 @@ inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot( prev = cur; // assignment assert(prev); } - Log(db_options_.info_log, - "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, - snapshots[snapshots.size() - 1]); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "CompactionJob is not able to find snapshot" + " with SeqId later than %" PRIu64 + ": current MaxSeqId is %" PRIu64 "", + in, snapshots[snapshots.size() - 1]); assert(0); return 0; } @@ -1070,8 +1075,7 @@ Status CompactionJob::OpenCompactionOutputFile() { if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "[%s] OpenCompactionOutputFiles for table #%" PRIu64 - " " - "fails at NewWritableFile with status %s", + " fails at NewWritableFile with status %s", compact_->compaction->column_family_data()->GetName().c_str(), file_number, s.ToString().c_str()); LogFlush(db_options_.info_log); From b0cda4a116a333a10dbf42518f66e6d66e202fc7 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 4 Nov 2014 11:07:36 -0800 Subject: [PATCH 407/829] DBTest.DynamicMemtableOptions to use single background compaction Summary: Now DBTest.DynamicMemtableOptions sets background compaction to be 4, without actually increasing thread pool size (even before the feature of automatic increasing it). To make sure the behavior stays the same after the automatic thread pool increasing, set it back to 1. Hopefully it can fix the occasional failure of the test. 
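For context, a minimal sketch (illustrative only, not part of this patch; ConfigureCompactionParallelism is a hypothetical helper, but rocksdb::Options::max_background_compactions and Env::SetBackgroundThreads are the public APIs the Summary above alludes to) of why asking for four background compactions has no effect unless the env's LOW-priority thread pool is enlarged to match — which this test never did:

    #include "rocksdb/env.h"
    #include "rocksdb/options.h"

    void ConfigureCompactionParallelism() {
      rocksdb::Options options;
      options.max_background_compactions = 4;  // requested compaction parallelism
      // Only takes effect if the shared LOW-priority pool is sized to match;
      // otherwise the extra compaction jobs queue behind the default single thread.
      options.env->SetBackgroundThreads(4, rocksdb::Env::LOW);
    }

The hunk below sidesteps the question entirely by pinning the option back to 1, so the test's behavior no longer depends on the pool size.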
Test Plan: Run the test Reviewers: igor, ljin Reviewed By: ljin Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28281 --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 24bad640b..8b12d0a00 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8082,7 +8082,7 @@ TEST(DBTest, Level0StopWritesTest) { Options options = CurrentOptions(); options.level0_slowdown_writes_trigger = 2; options.level0_stop_writes_trigger = 4; - options.disable_auto_compactions = 4; + options.disable_auto_compactions = true; options.max_mem_compaction_level = 0; Reopen(options); @@ -8424,7 +8424,7 @@ TEST(DBTest, DynamicMemtableOptions) { options.env = env_; options.create_if_missing = true; options.compression = kNoCompression; - options.max_background_compactions = 4; + options.max_background_compactions = 1; options.max_mem_compaction_level = 0; options.write_buffer_size = k64KB; options.max_write_buffer_number = 2; From da5daa061cc3fd6aca15ce3b2350b1b55a4655a0 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 4 Nov 2014 11:33:57 -0800 Subject: [PATCH 408/829] Replace some ASSERT_TRUE() asserts in DBTest.DynamicMemtableOptions and DynamicCompactionOptions with more specific ones Summary: Replace some ASSERT_TRUE() to ASSERT_GT() and ASSERT_LT() so that in case the assert is triggered, the value is printed out. Test Plan: Run the two tests Reviewers: ljin, rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28293 --- db/db_test.cc | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 8b12d0a00..8f9972bd2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8445,8 +8445,8 @@ TEST(DBTest, DynamicMemtableOptions) { // Test write_buffer_size gen_l0_kb(64); ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_TRUE(SizeAtLevel(0) < k64KB + k5KB); - ASSERT_TRUE(SizeAtLevel(0) > k64KB - k5KB); + ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); + ASSERT_GT(SizeAtLevel(0), k64KB - k5KB); // Clean up L0 dbfull()->CompactRange(nullptr, nullptr); @@ -8462,8 +8462,8 @@ TEST(DBTest, DynamicMemtableOptions) { // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data gen_l0_kb(256); ASSERT_EQ(NumTableFilesAtLevel(0), 2); - ASSERT_TRUE(SizeAtLevel(0) < k128KB + k64KB + 2 * k5KB); - ASSERT_TRUE(SizeAtLevel(0) > k128KB + k64KB - 2 * k5KB); + ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 2 * k5KB); // Test max_write_buffer_number // Block compaction thread, which will also block the flushes because @@ -8488,7 +8488,8 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) { count++; } - ASSERT_TRUE(count > (128 * 0.8) && count < (128 * 1.2)); + ASSERT_GT(static_cast(count), 128 * 0.8); + ASSERT_LT(static_cast(count), 128 * 1.2); sleeping_task_low1.WakeUp(); sleeping_task_low1.WaitUntilDone(); @@ -8507,7 +8508,8 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { count++; } - ASSERT_TRUE(count > (512 * 0.8) && count < (512 * 1.2)); + ASSERT_GT(static_cast(count), 512 * 0.8); + ASSERT_LT(static_cast(count), 512 * 1.2); sleeping_task_low2.WakeUp(); sleeping_task_low2.WaitUntilDone(); @@ -8525,7 +8527,8 @@ TEST(DBTest, DynamicMemtableOptions) { while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { 
count++; } - ASSERT_TRUE(count > (256 * 0.8) && count < (256 * 1.2)); + ASSERT_GT(static_cast(count), 256 * 0.8); + ASSERT_LT(static_cast(count), 266 * 1.2); sleeping_task_low3.WakeUp(); sleeping_task_low3.WaitUntilDone(); } @@ -8622,10 +8625,12 @@ TEST(DBTest, DynamicCompactionOptions) { gen_l0_kb(i, 64, 96); } dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(SizeAtLevel(1) > k1MB * 0.5 && - SizeAtLevel(1) < k1MB * 1.5); - ASSERT_TRUE(SizeAtLevel(2) > 4 * k1MB * 0.5 && - SizeAtLevel(2) < 4 * k1MB * 1.5); + ASSERT_GT(SizeAtLevel(1), k1MB / 2); + ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); + + // Within (0.5, 1.5) of 4MB. + ASSERT_GT(SizeAtLevel(2), 2 * k1MB); + ASSERT_LT(SizeAtLevel(2), 6 * k1MB); // Test max_bytes_for_level_multiplier and // max_bytes_for_level_base. Now, reduce both mulitplier and level base, From 83bf09144bc9c80915dd67ab17182d4f0804cc08 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 4 Nov 2014 14:39:09 -0800 Subject: [PATCH 409/829] Bump verison number to 3.7 Summary: As tittle Test Plan: N/A Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28299 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 285278854..bef989661 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 6 +#define ROCKSDB_MINOR 7 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From b1267750fba1398bc09ba2d23d7814e8e422e5fc Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 4 Nov 2014 15:58:14 -0800 Subject: [PATCH 410/829] fix the asan check Summary: as title Test Plan: ran it Reviewers: yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28311 --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 4946c007f..b5b77882e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3047,7 +3047,7 @@ Status DBImpl::DeleteFile(std::string name) { vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s failed ---" - " target file in level 0 must be the oldest."); + " target file in level 0 must be the oldest.", name.c_str()); return Status::InvalidArgument("File in level 0, but not oldest"); } edit.SetColumnFamily(cfd->GetID()); From fd24ae9d05aa5bbc0be20a36580e680449be97c3 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 4 Nov 2014 16:23:05 -0800 Subject: [PATCH 411/829] SetOptions() to return status and also add it to StackableDB Summary: as title Test Plan: ./db_test Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28269 --- db/column_family.cc | 10 +++---- db/column_family.h | 2 +- db/db_impl.cc | 14 +++++----- db/db_impl.h | 2 +- db/db_test.cc | 34 ++++++++++++------------ include/rocksdb/db.h | 6 ++--- include/rocksdb/utilities/stackable_db.h | 7 +++++ tools/db_stress.cc | 2 +- util/options_helper.cc | 11 ++++---- util/options_helper.h | 3 ++- 10 files changed, 50 insertions(+), 41 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 9e74df583..b2670cbdb 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -544,16 +544,16 @@ void 
ColumnFamilyData::ResetThreadLocalSuperVersions() { } } -bool ColumnFamilyData::SetOptions( +Status ColumnFamilyData::SetOptions( const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; - if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - &new_mutable_cf_options)) { + Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + &new_mutable_cf_options); + if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); - return true; } - return false; + return s; } ColumnFamilySet::ColumnFamilySet(const std::string& dbname, diff --git a/db/column_family.h b/db/column_family.h index b37b684fa..013c29615 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -187,7 +187,7 @@ class ColumnFamilyData { return &mutable_cf_options_; } // REQUIRES: DB mutex held - bool SetOptions( + Status SetOptions( const std::unordered_map& options_map); InternalStats* internal_stats() { return internal_stats_.get(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index b5b77882e..2bbb3345f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1142,23 +1142,23 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, return s; } -bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, +Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { auto* cfd = reinterpret_cast(column_family)->cfd(); if (options_map.empty()) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "SetOptions() on column family [%s], empty input", cfd->GetName().c_str()); - return false; + return Status::InvalidArgument("empty input"); } MutableCFOptions new_options; - bool succeed = false; + Status s; { MutexLock l(&mutex_); - if (cfd->SetOptions(options_map)) { + s = cfd->SetOptions(options_map); + if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); - succeed = true; } } @@ -1169,7 +1169,7 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "%s: %s\n", o.first.c_str(), o.second.c_str()); } - if (succeed) { + if (s.ok()) { Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] SetOptions succeeded", cfd->GetName().c_str()); @@ -1178,7 +1178,7 @@ bool DBImpl::SetOptions(ColumnFamilyHandle* column_family, Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "[%s] SetOptions failed", cfd->GetName().c_str()); } - return succeed; + return s; } // return the same level if it cannot be moved diff --git a/db/db_impl.h b/db/db_impl.h index 5aa1eb8ed..8717dee90 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -115,7 +115,7 @@ class DBImpl : public DB { uint32_t target_path_id = 0); using DB::SetOptions; - bool SetOptions(ColumnFamilyHandle* column_family, + Status SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map); using DB::NumberLevels; diff --git a/db/db_test.cc b/db/db_test.cc index 8f9972bd2..7aea863f8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8453,7 +8453,7 @@ TEST(DBTest, DynamicMemtableOptions) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); // Increase buffer size - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"write_buffer_size", "131072"}, })); @@ -8495,7 +8495,7 @@ TEST(DBTest, DynamicMemtableOptions) { sleeping_task_low1.WaitUntilDone(); // Increase - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_write_buffer_number", "8"}, })); // Clean up memtable and L0 @@ -8514,7 +8514,7 @@ TEST(DBTest, 
DynamicMemtableOptions) { sleeping_task_low2.WaitUntilDone(); // Decrease - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_write_buffer_number", "4"}, })); // Clean up memtable and L0 @@ -8593,7 +8593,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Writing to 64KB L0 files should trigger a compaction. Since these // 2 L0 files have the same key range, compaction merge them and should // result in 2 32KB L1 files. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"level0_file_num_compaction_trigger", "2"}, {"target_file_size_base", std::to_string(k32KB) } })); @@ -8615,7 +8615,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Increase level base size to 256KB and write enough data that will // fill L1 and L2. L1 size should be around 256KB while L2 size should be // around 256KB x 4. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k1MB) } })); @@ -8636,7 +8636,7 @@ TEST(DBTest, DynamicCompactionOptions) { // max_bytes_for_level_base. Now, reduce both mulitplier and level base, // After filling enough data that can fit in L1 - L3, we should see L1 size // reduces to 128KB from 256KB which was asserted previously. Same for L2. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_bytes_for_level_multiplier", "2"}, {"max_bytes_for_level_base", std::to_string(k128KB) } })); @@ -8678,7 +8678,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. // Block compaction thread again. Perform the put and memtable flushes // until we see timeout after 6 memtable flushes. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"level0_stop_writes_trigger", "6"} })); dbfull()->CompactRange(nullptr, nullptr); @@ -8703,7 +8703,7 @@ TEST(DBTest, DynamicCompactionOptions) { // 4 L0 files and compaction should be triggered. If auto compaction is // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of // L0 files do not change after the call. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"} })); dbfull()->CompactRange(nullptr, nullptr); @@ -8719,7 +8719,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Enable auto compaction and perform the same test, # of L0 files should be // reduced after compaction. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"} })); dbfull()->CompactRange(nullptr, nullptr); @@ -8737,7 +8737,7 @@ TEST(DBTest, DynamicCompactionOptions) { // First change max_bytes_for_level_base to a big value and populate // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction // at the same time, we should see some level with score greater than 2. - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_bytes_for_level_base", std::to_string(k1MB) } })); // writing 40 x 64KB = 10 x 256KB @@ -8754,7 +8754,7 @@ TEST(DBTest, DynamicCompactionOptions) { SizeAtLevel(3) < 4 * k1MB * 1.2)); // Reduce max_bytes_for_level_base and disable compaction at the same time // This should cause score to increase - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, {"max_bytes_for_level_base", "65536"}, })); @@ -8769,7 +8769,7 @@ TEST(DBTest, DynamicCompactionOptions) { // Enfoce hard rate limit. 
Now set hard_rate_limit to 2, // we should start to see put delay (1000 us) and timeout as a result // (L0 score is not regulated by this limit). - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"hard_rate_limit", "2"} })); ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024))); @@ -8781,7 +8781,7 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).IsTimedOut()); // Lift the limit and no timeout - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"hard_rate_limit", "100"} })); dbfull()->TEST_FlushMemTable(true); @@ -8807,7 +8807,7 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(Put("max_mem_compaction_level_key", RandomString(&rnd, 8)).ok()); // Set new value and it becomes effective in this flush - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_mem_compaction_level", "1"} })); dbfull()->TEST_FlushMemTable(true); @@ -8818,7 +8818,7 @@ TEST(DBTest, DynamicCompactionOptions) { ASSERT_TRUE(Put("max_mem_compaction_level_key", RandomString(&rnd, 8)).ok()); // Set new value and it becomes effective in this flush - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_mem_compaction_level", "0"} })); dbfull()->TEST_FlushMemTable(true); @@ -8994,7 +8994,7 @@ TEST(DBTest, DynamicMiscOptions) { // No reseek assert_reseek_count(100, 0); - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_sequential_skip_in_iterations", "4"} })); // Clear memtable and make new option effective @@ -9002,7 +9002,7 @@ TEST(DBTest, DynamicMiscOptions) { // Trigger reseek assert_reseek_count(200, 1); - ASSERT_TRUE(dbfull()->SetOptions({ + ASSERT_OK(dbfull()->SetOptions({ {"max_sequential_skip_in_iterations", "16"} })); // Clear memtable and make new option effective diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 0653a8386..21fa43838 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -359,11 +359,11 @@ class DB { return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, target_level, target_path_id); } - virtual bool SetOptions(ColumnFamilyHandle* column_family, + virtual Status SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& new_options) { - return true; + return Status::NotSupported("Not implemented"); } - virtual bool SetOptions( + virtual Status SetOptions( const std::unordered_map& new_options) { return SetOptions(DefaultColumnFamily(), new_options); } diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 417378f5d..50c6a6484 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -3,6 +3,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once +#include #include "rocksdb/db.h" namespace rocksdb { @@ -203,6 +204,12 @@ class StackableDB : public DB { return db_->GetDbIdentity(identity); } + using DB::SetOptions; + virtual Status SetOptions( + const std::unordered_map& new_options) override { + return db_->SetOptions(new_options); + } + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { diff --git a/tools/db_stress.cc b/tools/db_stress.cc index d2bdec7e0..42d0fb534 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1298,7 +1298,7 @@ class StressTest { return s; } - bool SetOptions(ThreadState* thread) { + Status SetOptions(ThreadState* thread) { assert(FLAGS_set_options_one_in > 0); std::unordered_map opts; std::string name = options_index_[ diff --git a/util/options_helper.cc b/util/options_helper.cc index 9b95150c5..268a67a99 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -165,7 +165,7 @@ bool ParseMiscOptions(const std::string& name, const std::string& value, return true; } -bool GetMutableOptionsFromStrings( +Status GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, MutableCFOptions* new_options) { @@ -177,13 +177,14 @@ bool GetMutableOptionsFromStrings( } else if (ParseCompactionOptions(o.first, o.second, new_options)) { } else if (ParseMiscOptions(o.first, o.second, new_options)) { } else { - return false; + return Status::InvalidArgument( + "unsupported dynamic option: " + o.first); } } - } catch (std::exception) { - return false; + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + std::string(e.what())); } - return true; + return Status::OK(); } namespace { diff --git a/util/options_helper.h b/util/options_helper.h index c04d2a5d7..62373b2d5 100644 --- a/util/options_helper.h +++ b/util/options_helper.h @@ -7,10 +7,11 @@ #include #include "util/mutable_cf_options.h" +#include "rocksdb/status.h" namespace rocksdb { -bool GetMutableOptionsFromStrings( +Status GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, MutableCFOptions* new_options); From 29a9161f34fff5d5fb76e47175baf2f755c97e45 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 4 Nov 2014 16:23:45 -0800 Subject: [PATCH 412/829] Note dynamic options in options.h Summary: as title Test Plan: n/a Reviewers: igor, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28287 --- HISTORY.md | 1 + include/rocksdb/options.h | 69 +++++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7c0b5a9b8..9a7cd7810 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ## Unreleased ### Public API changes +* Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts * When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. 
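A usage sketch of the new call site follows. The helper name and the surrounding error handling are illustrative, not taken from these diffs; the option names and values mirror the db_test.cc cases above, and only options documented as "Dynamically changeable through SetOptions() API" can be adjusted this way.

#include <cstdio>
#include <string>
#include <unordered_map>
#include "rocksdb/db.h"

// Adjust mutable options on the default column family and check the
// Status that SetOptions() now returns instead of a bare bool.
rocksdb::Status TuneDynamicOptions(rocksdb::DB* db) {
  std::unordered_map<std::string, std::string> new_options = {
      {"write_buffer_size", "131072"},
      {"disable_auto_compactions", "false"}};
  rocksdb::Status s = db->SetOptions(db->DefaultColumnFamily(), new_options);
  if (!s.ok()) {
    // Unknown or non-dynamic option names come back as InvalidArgument.
    fprintf(stderr, "SetOptions failed: %s\n", s.ToString().c_str());
  }
  return s;
}

The same string-map form is what DBImpl::SetOptions() logs per option and forwards to ColumnFamilyData::SetOptions() under the DB mutex.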
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index d9a82fd5a..b3ce77255 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -191,13 +191,18 @@ struct ColumnFamilyOptions { // the next time the database is opened. // // Default: 4MB + // + // Dynamically changeable through SetOptions() API size_t write_buffer_size; // The maximum number of write buffers that are built up in memory. // The default and the minimum number is 2, so that when 1 write buffer // is being flushed to storage, new writes can continue to the other // write buffer. + // // Default: 2 + // + // Dynamically changeable through SetOptions() API int max_write_buffer_number; // The minimum number of write buffers that will be merged together @@ -260,14 +265,20 @@ struct ColumnFamilyOptions { // level-0 compaction will not be triggered by number of files at all. // // Default: 4 + // + // Dynamically changeable through SetOptions() API int level0_file_num_compaction_trigger; // Soft limit on number of level-0 files. We start slowing down writes at this // point. A value <0 means that no writing slow down will be triggered by // number of files in level-0. + // + // Dynamically changeable through SetOptions() API int level0_slowdown_writes_trigger; // Maximum number of level-0 files. We stop writes at this point. + // + // Dynamically changeable through SetOptions() API int level0_stop_writes_trigger; // Maximum level to which a new compacted memtable is pushed if it @@ -276,6 +287,8 @@ struct ColumnFamilyOptions { // expensive manifest file operations. We do not push all the way to // the largest level since that can generate a lot of wasted disk // space if the same key space is being repeatedly overwritten. + // + // Dynamically changeable through SetOptions() API int max_mem_compaction_level; // Target file size for compaction. @@ -286,11 +299,15 @@ struct ColumnFamilyOptions { // target_file_size_multiplier is 10, then each file on level-1 will // be 2MB, and each file on level 2 will be 20MB, // and each file on level-3 will be 200MB. - - // by default target_file_size_base is 2MB. + // + // Default: 2MB. + // + // Dynamically changeable through SetOptions() API uint64_t target_file_size_base; - // by default target_file_size_multiplier is 1, which means + // By default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. + // + // Dynamically changeable through SetOptions() API int target_file_size_multiplier; // Control maximum total data size for a level. @@ -301,22 +318,31 @@ struct ColumnFamilyOptions { // max_bytes_for_level_multiplier is 10, total data size for level-1 // will be 20MB, total file size for level-2 will be 200MB, // and total file size for level-3 will be 2GB. - - // by default 'max_bytes_for_level_base' is 10MB. + // + // Default: 10MB. + // + // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base; - // by default 'max_bytes_for_level_base' is 10. + // Default: 10. + // + // Dynamically changeable through SetOptions() API int max_bytes_for_level_multiplier; // Different max-size multipliers for different levels. // These are multiplied by max_bytes_for_level_multiplier to arrive // at the max-size of each level. + // // Default: 1 + // + // Dynamically changeable through SetOptions() API std::vector max_bytes_for_level_multiplier_additional; // Maximum number of bytes in all compacted files. 
We avoid expanding // the lower level file set of a compaction if it would make the // total compaction cover more than // (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + // + // Dynamically changeable through SetOptions() API int expanded_compaction_factor; // Maximum number of bytes in all source files to be compacted in a @@ -326,22 +352,32 @@ struct ColumnFamilyOptions { // (source_compaction_factor * targetFileSizeLevel()) many bytes. // Default:1, i.e. pick maxfilesize amount of data as the source of // a compaction. + // + // Dynamically changeable through SetOptions() API int source_compaction_factor; // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we // stop building a single file in a level->level+1 compaction. + // + // Dynamically changeable through SetOptions() API int max_grandparent_overlap_factor; // Puts are delayed 0-1 ms when any level has a compaction score that exceeds // soft_rate_limit. This is ignored when == 0.0. // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not // hold, RocksDB will set soft_rate_limit = hard_rate_limit + // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API double soft_rate_limit; // Puts are delayed 1ms at a time when any level has a compaction score that // exceeds hard_rate_limit. This is ignored when <= 1.0. + // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API double hard_rate_limit; // DEPRECATED -- this options is no longer used @@ -360,10 +396,14 @@ struct ColumnFamilyOptions { // conforms to the restrictions. // // Default: 0 + // + // Dynamically changeable through SetOptions() API size_t arena_block_size; // Disable automatic compactions. Manual compactions can still // be issued on this column family + // + // Dynamically changeable through SetOptions() API bool disable_auto_compactions; // Purge duplicate/deleted keys when a memtable is flushed to storage. @@ -388,14 +428,20 @@ struct ColumnFamilyOptions { // If KeyMayExist returns false, i.e. the key definitely does not exist, then // the delete is a noop. KeyMayExist only incurs in-memory look up. // This optimization avoids writing the delete to storage when appropriate. + // // Default: false + // + // Dynamically changeable through SetOptions() API bool filter_deletes; // An iteration->Next() sequentially skips over keys with the same // user-key unless this option is set. This number specifies the number // of keys (with the same userkey) that will be sequentially // skipped before a reseek is issued. + // // Default: 8 + // + // Dynamically changeable through SetOptions() API uint64_t max_sequential_skip_in_iterations; // This is a factory that provides MemTableRep objects. @@ -444,6 +490,8 @@ struct ColumnFamilyOptions { // Number of locks used for inplace update // Default: 10000, if inplace_update_support = true, else 0. + // + // Dynamically changeable through SetOptions() API size_t inplace_update_num_locks; // existing_value - pointer to previous value (from both memtable and sst). @@ -490,9 +538,13 @@ struct ColumnFamilyOptions { // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom // for memtable + // + // Dynamically changeable through SetOptions() API uint32_t memtable_prefix_bloom_bits; // number of hash probes per key + // + // Dynamically changeable through SetOptions() API uint32_t memtable_prefix_bloom_probes; // Page size for huge page TLB for bloom in memtable. 
If <=0, not allocate @@ -500,7 +552,8 @@ struct ColumnFamilyOptions { // Need to reserve huge pages for it to be allocated. For example: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt - + // + // Dynamically changeable through SetOptions() API size_t memtable_prefix_bloom_huge_page_tlb_size; // Control locality of bloom filter probes to improve cache miss rate. @@ -520,6 +573,8 @@ struct ColumnFamilyOptions { // operations in the memtable. // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API size_t max_successive_merges; // The number of partial merge operands to accumulate before partial From 76f6c7c7c402d0f9f814718ed3b516b12c74453f Mon Sep 17 00:00:00 2001 From: maurice barnum Date: Wed, 5 Nov 2014 05:31:11 +0000 Subject: [PATCH 413/829] CompactionFilterV2: eliminate an often unnecessary allocation. If a compaction filter implementation is simply filtering values, then allocating the "changed values" bitmap is an extra memory allocation that adds no value. Additionally, the compaction implementation has to do marginally more work to calculate the offset into the bitmap (vector specialization) for each record the filter did not mark for deletion. Explicitly handle the case where compact_->value_changed_buf_ is empty. --- db/compaction_job.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 5db087b3c..5c0d95e12 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -890,7 +890,8 @@ void CompactionJob::CallCompactionFilterV2( assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size()); assert(compact_->to_delete_buf_.size() == compact_->existing_value_str_buf_.size()); - assert(compact_->to_delete_buf_.size() == + assert(compact_->value_changed_buf_.empty() || + compact_->to_delete_buf_.size() == compact_->value_changed_buf_.size()); int new_value_idx = 0; @@ -905,7 +906,8 @@ void CompactionJob::CallCompactionFilterV2( // no value associated with delete compact_->existing_value_str_buf_[i].clear(); RecordTick(stats_, COMPACTION_KEY_DROP_USER); - } else if (compact_->value_changed_buf_[i]) { + } else if (!compact_->value_changed_buf_.empty() && + compact_->value_changed_buf_[i]) { compact_->existing_value_str_buf_[i] = compact_->new_value_buf_[new_value_idx++]; } From e4211d10c1699331b9ddae28ef6ec74633669558 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 5 Nov 2014 00:12:20 -0800 Subject: [PATCH 414/829] Apply InfoLogLevel to the logs in util/env_hdfs.cc Summary: Apply InfoLogLevel to the logs in util/env_hdfs.cc Test Plan: make Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28011 --- util/env_hdfs.cc | 92 +++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc index 1618e5468..bbd5b9779 100644 --- a/util/env_hdfs.cc +++ b/util/env_hdfs.cc @@ -52,18 +52,22 @@ class HdfsReadableFile : virtual public SequentialFile, public: HdfsReadableFile(hdfsFS fileSys, const std::string& fname) : fileSys_(fileSys), filename_(fname), hfile_(nullptr) { - Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile opening file %s\n", filename_.c_str()); hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0); - Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", - filename_.c_str(), 
hfile_); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", + filename_.c_str(), hfile_); } virtual ~HdfsReadableFile() { - Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n", - filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile closing file %s\n", + filename_.c_str()); hdfsCloseFile(fileSys_, hfile_); - Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile closed file %s\n", filename_.c_str()); hfile_ = nullptr; } @@ -75,7 +79,8 @@ class HdfsReadableFile : virtual public SequentialFile, // sequential access, read data at current offset in file virtual Status Read(size_t n, Slice* result, char* scratch) { Status s; - Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile reading %s %ld\n", filename_.c_str(), n); char* buffer = scratch; @@ -97,7 +102,8 @@ class HdfsReadableFile : virtual public SequentialFile, } assert(total_bytes_read <= n); - Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); if (bytes_read < 0) { s = IOError(filename_, errno); @@ -112,10 +118,12 @@ class HdfsReadableFile : virtual public SequentialFile, virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status s; - Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, (void*)scratch, (tSize)n); - Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); *result = Slice(scratch, (bytes_read < 0) ? 
0 : bytes_read); if (bytes_read < 0) { // An error: return a non-ok status @@ -125,7 +133,8 @@ class HdfsReadableFile : virtual public SequentialFile, } virtual Status Skip(uint64_t n) { - Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); // get current offset from file tOffset current = hdfsTell(fileSys_, hfile_); if (current < 0) { @@ -144,7 +153,8 @@ class HdfsReadableFile : virtual public SequentialFile, // returns true if we are at the end of file, false otherwise bool feof() { - Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); if (hdfsTell(fileSys_, hfile_) == fileSize()) { return true; } @@ -153,7 +163,8 @@ class HdfsReadableFile : virtual public SequentialFile, // the current size of the file tOffset fileSize() { - Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str()); tOffset size = 0L; if (pFileInfo != nullptr) { @@ -176,16 +187,20 @@ class HdfsWritableFile: public WritableFile { public: HdfsWritableFile(hdfsFS fileSys, const std::string& fname) : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) { - Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0); - Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); assert(hfile_ != nullptr); } virtual ~HdfsWritableFile() { if (hfile_ != nullptr) { - Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); hdfsCloseFile(fileSys_, hfile_); - Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); hfile_ = nullptr; } } @@ -202,11 +217,13 @@ class HdfsWritableFile: public WritableFile { } virtual Status Append(const Slice& data) { - Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); const char* src = data.data(); size_t left = data.size(); size_t ret = hdfsWrite(fileSys_, hfile_, src, left); - Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); if (ret != left) { return IOError(filename_, errno); } @@ -219,14 +236,16 @@ class HdfsWritableFile: public WritableFile { virtual Status Sync() { Status s; - Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); if (hdfsFlush(fileSys_, hfile_) == -1) { return IOError(filename_, errno); } if (hdfsHSync(fileSys_, hfile_) == -1) { return IOError(filename_, errno); } - Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile 
Synced %s\n", filename_.c_str()); return Status::OK(); } @@ -239,11 +258,13 @@ class HdfsWritableFile: public WritableFile { } virtual Status Close() { - Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); if (hdfsCloseFile(fileSys_, hfile_) != 0) { return IOError(filename_, errno); } - Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); hfile_ = nullptr; return Status::OK(); } @@ -258,13 +279,15 @@ class HdfsLogger : public Logger { public: HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { - Log(mylog, "[hdfs] HdfsLogger opened %s\n", - file_->getName().c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsLogger opened %s\n", + file_->getName().c_str()); } virtual ~HdfsLogger() { - Log(mylog, "[hdfs] HdfsLogger closed %s\n", - file_->getName().c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); delete file_; if (mylog != nullptr && mylog == this) { mylog = nullptr; @@ -417,7 +440,8 @@ Status HdfsEnv::NewDirectory(const std::string& name, result->reset(new HdfsDirectory(0)); return Status::OK(); default: // fail if the directory doesn't exist - Log(mylog, "NewDirectory hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, + mylog, "NewDirectory hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + std::to_string(value) + " on path " + name + ".\n"); @@ -433,7 +457,8 @@ bool HdfsEnv::FileExists(const std::string& fname) { case HDFS_DOESNT_EXIST: return false; default: // anything else should be an error - Log(mylog, "FileExists hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, + mylog, "FileExists hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + std::to_string(value) + " on path " + fname + ".\n"); @@ -461,7 +486,8 @@ Status HdfsEnv::GetChildren(const std::string& path, } } else { // numEntries < 0 indicates error - Log(mylog, "hdfsListDirectory call failed with error "); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "hdfsListDirectory call failed with error "); throw HdfsFatalException( "hdfsListDirectory call failed negative error.\n"); } @@ -470,7 +496,8 @@ Status HdfsEnv::GetChildren(const std::string& path, case HDFS_DOESNT_EXIST: // directory does not exist, exit break; default: // anything else should be an error - Log(mylog, "GetChildren hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "GetChildren hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + std::to_string(value) + ".\n"); } @@ -500,7 +527,8 @@ Status HdfsEnv::CreateDirIfMissing(const std::string& name) { case HDFS_DOESNT_EXIST: return CreateDir(name); default: // anything else should be an error - Log(mylog, "CreateDirIfMissing hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "CreateDirIfMissing hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + std::to_string(value) + ".\n"); } From 2ea1219eb615f03130a6c0ef713cd3ab31d67553 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 31 Oct 2014 18:36:07 -0700 Subject: [PATCH 415/829] Fix RecordIn and RecordDrop stats Summary: 1. fix possible overflow of the two stats by using uint64_t 2. use a similar source of data to calculate RecordDrop. 
Previous one is not correct. Test Plan: See outputs of db_bench settings, and the results look reasonable Reviewers: MarkCallaghan, ljin, igor Reviewed By: igor Subscribers: rven, leveldb, yhchiang, dhruba Differential Revision: https://reviews.facebook.net/D28155 --- db/compaction_job.cc | 43 ++++++++++++------------ db/compaction_job.h | 3 +- db/internal_stats.cc | 80 ++++++++++++++++++++------------------------ db/internal_stats.h | 4 +-- 4 files changed, 60 insertions(+), 70 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 5db087b3c..dc472233b 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -81,7 +81,11 @@ struct CompactionJob::CompactionState { Output* current_output() { return &outputs[outputs.size() - 1]; } - explicit CompactionState(Compaction* c) : compaction(c), total_bytes(0) {} + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0), + num_input_records(0), + num_output_records(0) {} // Create a client visible context of this compaction CompactionFilter::Context GetFilterContextV1() { @@ -117,6 +121,9 @@ struct CompactionJob::CompactionState { std::string cur_prefix_; + uint64_t num_input_records; + uint64_t num_output_records; + // Buffers the kv-pair that will be run through compaction filter V2 // in the future. void BufferKeyValueSlices(const Slice& key, const Slice& value) { @@ -271,7 +278,6 @@ Status CompactionJob::Run() { log_buffer_->FlushBufferToLog(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - int num_output_records = 0; const uint64_t start_micros = env_->NowMicros(); std::unique_ptr input( versions_->MakeInputIterator(compact_->compaction)); @@ -289,8 +295,7 @@ Status CompactionJob::Run() { int64_t imm_micros = 0; // Micros spent doing imm_ compactions if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction(&imm_micros, input.get(), false, - &num_output_records); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), false); } else { // temp_backup_input always point to the start of the current buffer // temp_backup_input = backup_input; @@ -361,8 +366,7 @@ Status CompactionJob::Run() { // Done buffering for the current prefix. 
Spit it out to disk // Now just iterate through all the kv-pairs - status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, - &num_output_records); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); if (!status.ok()) { break; @@ -387,8 +391,7 @@ Status CompactionJob::Run() { } compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, - &num_output_records); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); compact_->CleanupBatchBuffer(); compact_->CleanupMergedBuffer(); @@ -399,8 +402,7 @@ Status CompactionJob::Run() { CallCompactionFilterV2(compaction_filter_v2); } compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - status = ProcessKeyValueCompaction(&imm_micros, input.get(), true, - &num_output_records); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); } // checking for compaction filter v2 if (status.ok() && @@ -434,27 +436,26 @@ Status CompactionJob::Run() { } compaction_stats_.files_out_levelnp1 = num_output_files; - uint64_t num_input_records = 0; - for (int i = 0; i < compact_->compaction->num_input_files(0); i++) { compaction_stats_.bytes_readn += compact_->compaction->input(0, i)->fd.GetFileSize(); compaction_stats_.num_input_records += - compact_->compaction->input(0, i)->num_entries; - num_input_records += compact_->compaction->input(0, i)->num_entries; + static_cast(compact_->compaction->input(0, i)->num_entries); } for (int i = 0; i < compact_->compaction->num_input_files(1); i++) { compaction_stats_.bytes_readnp1 += compact_->compaction->input(1, i)->fd.GetFileSize(); - num_input_records += compact_->compaction->input(1, i)->num_entries; } for (int i = 0; i < num_output_files; i++) { compaction_stats_.bytes_written += compact_->outputs[i].file_size; } - compaction_stats_.num_dropped_records = - static_cast(num_input_records) - num_output_records; + if (compact_->num_input_records > compact_->num_output_records) { + compaction_stats_.num_dropped_records += + compact_->num_input_records - compact_->num_output_records; + compact_->num_input_records = compact_->num_output_records = 0; + } RecordCompactionIOStats(); @@ -518,10 +519,7 @@ void CompactionJob::AllocateCompactionOutputFileNumbers() { Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input, - bool is_compaction_v2, - int* num_output_records) { - assert(num_output_records != nullptr); - + bool is_compaction_v2) { size_t combined_idx = 0; Status status; std::string compaction_filter_value; @@ -553,6 +551,7 @@ Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, int64_t loop_cnt = 0; while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) && !cfd->IsDropped() && status.ok()) { + compact_->num_input_records++; if (++loop_cnt > 1000) { if (key_drop_user > 0) { RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); @@ -795,7 +794,7 @@ Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, } compact_->current_output()->largest.DecodeFrom(newkey); compact_->builder->Add(newkey, value); - (*num_output_records)++, + compact_->num_output_records++, compact_->current_output()->largest_seqno = std::max(compact_->current_output()->largest_seqno, seqno); diff --git a/db/compaction_job.h b/db/compaction_job.h index 7b91e012a..f090c351d 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -83,8 +83,7 @@ class CompactionJob { // Call compaction filter if is_compaction_v2 is not true. 
Then iterate // through input and compact the kv-pairs Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input, - bool is_compaction_v2, - int* num_output_records); + bool is_compaction_v2); // Call compaction_filter_v2->Filter() on kv-pairs in compact void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2); Status FinishCompactionOutputFile(Iterator* input); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index cda75e0c8..617626cb1 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -46,50 +46,42 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, double elapsed = (stats.micros + 1) / 1000000.0; snprintf(buf, len, - "%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */ - "%8.1f " /* Read(GB) */ - "%7.1f " /* Rn(GB) */ - "%8.1f " /* Rnp1(GB) */ - "%9.1f " /* Write(GB) */ - "%8.1f " /* Wnew(GB) */ - "%6.1f " /* RW-Amp */ - "%5.1f " /* W-Amp */ - "%8.1f " /* Rd(MB/s) */ - "%8.1f " /* Wr(MB/s) */ - "%8d " /* Rn(cnt) */ - "%9d " /* Rnp1(cnt) */ - "%9d " /* Wnp1(cnt) */ - "%9d " /* Wnew(cnt) */ - "%10.0f " /* Comp(sec) */ - "%9d " /* Comp(cnt) */ - "%8.3f " /* Avg(sec) */ - "%10.2f " /* Stall(sec) */ - "%10" PRIu64 " " /* Stall(cnt) */ - "%7.2f " /* Avg(ms) */ - "%12d " /* input entries */ - "%12d\n" /* number of records reduced */, - name.c_str(), num_files, being_compacted, total_file_size / kMB, score, - bytes_read / kGB, - stats.bytes_readn / kGB, - stats.bytes_readnp1 / kGB, - stats.bytes_written / kGB, - bytes_new / kGB, - rw_amp, - w_amp, - bytes_read / kMB / elapsed, - stats.bytes_written / kMB / elapsed, - stats.files_in_leveln, - stats.files_in_levelnp1, - stats.files_out_levelnp1, - stats.files_out_levelnp1 - stats.files_in_levelnp1, - stats.micros / 1000000.0, - stats.count, - stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, - stall_us / 1000000.0, - stalls, - stalls == 0 ? 0 : stall_us / 1000.0 / stalls, - stats.num_input_records, - stats.num_dropped_records); + "%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */ + "%8.1f " /* Read(GB) */ + "%7.1f " /* Rn(GB) */ + "%8.1f " /* Rnp1(GB) */ + "%9.1f " /* Write(GB) */ + "%8.1f " /* Wnew(GB) */ + "%6.1f " /* RW-Amp */ + "%5.1f " /* W-Amp */ + "%8.1f " /* Rd(MB/s) */ + "%8.1f " /* Wr(MB/s) */ + "%8d " /* Rn(cnt) */ + "%9d " /* Rnp1(cnt) */ + "%9d " /* Wnp1(cnt) */ + "%9d " /* Wnew(cnt) */ + "%10.0f " /* Comp(sec) */ + "%9d " /* Comp(cnt) */ + "%8.3f " /* Avg(sec) */ + "%10.2f " /* Stall(sec) */ + "%10" PRIu64 + " " /* Stall(cnt) */ + "%7.2f " /* Avg(ms) */ + "%12" PRIu64 + " " /* input entries */ + "%12" PRIu64 "\n" /* number of records reduced */, + name.c_str(), num_files, being_compacted, total_file_size / kMB, + score, bytes_read / kGB, stats.bytes_readn / kGB, + stats.bytes_readnp1 / kGB, stats.bytes_written / kGB, + bytes_new / kGB, rw_amp, w_amp, bytes_read / kMB / elapsed, + stats.bytes_written / kMB / elapsed, stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.files_out_levelnp1 - stats.files_in_levelnp1, + stats.micros / 1000000.0, stats.count, + stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, + stall_us / 1000000.0, stalls, + stalls == 0 ? 
0 : stall_us / 1000.0 / stalls, + stats.num_input_records, stats.num_dropped_records); } diff --git a/db/internal_stats.h b/db/internal_stats.h index 0c3ee6db7..2fbcefd4c 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -127,11 +127,11 @@ class InternalStats { int files_out_levelnp1; // Total incoming entries during compaction between levels N and N+1 - int num_input_records; + uint64_t num_input_records; // Accumulated diff number of entries // (num input entries - num output entires) for compaction levels N and N+1 - int num_dropped_records; + uint64_t num_dropped_records; // Number of compactions done int count; From ea18b944a7d9e94ec3151656e47efdd5ff170ac1 Mon Sep 17 00:00:00 2001 From: Shi Feng Date: Wed, 29 Oct 2014 14:24:34 -0700 Subject: [PATCH 416/829] Add db_bench option --report_file_operations Summary: Add db_bench option --report_file_operations Test Plan: ./db_bench --report_file_operations Observe outputs on # of file operations Reviewers: ljin, MarkCallaghan, sdong Reviewed By: sdong Subscribers: yhchiang, rven, igor, dhruba Differential Revision: https://reviews.facebook.net/D27945 --- db/db_bench.cc | 156 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index fcc930e67..79572e875 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -575,6 +575,8 @@ DEFINE_string(merge_operator, "", "The merge operator to use with the database." DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try " "linear search first for this many steps from the previous " "position"); +DEFINE_bool(report_file_operations, false, "if report number of file " + "operations"); static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit); @@ -606,6 +608,131 @@ static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) = namespace rocksdb { +namespace { +struct ReportFileOpCounters { + std::atomic open_counter_; + std::atomic read_counter_; + std::atomic append_counter_; + std::atomic bytes_read_; + std::atomic bytes_written_; +}; + +// A special Env to records and report file operations in db_bench +class ReportFileOpEnv : public EnvWrapper { + public: + explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); } + + void reset() { + counters_.open_counter_ = 0; + counters_.read_counter_ = 0; + counters_.append_counter_ = 0; + counters_.bytes_read_ = 0; + counters_.bytes_written_ = 0; + } + + Status NewSequentialFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public SequentialFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + + virtual Status Read(size_t n, Slice* result, char* scratch) { + counters_->read_counter_.fetch_add(1, std::memory_order_relaxed); + Status rv = target_->Read(n, result, scratch); + counters_->bytes_read_.fetch_add(result->size(), + std::memory_order_relaxed); + return rv; + } + + virtual Status Skip(uint64_t n) { return target_->Skip(n); } + }; + + Status s = target()->NewSequentialFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const 
EnvOptions& soptions) { + class CountingFile : public RandomAccessFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counters_->read_counter_.fetch_add(1, std::memory_order_relaxed); + Status rv = target_->Read(offset, n, result, scratch); + counters_->bytes_read_.fetch_add(result->size(), + std::memory_order_relaxed); + return rv; + } + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public WritableFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + + Status Append(const Slice& data) { + counters_->append_counter_.fetch_add(1, std::memory_order_relaxed); + Status rv = target_->Append(data); + counters_->bytes_written_.fetch_add(data.size(), + std::memory_order_relaxed); + return rv; + } + + Status Close() { return target_->Close(); } + Status Flush() { return target_->Flush(); } + Status Sync() { return target_->Sync(); } + }; + + Status s = target()->NewWritableFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + // getter + ReportFileOpCounters* counters() { return &counters_; } + + private: + ReportFileOpCounters counters_; +}; + +} // namespace + // Helper for quickly generating random data. class RandomGenerator { private: @@ -810,6 +937,21 @@ class Stats { if (FLAGS_histogram) { fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); } + if (FLAGS_report_file_operations) { + ReportFileOpEnv* env = static_cast(FLAGS_env); + ReportFileOpCounters* counters = env->counters(); + fprintf(stdout, "Num files opened: %d\n", + counters->open_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num Read(): %d\n", + counters->read_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num Append(): %d\n", + counters->append_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num bytes read: %" PRIu64 "\n", + counters->bytes_read_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num bytes written: %" PRIu64 "\n", + counters->bytes_written_.load(std::memory_order_relaxed)); + env->reset(); + } fflush(stdout); } }; @@ -899,6 +1041,7 @@ class Benchmark { int64_t writes_; int64_t readwrites_; int64_t merge_keys_; + bool report_file_operations_; bool SanityCheck() { if (FLAGS_compression_ratio > 1) { @@ -1118,7 +1261,18 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), - merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) { + merge_keys_(FLAGS_merge_keys < 0 ? 
FLAGS_num : FLAGS_merge_keys), + report_file_operations_(FLAGS_report_file_operations) { + if (report_file_operations_) { + if (!FLAGS_hdfs.empty()) { + fprintf(stderr, + "--hdfs and --report_file_operations cannot be enabled " + "at the same time"); + exit(1); + } + FLAGS_env = new ReportFileOpEnv(rocksdb::Env::Default()); + } + if (FLAGS_prefix_size > FLAGS_key_size) { fprintf(stderr, "prefix size is larger than key size"); exit(1); From 59d54979899386ebe21331409fee297b1ce1d989 Mon Sep 17 00:00:00 2001 From: Eugene Su Date: Thu, 6 Nov 2014 16:25:53 +0800 Subject: [PATCH 417/829] suppress JDK8 errors for #385 --- java/org/rocksdb/AbstractComparator.java | 8 ++-- java/org/rocksdb/AbstractSlice.java | 13 +++--- java/org/rocksdb/BloomFilter.java | 2 +- .../rocksdb/ColumnFamilyOptionsInterface.java | 44 +++++++++---------- java/org/rocksdb/DBOptionsInterface.java | 2 + java/org/rocksdb/PlainTableConfig.java | 2 +- java/org/rocksdb/RocksDB.java | 15 +++---- java/org/rocksdb/RocksIterator.java | 8 ++-- java/org/rocksdb/RocksObject.java | 8 ++-- java/org/rocksdb/Slice.java | 4 +- java/org/rocksdb/WriteBatch.java | 8 ++-- .../rocksdb/test/AbstractComparatorTest.java | 2 +- .../rocksdb/test/PlatformRandomHelper.java | 2 +- 13 files changed, 58 insertions(+), 60 deletions(-) diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java index 8de50e271..5302f43b3 100644 --- a/java/org/rocksdb/AbstractComparator.java +++ b/java/org/rocksdb/AbstractComparator.java @@ -39,9 +39,9 @@ public abstract class AbstractComparator * @param b Slice access to second key * * @return Should return either: - * 1) < 0 if "a" < "b" + * 1) < 0 if "a" < "b" * 2) == 0 if "a" == "b" - * 3) > 0 if "a" > "b" + * 3) > 0 if "a" > "b" */ public abstract int compare(final T a, final T b); @@ -49,7 +49,7 @@ public abstract class AbstractComparator * Used to reduce the space requirements * for internal data structures like index blocks. * - * If start < limit, you may return a new start which is a + * If start < limit, you may return a new start which is a * shorter string in [start, limit). * * Simple comparator implementations may return null if they @@ -67,7 +67,7 @@ public abstract class AbstractComparator * for internal data structures like index blocks. * * You may return a new short key (key1) where - * key1 >= key. + * key1 ≥ key. * * Simple comparator implementations may return null if they * wish to leave the key unchanged. i.e., an implementation of diff --git a/java/org/rocksdb/AbstractSlice.java b/java/org/rocksdb/AbstractSlice.java index 971bd7c1a..2b0d80c6f 100644 --- a/java/org/rocksdb/AbstractSlice.java +++ b/java/org/rocksdb/AbstractSlice.java @@ -19,10 +19,10 @@ package org.rocksdb; * instance of a C++ BaseComparatorJniCallback subclass and * passes that to RocksDB as the comparator. That subclass of * BaseComparatorJniCallback creates the Java - * {@see org.rocksdb.AbstractSlice} subclass Objects. When you dispose - * the Java {@see org.rocksdb.AbstractComparator} subclass, it disposes the + * @see org.rocksdb.AbstractSlice subclass Objects. When you dispose + * the Java @see org.rocksdb.AbstractComparator subclass, it disposes the * C++ BaseComparatorJniCallback subclass, which in turn destroys the - * Java {@see org.rocksdb.AbstractSlice} subclass Objects. + * Java @see org.rocksdb.AbstractSlice subclass Objects. */ abstract class AbstractSlice extends RocksObject { @@ -31,7 +31,7 @@ abstract class AbstractSlice extends RocksObject { * * @return The slice data. 
Note, the type of access is * determined by the subclass - * @see org.rocksdb.AbstractSlice#data0(long). + * @see org.rocksdb.AbstractSlice#data0(long) */ public T data() { assert (isInitialized()); @@ -95,9 +95,9 @@ abstract class AbstractSlice extends RocksObject { * @param other A slice to compare against * * @return Should return either: - * 1) < 0 if this < other + * 1) < 0 if this < other * 2) == 0 if this == other - * 3) > 0 if this > other + * 3) > 0 if this > other */ public int compare(final AbstractSlice other) { assert (other != null); @@ -145,7 +145,6 @@ abstract class AbstractSlice extends RocksObject { /** * Deletes underlying C++ slice pointer. - *

                * Note that this function should be called only after all * RocksDB instances referencing the slice are closed. * Otherwise an undefined behavior will occur. diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java index 6772d2f54..dd2a511dd 100644 --- a/java/org/rocksdb/BloomFilter.java +++ b/java/org/rocksdb/BloomFilter.java @@ -60,7 +60,7 @@ public class BloomFilter extends Filter { * bits_per_key: bits per key in bloom filter. A good value for bits_per_key * is 10, which yields a filter with ~ 1% false positive rate. *

                default bits_per_key: 10

                - *

                + * *

                use_block_based_builder: use block based filter rather than full filter. * If you want to builder full filter, it needs to be set to false. *

                diff --git a/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/org/rocksdb/ColumnFamilyOptionsInterface.java index 827fe8c64..fb04c249a 100644 --- a/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -266,7 +266,7 @@ public interface ColumnFamilyOptionsInterface { int numLevels(); /** - * Number of files to trigger level-0 compaction. A value < 0 means that + * Number of files to trigger level-0 compaction. A value < 0 means that * level-0 compaction will not be triggered by number of files at all. * Default: 4 * @@ -278,7 +278,7 @@ public interface ColumnFamilyOptionsInterface { /** * The number of files in level 0 to trigger compaction from level-0 to - * level-1. A value < 0 means that level-0 compaction will not be + * level-1. A value < 0 means that level-0 compaction will not be * triggered by number of files at all. * Default: 4 * @@ -288,7 +288,7 @@ public interface ColumnFamilyOptionsInterface { /** * Soft limit on number of level-0 files. We start slowing down writes at this - * point. A value < 0 means that no writing slow down will be triggered by + * point. A value < 0 means that no writing slow down will be triggered by * number of files in level-0. * * @param numFiles soft limit on number of level-0 files. @@ -299,7 +299,7 @@ public interface ColumnFamilyOptionsInterface { /** * Soft limit on the number of level-0 files. We start slowing down writes - * at this point. A value < 0 means that no writing slow down will be + * at this point. A value < 0 means that no writing slow down will be * triggered by number of files in level-0. * * @return the soft limit on the number of level-0 files. @@ -324,7 +324,7 @@ public interface ColumnFamilyOptionsInterface { /** * The highest level to which a new compacted memtable is pushed if it * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some + * relatively expensive level 0≥1 compactions and to avoid some * expensive manifest file operations. We do not push all the way to * the largest level since that can generate a lot of wasted disk * space if the same key space is being repeatedly overwritten. @@ -339,7 +339,7 @@ public interface ColumnFamilyOptionsInterface { /** * The highest level to which a new compacted memtable is pushed if it * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some + * relatively expensive level 0≥1 compactions and to avoid some * expensive manifest file operations. We do not push all the way to * the largest level since that can generate a lot of wasted disk * space if the same key space is being repeatedly overwritten. @@ -515,7 +515,7 @@ public interface ColumnFamilyOptionsInterface { /** * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. + * stop building a single file in a level->level+1 compaction. * * @param maxGrandparentOverlapFactor maximum bytes of overlaps in * "grandparent" level. @@ -526,7 +526,7 @@ public interface ColumnFamilyOptionsInterface { /** * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. + * stop building a single file in a level->level+1 compaction. * * @return maximum bytes of overlaps in "grandparent" level. 
*/ @@ -535,7 +535,7 @@ public interface ColumnFamilyOptionsInterface { /** * Puts are delayed 0-1 ms when any level has a compaction score that exceeds * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not * hold, RocksDB will set soft_rate_limit = hard_rate_limit * Default: 0 (disabled) * @@ -548,7 +548,7 @@ public interface ColumnFamilyOptionsInterface { /** * Puts are delayed 0-1 ms when any level has a compaction score that exceeds * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not + * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not * hold, RocksDB will set soft_rate_limit = hard_rate_limit * Default: 0 (disabled) * @@ -558,7 +558,7 @@ public interface ColumnFamilyOptionsInterface { /** * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. + * exceeds hard_rate_limit. This is ignored when ≤ 1.0. * Default: 0 (disabled) * * @param hardRateLimit the hard-rate-limit of a compaction score for put @@ -569,7 +569,7 @@ public interface ColumnFamilyOptionsInterface { /** * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. + * exceeds hard_rate_limit. This is ignored when ≤ 1.0. * Default: 0 (disabled) * * @return the hard-rate-limit of a compaction score for put delay. @@ -600,11 +600,11 @@ public interface ColumnFamilyOptionsInterface { /** * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of + * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). * * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and + * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). * @@ -621,11 +621,11 @@ public interface ColumnFamilyOptionsInterface { /** * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of + * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). * * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and + * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). * @@ -734,7 +734,7 @@ public interface ColumnFamilyOptionsInterface { boolean filterDeletes(); /** - * An iteration->Next() sequentially skips over keys with the same + * An iteration->Next() sequentially skips over keys with the same * user-key unless this option is set. This number specifies the number * of keys (with the same userkey) that will be sequentially * skipped before a reseek is issued. @@ -747,7 +747,7 @@ public interface ColumnFamilyOptionsInterface { Object setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations); /** - * An iteration->Next() sequentially skips over keys with the same + * An iteration->Next() sequentially skips over keys with the same * user-key unless this option is set. 
This number specifies the number * of keys (with the same userkey) that will be sequentially * skipped before a reseek is issued. @@ -794,7 +794,7 @@ public interface ColumnFamilyOptionsInterface { * If inplace_callback function is not set, * Put(key, new_value) will update inplace the existing_value iff * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) + * * new sizeof(new_value) ≤ sizeof(existing_value) * * existing_value for that key is a put i.e. kTypeValue * If inplace_callback function is set, check doc for inplace_callback. * Default: false. @@ -810,7 +810,7 @@ public interface ColumnFamilyOptionsInterface { * If inplace_callback function is not set, * Put(key, new_value) will update inplace the existing_value iff * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) + * * new sizeof(new_value) ≤ sizeof(existing_value) * * existing_value for that key is a put i.e. kTypeValue * If inplace_callback function is set, check doc for inplace_callback. * Default: false. @@ -945,7 +945,7 @@ public interface ColumnFamilyOptionsInterface { * merge will be performed. Partial merge will not be called * if the list of values to merge is less than min_partial_merge_operands. * - * If min_partial_merge_operands < 2, then it will be treated as 2. + * If min_partial_merge_operands < 2, then it will be treated as 2. * * Default: 2 * @@ -959,7 +959,7 @@ public interface ColumnFamilyOptionsInterface { * merge will be performed. Partial merge will not be called * if the list of values to merge is less than min_partial_merge_operands. * - * If min_partial_merge_operands < 2, then it will be treated as 2. + * If min_partial_merge_operands < 2, then it will be treated as 2. * * Default: 2 * diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java index 35c65eed2..ca65a6146 100644 --- a/java/org/rocksdb/DBOptionsInterface.java +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -502,6 +502,7 @@ public interface DBOptionsInterface { * are older than WAL_ttl_seconds will be deleted. *
              9. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
              10. + *
              * * @param walTtlSeconds the ttl seconds * @return the instance of the current Object. @@ -546,6 +547,7 @@ public interface DBOptionsInterface { * are older than WAL_ttl_seconds will be deleted. *
            9. If both are not 0, WAL files will be checked every 10 min and both * checks will be performed with ttl being first.
            10. + *
            * * @param sizeLimitMB size limit in mega-bytes. * @return the instance of the current Object. diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java index 71d75f72c..7f0d672ef 100644 --- a/java/org/rocksdb/PlainTableConfig.java +++ b/java/org/rocksdb/PlainTableConfig.java @@ -123,7 +123,7 @@ public class PlainTableConfig extends TableFormatConfig { } /** - *

            huge_page_tlb_size: if <=0, allocate hash indexes and blooms + *

            huge_page_tlb_size: if ≤0, allocate hash indexes and blooms * from malloc otherwise from huge page TLB.

            * *

            The user needs to reserve huge pages for it to be allocated, diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index f536765f8..40680e438 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -328,7 +328,7 @@ public class RocksDB extends RocksObject { * * @param options Options for opening the database * @param path Absolute path to rocksdb database - * @return List List containing the column family names + * @return List<byte[]> List containing the column family names * * @throws RocksDBException */ @@ -462,7 +462,6 @@ public class RocksDB extends RocksObject { * to make this lighter weight is to avoid doing any IOs. * * @param readOptions {@link ReadOptions} instance - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key byte array of a key to search for * @param value StringBuffer instance which is a out parameter if a value is * found in block-cache. @@ -922,13 +921,13 @@ public class RocksDB extends RocksObject { * *

            Valid property names include: *

              - *
            • "rocksdb.num-files-at-level" - return the number of files at level , - * where is an ASCII representation of a level number (e.g. "0").
            • + *
            • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, + * where <N> is an ASCII representation of a level number (e.g. "0").
            • *
            • "rocksdb.stats" - returns a multi-line string that describes statistics * about the internal operation of the DB.
            • *
            • "rocksdb.sstables" - returns a multi-line string that describes all * of the sstables that make up the db contents.
            • - *

            + * * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance @@ -951,13 +950,13 @@ public class RocksDB extends RocksObject { * *

            Valid property names include: *

              - *
            • "rocksdb.num-files-at-level" - return the number of files at level , - * where is an ASCII representation of a level number (e.g. "0").
            • + *
            • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, + * where <N> is an ASCII representation of a level number (e.g. "0").
            • *
            • "rocksdb.stats" - returns a multi-line string that describes statistics * about the internal operation of the DB.
            • *
            • "rocksdb.sstables" - returns a multi-line string that describes all * of the sstables that make up the db contents.
            • - *

            + * * * @param property to be fetched. See above for examples * @return property value diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index 12377b6df..acfdd3b8c 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -63,7 +63,7 @@ public class RocksIterator extends RocksObject { *

            Moves to the next entry in the source. After this call, Valid() is * true iff the iterator was not positioned at the last entry in the source.

            * - *

            REQUIRES: {@link #isValid()}

            + *

            REQUIRES: {@link #isValid()}

            */ public void next() { assert(isInitialized()); @@ -74,7 +74,7 @@ public class RocksIterator extends RocksObject { *

            Moves to the previous entry in the source. After this call, Valid() is * true iff the iterator was not positioned at the first entry in source.

            * - *

            REQUIRES: {@link #isValid()}

            + *

            REQUIRES: {@link #isValid()}

            */ public void prev() { assert(isInitialized()); @@ -86,7 +86,7 @@ public class RocksIterator extends RocksObject { * the returned slice is valid only until the next modification of * the iterator.

            * - *

            REQUIRES: {@link #isValid()}

            + *

            REQUIRES: {@link #isValid()}

            * * @return key for the current entry. */ @@ -100,7 +100,7 @@ public class RocksIterator extends RocksObject { * the returned slice is valid only until the next modification of * the iterator.

            * - *

            REQUIRES: !AtEnd() && !AtStart()

            + *

            REQUIRES: !AtEnd() && !AtStart()

            * @return value for the current entry. */ public byte[] value() { diff --git a/java/org/rocksdb/RocksObject.java b/java/org/rocksdb/RocksObject.java index 828bb4f3c..ff5842139 100644 --- a/java/org/rocksdb/RocksObject.java +++ b/java/org/rocksdb/RocksObject.java @@ -11,14 +11,12 @@ package org.rocksdb; * *

            * RocksObject has {@code dispose()} function, which releases its associated c++ - * resource. - *

            - *

            + * resource.

            + *

            * This function can be either called manually, or being called automatically * during the regular Java GC process. However, since Java may wrongly assume a * RocksObject only contains a long member variable and think it is small in size, - *

            - *

            Java may give {@code RocksObject} low priority in the GC process. For this, it is + * Java may give {@code RocksObject} low priority in the GC process. For this, it is * suggested to call {@code dispose()} manually. However, it is safe to let * {@code RocksObject} go out-of-scope without manually calling {@code dispose()} * as {@code dispose()} will be called in the finalizer during the diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java index 4449cb7b8..fe5d8d49d 100644 --- a/java/org/rocksdb/Slice.java +++ b/java/org/rocksdb/Slice.java @@ -66,10 +66,10 @@ public class Slice extends AbstractSlice { * Deletes underlying C++ slice pointer * and any buffered data. * - *

            + *

            * Note that this function should be called only after all * RocksDB instances referencing the slice are closed. - * Otherwise an undefined behavior will occur. + * Otherwise an undefined behavior will occur.

            */ @Override protected void disposeInternal() { diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 0a16d5104..118695512 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -41,14 +41,14 @@ public class WriteBatch extends RocksObject { public native int count(); /** - * Store the mapping "key->value" in the database. + * Store the mapping "key->value" in the database. */ public void put(byte[] key, byte[] value) { put(key, key.length, value, value.length); } /** - * Store the mapping "key->value" within given column + * Store the mapping "key->value" within given column * family. */ public void put(ColumnFamilyHandle columnFamilyHandle, @@ -59,7 +59,7 @@ public class WriteBatch extends RocksObject { /** * Merge "value" with the existing value of "key" in the database. - * "key->merge(existing, value)" + * "key->merge(existing, value)" */ public void merge(byte[] key, byte[] value) { merge(key, key.length, value, value.length); @@ -67,7 +67,7 @@ public class WriteBatch extends RocksObject { /** * Merge "value" with the existing value of "key" in given column family. - * "key->merge(existing, value)" + * "key->merge(existing, value)" */ public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index dfdb3cad9..7f4c47fb3 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -104,7 +104,7 @@ public abstract class AbstractComparatorTest { * @param a 4-bytes representing an integer key * @param b 4-bytes representing an integer key * - * @return negative if a < b, 0 if a == b, positive otherwise + * @return negative if a < b, 0 if a == b, positive otherwise */ protected final int compareIntKeys(final byte[] a, final byte[] b) { diff --git a/java/org/rocksdb/test/PlatformRandomHelper.java b/java/org/rocksdb/test/PlatformRandomHelper.java index b0ef8d8a6..c729c3dc1 100644 --- a/java/org/rocksdb/test/PlatformRandomHelper.java +++ b/java/org/rocksdb/test/PlatformRandomHelper.java @@ -38,7 +38,7 @@ public class PlatformRandomHelper { /** * Random32Bit is a class which overrides {@code nextLong} to * provide random numbers which fit in size_t. This workaround - * is necessary because there is no unsigned_int < Java 8 + * is necessary because there is no unsigned_int < Java 8 */ private static class Random32Bit extends Random { @Override From ac95ae1b5d9b58606160dc856d04ed7bb5c06ac5 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 5 Nov 2014 18:07:22 -0800 Subject: [PATCH 418/829] Make sure WAL is synced for DB::Write() if write batch is empty Summary: This patch makes it a contract that if an empty write batch is passed to DB::Write() and WriteOptions.sync = true, fsync is called to WAL. 
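In application terms, this contract lets an empty batch act as an explicit WAL sync barrier for earlier unsynced writes. A minimal usage sketch, not part of this patch (the path and the error handling are illustrative only):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/write_batch.h"

    int main() {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/rocksdb_sync_barrier_example", &db);
      assert(s.ok());

      // Regular write: recorded in the WAL but not necessarily fsynced yet.
      s = db->Put(rocksdb::WriteOptions(), "foo", "bar");
      assert(s.ok());

      // Under the new contract, writing an empty batch with sync=true still
      // fsyncs the WAL, so it can serve as a durability barrier for the Put above.
      rocksdb::WriteOptions sync_opts;
      sync_opts.sync = true;
      rocksdb::WriteBatch empty_batch;
      s = db->Write(sync_opts, &empty_batch);
      assert(s.ok());

      delete db;
      return 0;
    }
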
Test Plan: A new unit test Reviewers: ljin, rven, yhchiang, igor Reviewed By: igor Subscribers: dhruba, MarkCallaghan, leveldb Differential Revision: https://reviews.facebook.net/D28365 --- db/db_test.cc | 21 +++++++++++++++++++++ include/rocksdb/db.h | 2 ++ 2 files changed, 23 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index 7aea863f8..b81b7c08e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1270,6 +1270,27 @@ TEST(DBTest, Empty) { } while (ChangeOptions()); } +TEST(DBTest, WriteEmptyBatch) { + Options options; + options.env = env_; + options.write_buffer_size = 100000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + env_->sync_counter_.store(0); + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch empty_batch; + ASSERT_OK(dbfull()->Write(wo, &empty_batch)); + ASSERT_GE(env_->sync_counter_.load(), 1); + + // make sure we can re-open it. + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ("bar", Get(1, "foo")); +} + TEST(DBTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 21fa43838..65b517f54 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -195,6 +195,8 @@ class DB { } // Apply the specified updates to the database. + // If `updates` contains no update, WAL will still be synced if + // options.sync=true. // Returns OK on success, non-OK on failure. // Note: consider setting options.sync = true. virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; From 367a3f9cb4ff1b7d5c23f2fc4b37e8ef55b4971d Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 6 Nov 2014 10:14:47 -0800 Subject: [PATCH 419/829] Improve DBTest.GroupCommitTest: artificially slowdown log writing to trigger group commit Summary: In order to avoid random failure of DBTest.GroupCommitTest, artificially sleep 100 microseconds in each log writing. Test Plan: Run the test in a machine where valgrind version of the test always fails multiple times and see it always succeed. Reviewers: igor, yhchiang, rven, ljin Reviewed By: ljin Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28401 --- db/db_test.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index b81b7c08e..8352975f0 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -144,6 +144,9 @@ class SpecialEnv : public EnvWrapper { // Force write to log files to fail while this pointer is non-nullptr std::atomic log_write_error_; + // Slow down every log write, in micro-seconds. 
+ std::atomic log_write_slowdown_; + bool count_random_reads_; anon::AtomicCounter random_read_counter_; @@ -172,6 +175,7 @@ class SpecialEnv : public EnvWrapper { manifest_sync_error_.store(false, std::memory_order_release); manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); + log_write_slowdown_ = 0; bytes_written_ = 0; sync_counter_ = 0; non_writeable_rate_ = 0; @@ -254,6 +258,11 @@ class SpecialEnv : public EnvWrapper { if (env_->log_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); } else { + int slowdown = + env_->log_write_slowdown_.load(std::memory_order_acquire); + if (slowdown > 0) { + env_->SleepForMicroseconds(slowdown); + } return base_->Append(data); } } @@ -7060,6 +7069,8 @@ static void GCThreadBody(void* arg) { TEST(DBTest, GroupCommitTest) { do { Options options = CurrentOptions(); + options.env = env_; + env_->log_write_slowdown_.store(100); options.statistics = rocksdb::CreateDBStatistics(); Reopen(options); @@ -7077,6 +7088,8 @@ TEST(DBTest, GroupCommitTest) { env_->SleepForMicroseconds(100000); } } + env_->log_write_slowdown_.store(0); + ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); std::vector expected_db; From c02338a698569c556a8019240b3740aa30d61cdd Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 6 Nov 2014 11:02:30 -0800 Subject: [PATCH 420/829] update HISOTRY.md for new release Summary: as title Test Plan: n/a --- HISTORY.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 9a7cd7810..4182e1dd5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,8 @@ ## Unreleased + +## 3.7.0 (11/6/2014) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() From 9f20395cd618ed41eb6afaba9aa380c209bb4c5d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 6 Nov 2014 11:14:28 -0800 Subject: [PATCH 421/829] Turn -Wshadow back on Summary: It turns out that -Wshadow has different rules for gcc than clang. Previous commit fixed clang. This commits fixes the rest of the warnings for gcc. 
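Most of the diff below is mechanical renaming: gcc's -Wshadow also fires when a constructor parameter or local variable hides a class member (including an accessor function), which is the main place its rules diverge from clang's here. A hypothetical illustration of the pattern being silenced (the class and names are made up for the example):

    class HandleLike {
     public:
      // gcc -Wshadow warns on this parameter: its name hides the cfd() accessor below.
      explicit HandleLike(int* cfd) : cfd_(cfd) {}
      int* cfd() const { return cfd_; }

     private:
      int* cfd_;
    };

    // The fix used throughout this patch is simply to rename the parameter:
    //   explicit HandleLike(int* column_family_data) : cfd_(column_family_data) {}
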
Test Plan: compiles Reviewers: ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28131 --- Makefile | 2 +- db/column_family.cc | 21 +++--- db/column_family.h | 4 +- db/compaction.cc | 19 ++--- db/db_iter.cc | 12 ++-- db/db_iter_test.cc | 19 ++--- db/db_test.cc | 8 +-- db/dbformat.cc | 6 +- db/dbformat.h | 6 +- db/file_indexer_test.cc | 3 - db/log_reader.cc | 7 +- db/memtable_list.cc | 6 +- db/merge_helper.cc | 21 +++--- db/skiplist_test.cc | 4 +- db/transaction_log_impl.cc | 20 +++--- db/transaction_log_impl.h | 27 ++++--- db/version_set.cc | 72 +++++++++---------- helpers/memenv/memenv.cc | 7 +- include/rocksdb/status.h | 4 +- table/cuckoo_table_reader_test.cc | 6 +- table/format.cc | 14 ++-- table/format.h | 10 ++- table/iterator_wrapper.h | 8 +-- table/merger.cc | 8 +-- table/table_test.cc | 45 +++++------- util/autovector_test.cc | 5 +- util/mock_env.cc | 5 +- util/status.cc | 5 +- utilities/document/json_document.cc | 16 ++--- .../write_batch_with_index.cc | 6 +- 30 files changed, 192 insertions(+), 204 deletions(-) diff --git a/Makefile b/Makefile index d06d6f2a1..c2d206e91 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual diff --git a/db/column_family.cc b/db/column_family.cc index b2670cbdb..eba3c74dd 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -62,9 +62,9 @@ uint64_t SlowdownAmount(int n, double bottom, double top) { } } // namespace -ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, - DBImpl* db, port::Mutex* mutex) - : cfd_(cfd), db_(db), mutex_(mutex) { +ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( + ColumnFamilyData* column_family_data, DBImpl* db, port::Mutex* mutex) + : cfd_(column_family_data), db_(db), mutex_(mutex) { if (cfd_ != nullptr) { cfd_->Ref(); } @@ -217,14 +217,15 @@ void SuperVersionUnrefHandle(void* ptr) { } // anonymous namespace ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, - Version* dummy_versions, Cache* table_cache, + Version* _dummy_versions, + Cache* _table_cache, const ColumnFamilyOptions& cf_options, const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), - dummy_versions_(dummy_versions), + dummy_versions_(_dummy_versions), current_(nullptr), refs_(0), dropped_(false), @@ -243,11 +244,11 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, column_family_set_(column_family_set) { Ref(); - // if dummy_versions is nullptr, then this is a dummy column family. - if (dummy_versions != nullptr) { + // if _dummy_versions is nullptr, then this is a dummy column family. 
+ if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); if (ioptions_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(ioptions_, &internal_comparator_)); @@ -389,7 +390,9 @@ const EnvOptions* ColumnFamilyData::soptions() const { return &(column_family_set_->env_options_); } -void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; } +void ColumnFamilyData::SetCurrent(Version* current_version) { + current_ = current_version; +} void ColumnFamilyData::CreateNewMemtable( const MutableCFOptions& mutable_cf_options) { diff --git a/db/column_family.h b/db/column_family.h index 013c29615..0be47ee84 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -70,7 +70,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { ColumnFamilyHandleInternal() : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {} - void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; } + void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } private: @@ -178,7 +178,7 @@ class ColumnFamilyData { // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion // You shoul use this API to reference MutableCFOptions most of the time. - const MutableCFOptions* mutable_cf_options() const { + const MutableCFOptions* GetCurrentMutableCFOptions() const { return &(super_version_->mutable_cf_options); } // REQUIRES: DB mutex held diff --git a/db/compaction.cc b/db/compaction.cc index 6c76012db..3f9da1d82 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -29,8 +29,8 @@ uint64_t TotalFileSize(const std::vector& files) { return sum; } -void Compaction::SetInputVersion(Version* input_version) { - input_version_ = input_version; +void Compaction::SetInputVersion(Version* _input_version) { + input_version_ = _input_version; cfd_ = input_version_->cfd(); cfd_->Ref(); @@ -111,10 +111,10 @@ bool Compaction::IsTrivialMove() const { TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_); } -void Compaction::AddInputDeletions(VersionEdit* edit) { +void Compaction::AddInputDeletions(VersionEdit* out_edit) { for (int which = 0; which < num_input_levels(); which++) { for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); + out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); } } } @@ -261,14 +261,15 @@ void Compaction::Summary(char* output, int len) { return; } - for (int level = 0; level < num_input_levels(); ++level) { - if (level > 0) { + for (int level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + if (level_iter > 0) { write += snprintf(output + write, len - write, "], ["); if (write < 0 || write >= len) { return; } } - write += InputSummary(inputs_[level].files, output + write, len - write); + write += + InputSummary(inputs_[level_iter].files, output + write, len - write); if (write < 0 || write >= len) { return; } @@ -284,8 +285,8 @@ uint64_t Compaction::OutputFilePreallocationSize( if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { preallocation_size = mutable_options.MaxFileSizeForLevel(output_level()); } else { - for (int level = 0; level < num_input_levels(); ++level) { - for (const auto& f : 
inputs_[level].files) { + for (int level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + for (const auto& f : inputs_[level_iter].files) { preallocation_size += f->fd.GetFileSize(); } } diff --git a/db/db_iter.cc b/db/db_iter.cc index 2fd4a9e2e..78decd8b1 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -312,8 +312,8 @@ void DBIter::MergeValuesNewToOld() { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! // ignore corruption if there is any. - const Slice value = iter_->value(); - user_merge_operator_->FullMerge(ikey.user_key, &value, operands, + const Slice val = iter_->value(); + user_merge_operator_->FullMerge(ikey.user_key, &val, operands, &saved_value_, logger_); // iter_ is positioned after put iter_->Next(); @@ -323,8 +323,8 @@ void DBIter::MergeValuesNewToOld() { if (kTypeMerge == ikey.type) { // hit a merge, add the value as an operand and run associative merge. // when complete, add result to operands and continue. - const Slice& value = iter_->value(); - operands.push_front(value.ToString()); + const Slice& val = iter_->value(); + operands.push_front(val.ToString()); } } @@ -505,8 +505,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } - const Slice& value = iter_->value(); - user_merge_operator_->FullMerge(saved_key_.GetKey(), &value, operands, + const Slice& val = iter_->value(); + user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands, &saved_value_, logger_); valid_ = true; return true; diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index e6b96c410..a84fd55b7 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -33,20 +33,23 @@ class TestIterator : public Iterator { iter_(0), cmp(comparator) {} - void AddMerge(std::string key, std::string value) { - Add(key, kTypeMerge, value); + void AddMerge(std::string argkey, std::string argvalue) { + Add(argkey, kTypeMerge, argvalue); } - void AddDeletion(std::string key) { Add(key, kTypeDeletion, std::string()); } + void AddDeletion(std::string argkey) { + Add(argkey, kTypeDeletion, std::string()); + } - void AddPut(std::string key, std::string value) { - Add(key, kTypeValue, value); + void AddPut(std::string argkey, std::string argvalue) { + Add(argkey, kTypeValue, argvalue); } - void Add(std::string key, ValueType type, std::string value) { + void Add(std::string argkey, ValueType type, std::string argvalue) { valid_ = true; - ParsedInternalKey internal_key(key, sequence_number_++, type); - data_.push_back(std::pair(std::string(), value)); + ParsedInternalKey internal_key(argkey, sequence_number_++, type); + data_.push_back( + std::pair(std::string(), argvalue)); AppendInternalKey(&data_.back().first, internal_key); } diff --git a/db/db_test.cc b/db/db_test.cc index 8352975f0..d4237b424 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -247,13 +247,13 @@ class SpecialEnv : public EnvWrapper { return base_->GetFileSize(); } }; - class LogFile : public WritableFile { + class WalFile : public WritableFile { private: SpecialEnv* env_; unique_ptr base_; public: - LogFile(SpecialEnv* env, unique_ptr&& b) - : env_(env), base_(std::move(b)) { } + WalFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) { if (env_->log_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); @@ -296,7 +296,7 @@ class SpecialEnv : public EnvWrapper { } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { r->reset(new ManifestFile(this, 
std::move(*r))); } else if (strstr(f.c_str(), "log") != nullptr) { - r->reset(new LogFile(this, std::move(*r))); + r->reset(new WalFile(this, std::move(*r))); } } return s; diff --git a/db/dbformat.cc b/db/dbformat.cc index baeb86802..4c8908fd7 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -127,8 +127,8 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { - size_t usize = user_key.size(); +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { + size_t usize = _user_key.size(); size_t needed = usize + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { @@ -139,7 +139,7 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { start_ = dst; dst = EncodeVarint32(dst, usize + 8); kstart_ = dst; - memcpy(dst, user_key.data(), usize); + memcpy(dst, _user_key.data(), usize); dst += usize; EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; diff --git a/db/dbformat.h b/db/dbformat.h index 516a4693b..5a6928e49 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -132,8 +132,8 @@ class InternalKey { std::string rep_; public: InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t)); } bool Valid() const { @@ -201,7 +201,7 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. - LookupKey(const Slice& user_key, SequenceNumber sequence); + LookupKey(const Slice& _user_key, SequenceNumber sequence); ~LookupKey(); diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 673d85a5c..41afe8475 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -94,7 +94,6 @@ TEST(FileIndexerTest, Empty) { // Case 1: no overlap, files are on the left of next level files TEST(FileIndexerTest, no_overlap_left) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); // level 1 AddFile(1, 100, 200); @@ -135,7 +134,6 @@ TEST(FileIndexerTest, no_overlap_left) { // Case 2: no overlap, files are on the right of next level files TEST(FileIndexerTest, no_overlap_right) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); // level 1 AddFile(1, 2100, 2200); @@ -178,7 +176,6 @@ TEST(FileIndexerTest, no_overlap_right) { // Case 3: empty L2 TEST(FileIndexerTest, empty_L2) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); for (uint32_t i = 1; i < kNumLevels; ++i) { ASSERT_EQ(0U, indexer->LevelIndexSize(i)); diff --git a/db/log_reader.cc b/db/log_reader.cc index be1fb8ceb..21d876de9 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -20,9 +20,9 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& file, Reporter* reporter, +Reader::Reader(unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t initial_offset) - : file_(std::move(file)), + : file_(std::move(_file)), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), @@ -32,8 +32,7 @@ Reader::Reader(unique_ptr&& file, Reporter* reporter, eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset) { -} + initial_offset_(initial_offset) {} Reader::~Reader() { delete[] backing_store_; diff 
--git a/db/memtable_list.cc b/db/memtable_list.cc index 3c74e073c..0066a68ba 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -268,11 +268,11 @@ void MemTableList::Add(MemTable* m) { // Returns an estimate of the number of bytes of data in use. size_t MemTableList::ApproximateMemoryUsage() { - size_t size = 0; + size_t total_size = 0; for (auto& memtable : current_->memlist_) { - size += memtable->ApproximateMemoryUsage(); + total_size += memtable->ApproximateMemoryUsage(); } - return size; + return total_size; } void MemTableList::InstallNewVersion() { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 7bde824ab..11b5d8f47 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -85,9 +85,10 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = + keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); swap(operands_.back(), merge_result); } else { @@ -108,17 +109,17 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, // => store result in operands_.back() (and update keys_.back()) // => change the entry type to kTypeValue for keys_.back() // We are done! Success! - const Slice value = iter->value(); - success_ = user_merge_operator_->FullMerge(ikey.user_key, &value, - operands_, &merge_result, - logger_); + const Slice val = iter->value(); + success_ = user_merge_operator_->FullMerge(ikey.user_key, &val, operands_, + &merge_result, logger_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = + keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); swap(operands_.back(), merge_result); } else { @@ -177,9 +178,9 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, logger_); if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); // The final value() is always stored in operands_.back() diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index 010616cc0..fe6f68ec9 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -220,8 +220,8 @@ class ConcurrentTest { void WriteStep(Random* rnd) { const uint32_t k = rnd->Next() % K; const int g = current_.Get(k) + 1; - const Key key = MakeKey(k, g); - list_.Insert(key); + const Key new_key = MakeKey(k, g); + list_.Insert(new_key); current_.Set(k, g); } diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 6fc9fbaae..b0bf6e4e9 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -48,14 +48,14 @@ Status TransactionLogIteratorImpl::OpenLogFile( return env->NewSequentialFile(fname, file, soptions_); } else { std::string fname = 
LogFileName(dir_, logFile->LogNumber()); - Status status = env->NewSequentialFile(fname, file, soptions_); - if (!status.ok()) { + Status s = env->NewSequentialFile(fname, file, soptions_); + if (!s.ok()) { // If cannot open file in DB directory. // Try the archive dir, as it could have moved in the meanwhile. fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - status = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, file, soptions_); } - return status; + return s; } } @@ -182,10 +182,10 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { // Open the next file if (currentFileIndex_ < files_->size() - 1) { ++currentFileIndex_; - Status status =OpenLogReader(files_->at(currentFileIndex_).get()); - if (!status.ok()) { + Status s = OpenLogReader(files_->at(currentFileIndex_).get()); + if (!s.ok()) { isValid_ = false; - currentStatus_ = status; + currentStatus_ = s; return; } } else { @@ -252,9 +252,9 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { unique_ptr file; - Status status = OpenLogFile(logFile, &file); - if (!status.ok()) { - return status; + Status s = OpenLogFile(logFile, &file); + if (!s.ok()) { + return s; } assert(file); currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index a0b7c9d3c..f0e572a5b 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -17,19 +17,6 @@ namespace rocksdb { -struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - virtual void Corruption(size_t bytes, const Status& s) { - Log(InfoLogLevel::ERROR_LEVEL, info_log, - "dropping %zu bytes; %s", bytes, s.ToString().c_str()); - } - virtual void Info(const char* s) { - Log(InfoLogLevel::INFO_LEVEL, - info_log, "%s", s); - } -}; - class LogFileImpl : public LogFile { public: LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, @@ -97,7 +84,19 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { std::unique_ptr currentBatch_; unique_ptr currentLogReader_; Status OpenLogFile(const LogFile* logFile, unique_ptr* file); - LogReporter reporter_; + + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) { + Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes, + s.ToString().c_str()); + } + virtual void Info(const char* s) { + Log(InfoLogLevel::INFO_LEVEL, info_log, "%s", s); + } + } reporter_; + SequenceNumber currentBatchSeq_; // sequence number at start of current batch SequenceNumber currentLastSeq_; // last sequence in the current batch // Used only to get latest seq. 
num diff --git a/db/version_set.cc b/db/version_set.cc index 3f7985028..cdca14177 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -651,12 +651,12 @@ void Version::AddIterators(const ReadOptions& read_options, VersionStorageInfo::VersionStorageInfo( const InternalKeyComparator* internal_comparator, - const Comparator* user_comparator, int num_levels, + const Comparator* user_comparator, int levels, CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage) : internal_comparator_(internal_comparator), user_comparator_(user_comparator), // cfd is nullptr if Version is dummy - num_levels_(num_levels), + num_levels_(levels), num_non_empty_levels_(num_levels_), file_indexer_(user_comparator), compaction_style_(compaction_style), @@ -683,22 +683,23 @@ VersionStorageInfo::VersionStorageInfo( } } -Version::Version(ColumnFamilyData* cfd, VersionSet* vset, +Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, uint64_t version_number) - : cfd_(cfd), - info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log), - db_statistics_((cfd == nullptr) ? nullptr : cfd->ioptions()->statistics), - table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), - merge_operator_((cfd == nullptr) ? nullptr - : cfd->ioptions()->merge_operator), - storage_info_((cfd == nullptr) ? nullptr : &cfd->internal_comparator(), - (cfd == nullptr) ? nullptr : cfd->user_comparator(), - cfd == nullptr ? 0 : cfd->NumberLevels(), - cfd == nullptr ? kCompactionStyleLevel - : cfd->ioptions()->compaction_style, - (cfd == nullptr || cfd->current() == nullptr) + : cfd_(column_family_data), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), + db_statistics_((cfd_ == nullptr) ? nullptr + : cfd_->ioptions()->statistics), + table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), + merge_operator_((cfd_ == nullptr) ? nullptr + : cfd_->ioptions()->merge_operator), + storage_info_((cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), + (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), + cfd_ == nullptr ? 0 : cfd_->NumberLevels(), + cfd_ == nullptr ? kCompactionStyleLevel + : cfd_->ioptions()->compaction_style, + (cfd_ == nullptr || cfd_->current() == nullptr) ? 
nullptr - : cfd->current()->storage_info()), + : cfd_->current()->storage_info()), vset_(vset), next_(this), prev_(this), @@ -1445,10 +1446,10 @@ struct VersionSet::ManifestWriter { }; VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& env_options, Cache* table_cache, + const EnvOptions& storage_options, Cache* table_cache, WriteController* write_controller) - : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options, - table_cache, write_controller)), + : column_family_set_(new ColumnFamilySet( + dbname, db_options, storage_options, table_cache, write_controller)), env_(db_options->env), dbname_(dbname), db_options_(db_options), @@ -1459,7 +1460,7 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(env_options), + env_options_(storage_options), env_options_compactions_(env_options_) {} VersionSet::~VersionSet() { @@ -1842,8 +1843,8 @@ Status VersionSet::Recover( if (!s.ok()) { return s; } - uint64_t manifest_file_size; - s = env_->GetFileSize(manifest_filename, &manifest_file_size); + uint64_t current_manifest_file_size; + s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size); if (!s.ok()) { return s; } @@ -1855,7 +1856,7 @@ Status VersionSet::Recover( uint64_t next_file = 0; uint64_t last_sequence = 0; uint64_t log_number = 0; - uint64_t prev_log_number = 0; + uint64_t previous_log_number = 0; uint32_t max_column_family = 0; std::unordered_map builders; @@ -1984,7 +1985,7 @@ Status VersionSet::Recover( } if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; + previous_log_number = edit.prev_log_number_; have_prev_log_number = true; } @@ -2014,12 +2015,12 @@ Status VersionSet::Recover( } if (!have_prev_log_number) { - prev_log_number = 0; + previous_log_number = 0; } column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkFileNumberUsed(prev_log_number); + MarkFileNumberUsed(previous_log_number); MarkFileNumberUsed(log_number); } @@ -2059,10 +2060,10 @@ Status VersionSet::Recover( AppendVersion(cfd, v); } - manifest_file_size_ = manifest_file_size; + manifest_file_size_ = current_manifest_file_size; next_file_number_ = next_file + 1; last_sequence_ = last_sequence; - prev_log_number_ = prev_log_number; + prev_log_number_ = previous_log_number; Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Recovered from manifest file:%s succeeded," @@ -2254,7 +2255,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool have_last_sequence = false; uint64_t next_file = 0; uint64_t last_sequence = 0; - uint64_t prev_log_number = 0; + uint64_t previous_log_number = 0; int count = 0; std::unordered_map comparators; std::unordered_map builders; @@ -2345,7 +2346,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; + previous_log_number = edit.prev_log_number_; have_prev_log_number = true; } @@ -2376,7 +2377,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } if (!have_prev_log_number) { - prev_log_number = 0; + previous_log_number = 0; } } @@ -2409,13 +2410,13 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, next_file_number_ = next_file + 1; last_sequence_ = last_sequence; - prev_log_number_ = prev_log_number; + prev_log_number_ = previous_log_number; printf( "next_file_number %lu last_sequence " "%lu 
prev_log_number %lu max_column_family %u\n", (unsigned long)next_file_number_, (unsigned long)last_sequence, - (unsigned long)prev_log_number, + (unsigned long)previous_log_number, column_family_set_->GetMaxColumnFamily()); } @@ -2491,10 +2492,9 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Opens the mainfest file and reads all records // till it finds the record we are looking for. -bool VersionSet::ManifestContains(uint64_t manifest_file_number, +bool VersionSet::ManifestContains(uint64_t manifest_file_num, const std::string& record) const { - std::string fname = - DescriptorFileName(dbname_, manifest_file_number); + std::string fname = DescriptorFileName(dbname_, manifest_file_num); Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc index 185e7d822..b6499f4e0 100644 --- a/helpers/memenv/memenv.cc +++ b/helpers/memenv/memenv.cc @@ -351,15 +351,14 @@ class InMemoryEnv : public EnvWrapper { return Status::NotSupported("getFileMTime", "Not supported in MemEnv"); } - virtual Status RenameFile(const std::string& src, - const std::string& target) { + virtual Status RenameFile(const std::string& src, const std::string& dest) { MutexLock lock(&mutex_); if (file_map_.find(src) == file_map_.end()) { return Status::IOError(src, "File not found"); } - DeleteFileInternal(target); - file_map_[target] = file_map_[src]; + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; file_map_.erase(src); return Status::OK(); } diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index d13ff9d81..4be30c1f4 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -130,8 +130,8 @@ class Status { Code code_; const char* state_; - explicit Status(Code code) : code_(code), state_(nullptr) { } - Status(Code code, const Slice& msg, const Slice& msg2); + explicit Status(Code _code) : code_(_code), state_(nullptr) {} + Status(Code _code, const Slice& msg, const Slice& msg2); static const char* CopyState(const char* s); }; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 7bd18f536..d1c52722a 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -517,11 +517,11 @@ TEST(CuckooReaderTest, TestReadPerformance) { fprintf(stdout, "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); #endif - std::vector keys; for (uint64_t num : nums) { if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) { - GetKeys(num, &keys); - WriteFile(keys, num, hash_ratio); + std::vector all_keys; + GetKeys(num, &all_keys); + WriteFile(all_keys, num, hash_ratio); } ReadKeys(num, 0); ReadKeys(num, 10); diff --git a/table/format.cc b/table/format.cc index 768e00165..d64bb3eac 100644 --- a/table/format.cc +++ b/table/format.cc @@ -107,11 +107,11 @@ inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { } } // namespace -Footer::Footer(uint64_t table_magic_number) - : version_(IsLegacyFooterFormat(table_magic_number) ? kLegacyFooter - : kFooterVersion), +Footer::Footer(uint64_t _table_magic_number) + : version_(IsLegacyFooterFormat(_table_magic_number) ? 
kLegacyFooter + : kFooterVersion), checksum_(kCRC32c), - table_magic_number_(table_magic_number) {} + table_magic_number_(_table_magic_number) {} Status Footer::DecodeFrom(Slice* input) { assert(input != nullptr); @@ -160,11 +160,11 @@ Status Footer::DecodeFrom(Slice* input) { } else { input->remove_prefix(input->size() - kVersion1EncodedLength); } - uint32_t checksum; - if (!GetVarint32(input, &checksum)) { + uint32_t chksum; + if (!GetVarint32(input, &chksum)) { return Status::Corruption("bad checksum type"); } - checksum_ = static_cast(checksum); + checksum_ = static_cast(chksum); } Status result = metaindex_handle_.DecodeFrom(input); diff --git a/table/format.h b/table/format.h index 986164d81..1df32bcf1 100644 --- a/table/format.h +++ b/table/format.h @@ -33,11 +33,11 @@ class BlockHandle { // The offset of the block in the file. uint64_t offset() const { return offset_; } - void set_offset(uint64_t offset) { offset_ = offset; } + void set_offset(uint64_t _offset) { offset_ = _offset; } // The size of the stored block uint64_t size() const { return size_; } - void set_size(uint64_t size) { size_ = size; } + void set_size(uint64_t _size) { size_ = _size; } void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); @@ -200,9 +200,7 @@ inline BlockHandle::BlockHandle() ~static_cast(0)) { } -inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) - : offset_(offset), - size_(size) { -} +inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) + : offset_(_offset), size_(_size) {} } // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index 502cacb3e..d64047bea 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -20,17 +20,15 @@ namespace rocksdb { class IteratorWrapper { public: IteratorWrapper(): iter_(nullptr), valid_(false) { } - explicit IteratorWrapper(Iterator* iter): iter_(nullptr) { - Set(iter); - } + explicit IteratorWrapper(Iterator* _iter) : iter_(nullptr) { Set(_iter); } ~IteratorWrapper() {} Iterator* iter() const { return iter_; } // Takes ownership of "iter" and will delete it when destroyed, or // when Set() is invoked again. 
- void Set(Iterator* iter) { + void Set(Iterator* _iter) { delete iter_; - iter_ = iter; + iter_ = _iter; if (iter_ == nullptr) { valid_ = false; } else { diff --git a/table/merger.cc b/table/merger.cc index a53376ceb..496f847fa 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -240,14 +240,14 @@ class MergingIterator : public Iterator { } virtual Status status() const { - Status status; + Status s; for (auto& child : children_) { - status = child.status(); - if (!status.ok()) { + s = child.status(); + if (!s.ok()) { break; } } - return status; + return s; } private: diff --git a/table/table_test.cc b/table/table_test.cc index 5f34e92eb..a5685f7f6 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -251,15 +251,13 @@ class BlockConstructor: public Constructor { const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete block_; block_ = nullptr; BlockBuilder builder(table_options.block_restart_interval); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); + for (const auto kv : kv_map) { + builder.Add(kv.first, kv.second); } // Open the block data_ = builder.Finish().ToString(); @@ -307,12 +305,12 @@ class KeyConvertingIterator: public Iterator { virtual Slice key() const { assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter_->key(), &parsed_key)) { status_ = Status::Corruption("malformed internal key"); return Slice("corrupted key"); } - return key.user_key; + return parsed_key.user_key; } virtual Slice value() const { return iter_->value(); } @@ -342,7 +340,7 @@ class TableConstructor: public Constructor { const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; @@ -350,16 +348,14 @@ class TableConstructor: public Constructor { ioptions, internal_comparator, sink_.get(), options.compression, CompressionOptions())); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { + for (const auto kv : kv_map) { if (convert_to_internal_key_) { - ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; AppendInternalKey(&encoded, ikey); - builder->Add(encoded, it->second); + builder->Add(encoded, kv.second); } else { - builder->Add(it->first, it->second); + builder->Add(kv.first, kv.second); } ASSERT_TRUE(builder->status().ok()); } @@ -445,11 +441,10 @@ class MemTableConstructor: public Constructor { ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options&, - const ImmutableCFOptions& ioptions, + virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete memtable_->Unref(); Options options; options.memtable_factory = table_factory_; @@ -458,10 +453,8 @@ class MemTableConstructor: public Constructor { MutableCFOptions(options, mem_ioptions)); memtable_->Ref(); int seq = 1; - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - memtable_->Add(seq, kTypeValue, it->first, 
it->second); + for (const auto kv : kv_map) { + memtable_->Add(seq, kTypeValue, kv.first, kv.second); seq++; } return Status::OK(); @@ -497,15 +490,13 @@ class DBConstructor: public Constructor { const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete db_; db_ = nullptr; NewDB(); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { + for (const auto kv : kv_map) { WriteBatch batch; - batch.Put(it->first, it->second); + batch.Put(kv.first, kv.second); ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); } return Status::OK(); diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 25ebaa24b..86cafc893 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -5,6 +5,7 @@ #include #include +#include #include "rocksdb/env.h" #include "util/autovector.h" @@ -48,8 +49,8 @@ TEST(AutoVectorTest, PushBackAndPopBack) { } TEST(AutoVectorTest, EmplaceBack) { - typedef std::pair ValueType; - autovector vec; + typedef std::pair ValType; + autovector vec; for (size_t i = 0; i < 1000 * kSize; ++i) { vec.emplace_back(i, std::to_string(i + 123)); diff --git a/util/mock_env.cc b/util/mock_env.cc index c44592314..bfcfeaa0c 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -539,10 +539,9 @@ Status MockEnv::GetFileModificationTime(const std::string& fname, return Status::OK(); } -Status MockEnv::RenameFile(const std::string& src, - const std::string& target) { +Status MockEnv::RenameFile(const std::string& src, const std::string& dest) { auto s = NormalizePath(src); - auto t = NormalizePath(target); + auto t = NormalizePath(dest); MutexLock lock(&mutex_); if (file_map_.find(s) == file_map_.end()) { return Status::IOError(s, "File not found"); diff --git a/util/status.cc b/util/status.cc index 3165a497d..8eca3a5a8 100644 --- a/util/status.cc +++ b/util/status.cc @@ -21,9 +21,8 @@ const char* Status::CopyState(const char* state) { return result; } -Status::Status(Code code, const Slice& msg, const Slice& msg2) : - code_(code) { - assert(code != kOk); +Status::Status(Code _code, const Slice& msg, const Slice& msg2) : code_(_code) { + assert(code_ != kOk); const uint32_t len1 = msg.size(); const uint32_t len2 = msg2.size(); const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 4368b759d..e5b745573 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -31,9 +31,9 @@ JSONDocument::JSONDocument(const std::string& s) : type_(kString) { JSONDocument::JSONDocument(const char* s) : type_(kString) { new (&data_.s) std::string(s); } -JSONDocument::JSONDocument(Type type) : type_(type) { +JSONDocument::JSONDocument(Type _type) : type_(_type) { // TODO(icanadi) make all of this better by using templates - switch (type) { + switch (type_) { case kNull: break; case kObject: @@ -545,11 +545,11 @@ bool JSONDocument::DeserializeInternal(Slice* input) { } data_.a.resize(size); for (size_t i = 0; i < size; ++i) { - Type type; - if (!GetNextType(input, &type)) { + Type t; + if (!GetNextType(input, &t)) { return false; } - data_.a[i] = new JSONDocument(type); + data_.a[i] = new JSONDocument(t); if (!data_.a[i]->DeserializeInternal(input)) { return false; } @@ -582,10 +582,10 @@ bool JSONDocument::DeserializeInternal(Slice* input) { for (uint32_t i = 0; ok && i < num_elements; ++i) { Slice key; ok = GetLengthPrefixedSlice(input, &key); - Type type; - ok = ok && GetNextType(input, &type); + Type t; + ok = ok && GetNextType(input, &t); if (ok) { - std::unique_ptr value(new JSONDocument(type)); + std::unique_ptr value(new JSONDocument(t)); ok = value->DeserializeInternal(input); if (ok) { data_.o.insert({key.ToString(), value.get()}); diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index ff9d89f2f..761b955a1 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -53,10 +53,10 @@ class BaseDeltaIterator : public Iterator { UpdateCurrent(); } - void Seek(const Slice& key) override { + void Seek(const Slice& k) override { forward_ = true; - base_iterator_->Seek(key); - delta_iterator_->Seek(key); + base_iterator_->Seek(k); + delta_iterator_->Seek(k); UpdateCurrent(); } From 5fd33d26f1a2dd6dfbb7e6c4659f5f4e7c5ba6d2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 6 Nov 2014 12:01:02 -0800 Subject: [PATCH 422/829] Turn off -Wshadow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: So glibc is not -Wshadow-safe, so we need to turn it off :( error: ‘int sigaction(int, const sigaction*, sigaction*)’ hides constructor for ‘struct sigaction’ The rest of the changes in this diff is that we include .h files under rocksdb namespace, which is a no-no. Test Plan: compiles now Reviewers: ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28413 --- Makefile | 2 +- port/stack_trace.cc | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index c2d206e91..d06d6f2a1 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow +WARNING_FLAGS = -Wall -Werror -Wsign-compare CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 1aeb5f7b5..224cac700 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -5,15 +5,16 @@ // #include "port/stack_trace.h" -namespace rocksdb { -namespace port { - #if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) // noop +namespace rocksdb { +namespace port { void InstallStackTraceHandler() {} void PrintStack(int first_frames_to_skip) {} +} // namespace port +} // namespace rocksdb #else @@ -25,6 +26,9 @@ void PrintStack(int first_frames_to_skip) {} #include #include +namespace rocksdb { +namespace port { + namespace { #ifdef OS_LINUX @@ -126,7 +130,7 @@ void InstallStackTraceHandler() { signal(SIGABRT, StackTraceHandler); } -#endif - } // namespace port } // namespace rocksdb + +#endif From 0c2be0de382ce4a65740f203c35b39d2bc041c92 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 6 Nov 2014 13:29:17 -0800 Subject: [PATCH 423/829] Turn on -Wshadow for travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8f1bcb0ae..ce43d7030 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make unity && OPT=-DTRAVIS make check -j8 +script: OPT=-DTRAVIS make unity && make clean && OPT="-DTRAVIS -Wshadow" make check -j8 notifications: email: false From e526b7140212da43d2b5c059cb73366d1f6f1232 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 6 Nov 2014 13:53:02 -0800 Subject: [PATCH 424/829] Make PartialCompactionFailure Test more robust. Summary: Make PartialCompactionFailure Test more robust. Test Plan: export ROCKSDB_TESTS=PartialCompactionFailure ./db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28425 --- db/db_test.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index d4237b424..e1818a8a5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8958,9 +8958,8 @@ TEST(DBTest, PartialCompactionFailure) { } dbfull()->TEST_WaitForFlushMemTable(); - // Make sure the number of L0 files can trigger compaction. - ASSERT_GE(NumTableFilesAtLevel(0), - options.level0_file_num_compaction_trigger); + // Make sure there're some L0 files we can compact + ASSERT_GT(NumTableFilesAtLevel(0), 0); auto previous_num_level0_files = NumTableFilesAtLevel(0); // The number of NewWritableFiles calls required by each operation. @@ -8973,11 +8972,15 @@ TEST(DBTest, PartialCompactionFailure) { // Expect compaction to fail here as one file will fail its // creation. - dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(!db_->CompactRange(nullptr, nullptr).ok()); + // Verify L0 -> L1 compaction does fail. ASSERT_EQ(NumTableFilesAtLevel(1), 0); + // Verify all L0 files are still there. - ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); + // We use GE here as occasionally there might be additional + // memtables being flushed. + ASSERT_GE(NumTableFilesAtLevel(0), previous_num_level0_files); // All key-values must exist after compaction fails. 
for (int k = 0; k < kNumInsertedKeys; ++k) { From c4bf07c24596bca970c72ee5ef7f32dac642401d Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 6 Nov 2014 23:12:36 +0100 Subject: [PATCH 425/829] [RocksJava] -WShadow improvements Minor corrections to resolve -WShadow build problems with RocksJava code. --- java/rocksjni/env.cc | 2 +- java/rocksjni/iterator.cc | 6 +- java/rocksjni/rocksjni.cc | 157 ++++++++++++++++++----------------- java/rocksjni/write_batch.cc | 38 +++++---- 4 files changed, 105 insertions(+), 98 deletions(-) diff --git a/java/rocksjni/env.cc b/java/rocksjni/env.cc index 3aed9f5a0..c6c58e144 100644 --- a/java/rocksjni/env.cc +++ b/java/rocksjni/env.cc @@ -15,7 +15,7 @@ * Signature: ()J */ jlong Java_org_rocksdb_RocksEnv_getDefaultEnvInternal( - JNIEnv* env, jclass jclass) { + JNIEnv* env, jclass jclazz) { return reinterpret_cast(rocksdb::Env::Default()); } diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index 84b0b3133..d17ed8722 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -91,11 +91,11 @@ jbyteArray Java_org_rocksdb_RocksIterator_value0( auto it = reinterpret_cast(handle); rocksdb::Slice value_slice = it->value(); - jbyteArray jvalue = env->NewByteArray(value_slice.size()); + jbyteArray jkeyValue = env->NewByteArray(value_slice.size()); env->SetByteArrayRegion( - jvalue, 0, value_slice.size(), + jkeyValue, 0, value_slice.size(), reinterpret_cast(value_slice.data())); - return jvalue; + return jkeyValue; } /* diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index d1a8bb7be..b17f9bab7 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -108,21 +108,21 @@ jobject env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); } - jobject jcfhandle_list = nullptr; // check if open operation was successful if (s.ok()) { rocksdb::RocksDBJni::setHandle(env, jdb, db); - jclass jclazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jclazz); - jobject jcfhandle_list = env->NewObject(jclazz, mid, handles.size()); + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); // insert in java list for (std::vector::size_type i = 0; i != handles.size(); i++) { // jlong must be converted to Long due to collections restrictions - jclass jclazz = env->FindClass("java/lang/Long"); - jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); - jobject obj = env->NewObject(jclazz, mid, + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, reinterpret_cast(handles[i])); env->CallBooleanMethod(jcfhandle_list, rocksdb::ListJni::getListAddMethodId(env), obj); @@ -131,7 +131,7 @@ jobject return jcfhandle_list; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return jcfhandle_list; + return nullptr; } /* @@ -180,21 +180,21 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); } - jobject jcfhandle_list = nullptr; // check if open operation was successful if (s.ok()) { rocksdb::RocksDBJni::setHandle(env, jdb, db); - jclass jclazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jclazz); - 
jobject jcfhandle_list = env->NewObject(jclazz, mid, handles.size()); + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); // insert in java list for (std::vector::size_type i = 0; i != handles.size(); i++) { // jlong must be converted to Long due to collections restrictions - jclass jclazz = env->FindClass("java/lang/Long"); - jmethodID mid = env->GetMethodID(jclazz, "", "(J)V"); - jobject obj = env->NewObject(jclazz, mid, + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, reinterpret_cast(handles[i])); env->CallBooleanMethod(jcfhandle_list, rocksdb::ListJni::getListAddMethodId(env), obj); @@ -203,7 +203,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( return jcfhandle_list; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return jcfhandle_list; + return nullptr; } ////////////////////////////////////////////////////////////////////////////// @@ -226,18 +226,20 @@ jobject Java_org_rocksdb_RocksDB_listColumnFamilies( env->ReleaseStringUTFChars(jdb_path, db_path); if (s.ok()) { // Don't reuse class pointer - jclass jclazz = env->FindClass("java/util/ArrayList"); + jclass jListClazz = env->FindClass("java/util/ArrayList"); jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(env, - jclazz); - jvalue_list = env->NewObject(jclazz, mid, column_family_names.size()); + jListClazz); + jvalue_list = env->NewObject(jListClazz, mid, column_family_names.size()); for (std::vector::size_type i = 0; i < column_family_names.size(); i++) { - jbyteArray jvalue = env->NewByteArray(column_family_names[i].size()); - env->SetByteArrayRegion(jvalue, 0, column_family_names[i].size(), + jbyteArray jcf_value = + env->NewByteArray(column_family_names[i].size()); + env->SetByteArrayRegion(jcf_value, 0, + column_family_names[i].size(), reinterpret_cast(column_family_names[i].c_str())); env->CallBooleanMethod(jvalue_list, - rocksdb::ListJni::getListAddMethodId(env), jvalue); + rocksdb::ListJni::getListAddMethodId(env), jcf_value); } } return jvalue_list; @@ -249,12 +251,13 @@ jobject Java_org_rocksdb_RocksDB_listColumnFamilies( void rocksdb_put_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); - jbyte* value = env->GetByteArrayElements(jvalue, 0); + jbyte* value = env->GetByteArrayElements(jentry_value, 0); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); rocksdb::Status s; if (cf_handle != nullptr) { @@ -268,7 +271,7 @@ void rocksdb_put_helper( // by passing JNI_ABORT, it will simply release the reference without // copying the result back to the java byte array. 
env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); if (s.ok()) { return; @@ -284,14 +287,14 @@ void rocksdb_put_helper( void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_len, - jvalue, jvalue_len); + jentry_value, jentry_value_len); } /* * Class: org_rocksdb_RocksDB @@ -301,14 +304,14 @@ void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( void Java_org_rocksdb_RocksDB_put__J_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, default_write_options, cf_handle, - jkey, jkey_len, jvalue, jvalue_len); + jkey, jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -324,14 +327,14 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast( jwrite_options_handle); rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_len, - jvalue, jvalue_len); + jentry_value, jentry_value_len); } /* @@ -343,14 +346,14 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast( jwrite_options_handle); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, *write_options, cf_handle, - jkey, jkey_len, jvalue, jvalue_len); + jkey, jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -384,7 +387,7 @@ void Java_org_rocksdb_RocksDB_write( jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt, rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, - jobject jvalue) { + jobject jstring_buffer) { std::string value; bool value_found = false; jboolean isCopy; @@ -400,11 +403,11 @@ jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, } if (value_found && !value.empty()) { - jclass clazz = env->GetObjectClass(jvalue); + jclass clazz = env->GetObjectClass(jstring_buffer); jmethodID mid = env->GetMethodID(clazz, "append", "(Ljava/lang/String;)Ljava/lang/StringBuffer;"); jstring new_value_str = env->NewStringUTF(value.c_str()); - env->CallObjectMethod(jvalue, mid, new_value_str); 
+ env->CallObjectMethod(jstring_buffer, mid, new_value_str); } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); return static_cast(keyMayExist); @@ -417,10 +420,10 @@ jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, */ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, - jobject jvalue) { + jobject jstring_buffer) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); return key_may_exist_helper(env, db, rocksdb::ReadOptions(), - nullptr, jkey, jkey_len, jvalue); + nullptr, jkey, jkey_len, jstring_buffer); } /* @@ -430,13 +433,13 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( */ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, - jlong jcf_handle, jobject jvalue) { + jlong jcf_handle, jobject jstring_buffer) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); auto cf_handle = reinterpret_cast( jcf_handle); if (cf_handle != nullptr) { return key_may_exist_helper(env, db, rocksdb::ReadOptions(), - cf_handle, jkey, jkey_len, jvalue); + cf_handle, jkey, jkey_len, jstring_buffer); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -451,12 +454,12 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( */ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( JNIEnv* env, jobject jdb, jlong jread_options_handle, - jbyteArray jkey, jint jkey_len, jobject jvalue) { + jbyteArray jkey, jint jkey_len, jobject jstring_buffer) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); auto& read_options = *reinterpret_cast( jread_options_handle); return key_may_exist_helper(env, db, read_options, - nullptr, jkey, jkey_len, jvalue); + nullptr, jkey, jkey_len, jstring_buffer); } /* @@ -466,7 +469,7 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( */ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( JNIEnv* env, jobject jdb, jlong jread_options_handle, - jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jvalue) { + jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jstring_buffer) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); auto& read_options = *reinterpret_cast( jread_options_handle); @@ -474,7 +477,7 @@ jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( jcf_handle); if (cf_handle != nullptr) { return key_may_exist_helper(env, db, read_options, cf_handle, - jkey, jkey_len, jvalue); + jkey, jkey_len, jstring_buffer); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -513,11 +516,11 @@ jbyteArray rocksdb_get_helper( } if (s.ok()) { - jbyteArray jvalue = env->NewByteArray(value.size()); + jbyteArray jret_value = env->NewByteArray(value.size()); env->SetByteArrayRegion( - jvalue, 0, value.size(), + jret_value, 0, value.size(), reinterpret_cast(value.c_str())); - return jvalue; + return jret_value; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); @@ -598,7 +601,7 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIJ( jint rocksdb_get_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options, rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, - jint jkey_len, jbyteArray jvalue, jint jvalue_len) { + jint jkey_len, jbyteArray 
jentry_value, jint jentry_value_len) { static const int kNotFound = -1; static const int kStatusError = -2; @@ -638,10 +641,10 @@ jint rocksdb_get_helper( } int cvalue_len = static_cast(cvalue.size()); - int length = std::min(jvalue_len, cvalue_len); + int length = std::min(jentry_value_len, cvalue_len); env->SetByteArrayRegion( - jvalue, 0, length, + jentry_value, 0, length, reinterpret_cast(cvalue.c_str())); return cvalue_len; } @@ -709,12 +712,13 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, // insert in java list for (std::vector::size_type i = 0; i != s.size(); i++) { if (s[i].ok()) { - jbyteArray jvalue = env->NewByteArray(values[i].size()); + jbyteArray jentry_value = env->NewByteArray(values[i].size()); env->SetByteArrayRegion( - jvalue, 0, values[i].size(), + jentry_value, 0, values[i].size(), reinterpret_cast(values[i].c_str())); env->CallBooleanMethod( - jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jvalue); + jvalue_list, rocksdb::ListJni::getListAddMethodId(env), + jentry_value); } else { env->CallBooleanMethod( jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr); @@ -789,11 +793,11 @@ jobject jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), rocksdb::ReadOptions(), nullptr, - jkey, jkey_len, jvalue, jvalue_len); + jkey, jkey_len, jentry_value, jentry_value_len); } /* @@ -804,12 +808,12 @@ jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( jint Java_org_rocksdb_RocksDB_get__J_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db_handle = reinterpret_cast(jdb_handle); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, - jkey, jkey_len, jvalue, jvalue_len); + jkey, jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -826,11 +830,11 @@ jint Java_org_rocksdb_RocksDB_get__J_3BI_3BIJ( jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), - nullptr, jkey, jkey_len, jvalue, jvalue_len); + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); } /* @@ -841,13 +845,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI( jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db_handle = reinterpret_cast(jdb_handle); auto& ro_opt = *reinterpret_cast(jropt_handle); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, - jkey_len, jvalue, jvalue_len); + jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, 
rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -955,12 +959,13 @@ void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ( void rocksdb_merge_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); - jbyte* value = env->GetByteArrayElements(jvalue, 0); + jbyte* value = env->GetByteArrayElements(jentry_value, 0); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); rocksdb::Status s; if (cf_handle != nullptr) { @@ -973,7 +978,7 @@ void rocksdb_merge_helper( // by passing JNI_ABORT, it will simply release the reference without // copying the result back to the java byte array. env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); if (s.ok()) { return; @@ -989,13 +994,13 @@ void rocksdb_merge_helper( void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); rocksdb_merge_helper(env, db, default_write_options, - nullptr, jkey, jkey_len, jvalue, jvalue_len); + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); } /* @@ -1006,14 +1011,14 @@ void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI( void Java_org_rocksdb_RocksDB_merge__J_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_merge_helper(env, db, default_write_options, - cf_handle, jkey, jkey_len, jvalue, jvalue_len); + cf_handle, jkey, jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); @@ -1029,13 +1034,13 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast( jwrite_options_handle); rocksdb_merge_helper(env, db, *write_options, - nullptr, jkey, jkey_len, jvalue, jvalue_len); + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); } /* @@ -1047,14 +1052,14 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BIJ( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast( jwrite_options_handle); auto cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != 
nullptr) { rocksdb_merge_helper(env, db, *write_options, - cf_handle, jkey, jkey_len, jvalue, jvalue_len); + cf_handle, jkey, jkey_len, jentry_value, jentry_value_len); } else { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 13bff26db..aea85fab9 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -65,15 +65,16 @@ void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { void write_batch_put_helper( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, + jbyteArray jentry_value, jint jentry_value_len, rocksdb::ColumnFamilyHandle* cf_handle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); if (cf_handle != nullptr) { wb->Put(cf_handle, key_slice, value_slice); } else { @@ -81,7 +82,7 @@ void write_batch_put_helper( wb->Put(key_slice, value_slice); } env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); } /* @@ -92,9 +93,9 @@ void write_batch_put_helper( void Java_org_rocksdb_WriteBatch_put___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { - write_batch_put_helper(env, jobj, jkey, jkey_len, jvalue, - jvalue_len, nullptr); + jbyteArray jentry_value, jint jentry_value_len) { + write_batch_put_helper(env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len, nullptr); } /* @@ -105,10 +106,10 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BI( void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto cf_handle = reinterpret_cast(jcf_handle); - write_batch_put_helper(env, jobj, jkey, jkey_len, jvalue, - jvalue_len, cf_handle); + write_batch_put_helper(env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len, cf_handle); } /* @@ -117,15 +118,16 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( void write_batch_merge_helper( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, + jbyteArray jentry_value, jint jentry_value_len, rocksdb::ColumnFamilyHandle* cf_handle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); if (cf_handle != nullptr) { wb->Merge(cf_handle, key_slice, value_slice); } else { @@ -133,7 +135,7 @@ void write_batch_merge_helper( wb->Merge(key_slice, value_slice); } env->ReleaseByteArrayElements(jkey, key, 
JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); } /* @@ -144,9 +146,9 @@ void write_batch_merge_helper( void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { write_batch_merge_helper(env, jobj, jkey, jkey_len, - jvalue, jvalue_len, nullptr); + jentry_value, jentry_value_len, nullptr); } /* @@ -157,10 +159,10 @@ void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len, jlong jcf_handle) { + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto cf_handle = reinterpret_cast(jcf_handle); write_batch_merge_helper(env, jobj, jkey, jkey_len, - jvalue, jvalue_len, cf_handle); + jentry_value, jentry_value_len, cf_handle); } /* From 64d302d304a87bd76e9bca4b1c08a368e58f0e26 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Thu, 6 Nov 2014 16:07:07 -0800 Subject: [PATCH 426/829] make DropWritesFlush deterministic Summary: TEST_WaitForFlush should wait until it sees error when parameter is set to true so we don't need to loop and timeout Test Plan: ROCKSDB_TESTS=DropWritesFlush ./db_test Reviewers: sdong, igor Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28419 --- db/db_test.cc | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index e1818a8a5..fd4dd17a1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5840,21 +5840,9 @@ TEST(DBTest, DropWritesFlush) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(false); - - // Wait 300 milliseconds or background-errors turned 1 from 0. - int time_to_sleep_limit = 300000; - while (time_to_sleep_limit > 0) { - int to_sleep = (time_to_sleep_limit > 1000) ? 1000 : time_to_sleep_limit; - time_to_sleep_limit -= to_sleep; - env_->SleepForMicroseconds(to_sleep); - - ASSERT_TRUE( - db_->GetProperty("rocksdb.background-errors", &property_value)); - if (property_value == "1") { - break; - } - } + dbfull()->TEST_FlushMemTable(true); + + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("1", property_value); env_->drop_writes_.store(false, std::memory_order_release); From 8d87467bb0c76012f70daba69d69299ff574a827 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 6 Nov 2014 17:07:52 -0800 Subject: [PATCH 427/829] Make PartialCompactionFailure Test more robust again. Summary: Make PartialCompactionFailure Test more robust again by blocking background compaction until we simulate the file creation error. 
Test Plan: export ROCKSDB_TESTS=PartialCompactionFailure ./db_test Reviewers: sdong, igor, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28431 --- db/db_test.cc | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index fd4dd17a1..8ddaa0c7c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -163,7 +163,7 @@ class SpecialEnv : public EnvWrapper { std::atomic new_writable_count_; - std::atomic periodic_non_writable_; + std::atomic non_writable_count_; explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301) { delay_sstable_sync_.store(false, std::memory_order_release); @@ -180,7 +180,7 @@ class SpecialEnv : public EnvWrapper { sync_counter_ = 0; non_writeable_rate_ = 0; new_writable_count_ = 0; - periodic_non_writable_ = 0; + non_writable_count_ = 0; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -283,10 +283,9 @@ class SpecialEnv : public EnvWrapper { new_writable_count_++; - auto periodic_fail = periodic_non_writable_.load(); - if (periodic_fail > 0 && - new_writable_count_.load() % periodic_fail == 0) { - return Status::IOError("simulated periodic write error"); + if (non_writable_count_.load() > 0) { + non_writable_count_--; + return Status::IOError("simulated write error"); } Status s = target()->NewWritableFile(f, r, soptions); @@ -8927,6 +8926,13 @@ TEST(DBTest, PartialCompactionFailure) { options.max_bytes_for_level_multiplier = 2; options.compression = kNoCompression; + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + // stop the compaction thread until we simulate the file creation failure. + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + options.env = env_; DestroyAndReopen(options); @@ -8945,37 +8951,34 @@ TEST(DBTest, PartialCompactionFailure) { ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); } - dbfull()->TEST_WaitForFlushMemTable(); - // Make sure there're some L0 files we can compact - ASSERT_GT(NumTableFilesAtLevel(0), 0); + dbfull()->TEST_FlushMemTable(true); + // Make sure the number of L0 files can trigger compaction. + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + auto previous_num_level0_files = NumTableFilesAtLevel(0); - // The number of NewWritableFiles calls required by each operation. - const int kNumLevel1NewWritableFiles = - options.level0_file_num_compaction_trigger + 1; - // This setting will make one of the file-creation fail - // in the first L0 -> L1 compaction while making sure - // all flushes succeeed. - env_->periodic_non_writable_ = kNumLevel1NewWritableFiles - 2; + // Fail the first file creation. + env_->non_writable_count_ = 1; + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); // Expect compaction to fail here as one file will fail its // creation. - ASSERT_TRUE(!db_->CompactRange(nullptr, nullptr).ok()); + ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok()); // Verify L0 -> L1 compaction does fail. ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Verify all L0 files are still there. - // We use GE here as occasionally there might be additional - // memtables being flushed. - ASSERT_GE(NumTableFilesAtLevel(0), previous_num_level0_files); + ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); // All key-values must exist after compaction fails. 
for (int k = 0; k < kNumInsertedKeys; ++k) { ASSERT_EQ(values[k], Get(keys[k])); } - env_->periodic_non_writable_ = 0; + env_->non_writable_count_ = 0; // Make sure RocksDB will not get into corrupted state. Reopen(options); From ec101cd49a8ba328dc0ae8e28d9bf088019300b4 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Thu, 6 Nov 2014 17:28:49 -0800 Subject: [PATCH 428/829] Correctly test both compaction styles in CompactionDeletionTriggerReopen Summary: CompactionDeletionTriggerReopen wasn't actually testing universal compaction. Test Plan: db_test Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28443 --- db/db_test.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 8ddaa0c7c..091e373fa 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3157,10 +3157,15 @@ Options DeletionTriggerOptions() { } // anonymous namespace TEST(DBTest, CompactionDeletionTrigger) { - Options options = CurrentOptions(DeletionTriggerOptions()); - for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[2]; + Options options = CurrentOptions(DeletionTriggerOptions()); + + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } DestroyAndReopen(options); Random rnd(301); @@ -3184,10 +3189,6 @@ TEST(DBTest, CompactionDeletionTrigger) { // must have much smaller db size. ASSERT_GT(db_size[0] / 3, db_size[1]); - - // repeat the test with universal compaction - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = 1; } } @@ -3196,6 +3197,12 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { uint64_t db_size[3]; Options options = CurrentOptions(DeletionTriggerOptions()); + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } + DestroyAndReopen(options); Random rnd(301); @@ -3238,10 +3245,6 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { db_size[2] = Size(Key(0), Key(kTestSize - 1)); // this time we're expecting significant drop in size. ASSERT_GT(db_size[0] / 3, db_size[2]); - - // repeat the test with universal compaction - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = 1; } } From 53af5d877db5b126d043f29996f9dfc9352c6f81 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 11:50:34 -0800 Subject: [PATCH 429/829] Redesign pending_outputs_ Summary: Here's a prototype of redesigning pending_outputs_. This way, we don't have to expose pending_outputs_ to other classes (CompactionJob, FlushJob, MemtableList). DBImpl takes care of it. Still have to write some comments, but should be good enough to start the discussion. 
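For reference, here is a minimal sketch of the capture/release protocol described above. It is simplified into a hypothetical standalone helper class (PendingOutputs is an illustrative name, not part of the patch); the actual change keeps the std::list and the two functions directly inside DBImpl, as the diff below shows.

  #include <cstdint>
  #include <limits>
  #include <list>

  // Illustrative helper only; the patch itself implements this inside DBImpl.
  class PendingOutputs {
   public:
    // Called under the DB mutex before a background job (flush/compaction)
    // allocates any new file numbers. Returns the inserted element so the
    // same job can erase it later.
    std::list<uint64_t>::iterator Capture(uint64_t current_next_file_number) {
      pending_.push_back(current_next_file_number);
      auto it = pending_.end();
      return --it;
    }

    // Called under the DB mutex once the job's files are either installed in
    // a Version or known to be garbage.
    void Release(std::list<uint64_t>::iterator elem) { pending_.erase(elem); }

    // The file purge keeps every table file whose number is >= this value,
    // so files still being written can never be deleted out from under a job.
    uint64_t MinPendingOutput() const {
      return pending_.empty() ? std::numeric_limits<uint64_t>::max()
                              : pending_.front();
    }

   private:
    // File numbers grow monotonically and jobs capture in start order, so the
    // list stays sorted and front() is always the minimum live capture.
    std::list<uint64_t> pending_;
  };

Using a plain std::list instead of the old file-number-to-path map keeps insertion and targeted erase O(1) while still making the minimum pending output trivially available to FindObsoleteFiles().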
Test Plan: make check, will also run stress test Reviewers: ljin, sdong, rven, yhchiang Reviewed By: yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28353 --- db/compaction_job.cc | 28 +++++----------------- db/compaction_job.h | 9 +++---- db/db_impl.cc | 56 +++++++++++++++++++++++++++++++++++--------- db/db_impl.h | 33 ++++++++++++++++++++++---- db/db_test.cc | 39 ++++++++++++++++++++++++++++++ db/filename.h | 3 --- db/flush_job.cc | 17 ++------------ db/flush_job.h | 8 +++---- db/flush_job_test.cc | 9 ++++--- db/job_context.h | 8 +++++-- db/memtable_list.cc | 15 +++--------- db/memtable_list.h | 8 +++---- db/version_set.h | 3 +++ 13 files changed, 146 insertions(+), 90 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index dc472233b..04c351d77 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -205,10 +205,9 @@ CompactionJob::CompactionJob( Compaction* compaction, const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, port::Mutex* db_mutex, - std::atomic* shutting_down, FileNumToPathIdMap* pending_outputs, - LogBuffer* log_buffer, Directory* db_directory, Statistics* stats, - SnapshotList* snapshots, bool is_snapshot_supported, - std::shared_ptr table_cache, + std::atomic* shutting_down, LogBuffer* log_buffer, + Directory* db_directory, Statistics* stats, SnapshotList* snapshots, + bool is_snapshot_supported, std::shared_ptr table_cache, std::function yield_callback) : compact_(new CompactionState(compaction)), compaction_stats_(1), @@ -219,7 +218,6 @@ CompactionJob::CompactionJob( versions_(versions), db_mutex_(db_mutex), shutting_down_(shutting_down), - pending_outputs_(pending_outputs), log_buffer_(log_buffer), db_directory_(db_directory), stats_(stats), @@ -469,10 +467,6 @@ Status CompactionJob::Install(Status status) { cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), compaction_stats_); - // if there were any unused file number (mostly in case of - // compaction error), free up the entry from pending_putputs - ReleaseCompactionUnusedFileNumbers(); - if (status.ok()) { status = InstallCompactionResults(); } @@ -511,8 +505,6 @@ void CompactionJob::AllocateCompactionOutputFileNumbers() { int filesNeeded = compact_->compaction->num_input_files(1); for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); - pending_outputs_->insert( - {file_number, compact_->compaction->GetOutputPathId()}); compact_->allocated_file_numbers.push_back(file_number); } } @@ -1041,14 +1033,6 @@ void CompactionJob::RecordCompactionIOStats() { IOSTATS_RESET(bytes_written); } -// Frees up unused file number. -void CompactionJob::ReleaseCompactionUnusedFileNumbers() { - db_mutex_->AssertHeld(); - for (const auto file_number : compact_->allocated_file_numbers) { - pending_outputs_->erase(file_number); - } -} - Status CompactionJob::OpenCompactionOutputFile() { assert(compact_ != nullptr); assert(compact_->builder == nullptr); @@ -1061,9 +1045,10 @@ Status CompactionJob::OpenCompactionOutputFile() { compact_->allocated_file_numbers.pop_front(); } else { db_mutex_->Lock(); + // TODO(icanadi) make Versions::next_file_number_ atomic and remove db_lock + // around here. Once we do that, AllocateCompactionOutputFileNumbers() will + // not be needed. 
file_number = versions_->NewFileNumber(); - pending_outputs_->insert( - {file_number, compact_->compaction->GetOutputPathId()}); db_mutex_->Unlock(); } // Make the output file @@ -1112,7 +1097,6 @@ void CompactionJob::CleanupCompaction(Status status) { } for (size_t i = 0; i < compact_->outputs.size(); i++) { const CompactionState::Output& out = compact_->outputs[i]; - pending_outputs_->erase(out.number); // If this file was inserted into the table cache then remove // them here because this compaction was not committed. diff --git a/db/compaction_job.h b/db/compaction_job.h index f090c351d..45d438156 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,10 +57,9 @@ class CompactionJob { const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, port::Mutex* db_mutex, std::atomic* shutting_down, - FileNumToPathIdMap* pending_outputs, LogBuffer* log_buffer, - Directory* db_directory, Statistics* stats, - SnapshotList* snapshot_list, bool is_snapshot_supported, - std::shared_ptr table_cache, + LogBuffer* log_buffer, Directory* db_directory, + Statistics* stats, SnapshotList* snapshot_list, + bool is_snapshot_supported, std::shared_ptr table_cache, std::function yield_callback); ~CompactionJob() { assert(compact_ == nullptr); } @@ -92,7 +91,6 @@ class CompactionJob { SequenceNumber in, const std::vector& snapshots, SequenceNumber* prev_snapshot); void RecordCompactionIOStats(); - void ReleaseCompactionUnusedFileNumbers(); Status OpenCompactionOutputFile(); void CleanupCompaction(Status status); @@ -115,7 +113,6 @@ class CompactionJob { VersionSet* versions_; port::Mutex* db_mutex_; std::atomic* shutting_down_; - FileNumToPathIdMap* pending_outputs_; LogBuffer* log_buffer_; Directory* db_directory_; Statistics* stats_; diff --git a/db/db_impl.cc b/db/db_impl.cc index 2bbb3345f..da0603303 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -443,8 +443,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } // don't delete live files - for (auto pair : pending_outputs_) { - job_context->sst_live.emplace_back(pair.first, pair.second, 0); + if (pending_outputs_.size()) { + job_context->min_pending_output = *pending_outputs_.begin(); + } else { + // delete all of them + job_context->min_pending_output = std::numeric_limits::max(); } versions_->AddLiveFiles(&job_context->sst_live); @@ -567,7 +570,10 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { keep = (number >= state.manifest_file_number); break; case kTableFile: - keep = (sst_live_map.find(number) != sst_live_map.end()); + // If the second condition is not there, this makes + // DontDeletePendingOutputs fail + keep = (sst_live_map.find(number) != sst_live_map.end()) || + number >= state.min_pending_output; break; case kTempFile: // Any temp files that are currently being written to must @@ -981,7 +987,8 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
+ auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -1013,7 +1020,7 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); } - pending_outputs_.erase(meta.fd.GetNumber()); + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. @@ -1044,9 +1051,9 @@ Status DBImpl::FlushMemTableToOutputFile( FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options, env_options_, versions_.get(), &mutex_, &shutting_down_, - &pending_outputs_, snapshots_.GetNewest(), job_context, - log_buffer, db_directory_.get(), - GetCompressionFlush(*cfd->ioptions()), stats_); + snapshots_.GetNewest(), job_context, log_buffer, + db_directory_.get(), GetCompressionFlush(*cfd->ioptions()), + stats_); Status s = flush_job.Run(); @@ -1550,6 +1557,9 @@ void DBImpl::BackgroundCallFlush() { { MutexLock l(&mutex_); + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); + Status s; if (!shutting_down_.load(std::memory_order_acquire)) { s = BackgroundFlush(&madeProgress, &job_context, &log_buffer); @@ -1573,6 +1583,8 @@ void DBImpl::BackgroundCallFlush() { } } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + // If !s.ok(), this means that Flush failed. In that case, we want // to delete all obsolete files and we force FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok()); @@ -1616,6 +1628,10 @@ void DBImpl::BackgroundCallCompaction() { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { MutexLock l(&mutex_); + + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); + assert(bg_compaction_scheduled_); Status s; if (!shutting_down_.load(std::memory_order_acquire)) { @@ -1640,6 +1656,8 @@ void DBImpl::BackgroundCallCompaction() { } } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + // If !s.ok(), this means that Compaction failed. In that case, we want // to delete all obsolete files we might have created and we force // FindObsoleteFiles(). This is because job_context does not @@ -1848,9 +1866,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, }; CompactionJob compaction_job( c.get(), db_options_, *c->mutable_cf_options(), env_options_, - versions_.get(), &mutex_, &shutting_down_, &pending_outputs_, - log_buffer, db_directory_.get(), stats_, &snapshots_, - IsSnapshotSupported(), table_cache_, std::move(yield_callback)); + versions_.get(), &mutex_, &shutting_down_, log_buffer, + db_directory_.get(), stats_, &snapshots_, IsSnapshotSupported(), + table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); status = compaction_job.Run(); @@ -2968,6 +2986,22 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } } +std::list::iterator +DBImpl::CaptureCurrentFileNumberInPendingOutputs() { + // We need to remember the iterator of our insert, because after the + // background job is done, we need to remove that element from + // pending_outputs_. 
+ pending_outputs_.push_back(versions_->current_next_file_number()); + auto pending_outputs_inserted_elem = pending_outputs_.end(); + --pending_outputs_inserted_elem; + return pending_outputs_inserted_elem; +} + +void DBImpl::ReleaseFileNumberFromPendingOutputs( + std::list::iterator v) { + pending_outputs_.erase(v); +} + #ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( SequenceNumber seq, unique_ptr* iter, diff --git a/db/db_impl.h b/db/db_impl.h index 8717dee90..a25a82a9a 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,24 @@ class DBImpl : public DB { // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); + // Background process needs to call + // auto x = CaptureCurrentFileNumberInPendingOutputs() + // + // ReleaseFileNumberFromPendingOutputs(x) + // This will protect any temporary files created while is + // executing from being deleted. + // ----------- + // This function will capture current file number and append it to + // pending_outputs_. This will prevent any background process to delete any + // file created after this point. + std::list::iterator CaptureCurrentFileNumberInPendingOutputs(); + // This function should be called with the result of + // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file + // created between the calls CaptureCurrentFileNumberInPendingOutputs() and + // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live + // and blocked by any other pending_outputs_ calls) + void ReleaseFileNumberFromPendingOutputs(std::list::iterator v); + // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, @@ -390,10 +409,16 @@ class DBImpl : public DB { SnapshotList snapshots_; - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - // map from pending file number ID to their path IDs. - FileNumToPathIdMap pending_outputs_; + // For each background job, pending_outputs_ keeps the current file number at + // the time that background job started. + // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has + // number bigger than any of the file number in pending_outputs_. Since file + // numbers grow monotonically, this also means that pending_outputs_ is always + // sorted. After a background job is done executing, its file number is + // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean + // it up. + // State is protected with db mutex. + std::list pending_outputs_; // At least one compaction or flush job is pending but not yet scheduled // because of the max background thread limit. 
diff --git a/db/db_test.cc b/db/db_test.cc index 091e373fa..ee8844c6c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -165,6 +165,8 @@ class SpecialEnv : public EnvWrapper { std::atomic non_writable_count_; + std::function* table_write_callback_; + explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); @@ -181,6 +183,8 @@ class SpecialEnv : public EnvWrapper { non_writeable_rate_ = 0; new_writable_count_ = 0; non_writable_count_ = 0; + periodic_non_writable_ = 0; + table_write_callback_ = nullptr; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -196,6 +200,9 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } if (env_->drop_writes_.load(std::memory_order_acquire)) { // Drop writes on the floor return Status::OK(); @@ -9042,6 +9049,38 @@ TEST(DBTest, DynamicMiscOptions) { assert_reseek_count(300, 1); } +TEST(DBTest, DontDeletePendingOutputs) { + Options options; + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Every time we write to a table file, call FOF/POF with full DB scan. This + // will make sure our pending_outputs_ protection work correctly + std::function purge_obsolete_files_function = [&]() { + JobContext job_context; + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /*force*/); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + }; + + env_->table_write_callback_ = &purge_obsolete_files_function; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("a", "begin")); + ASSERT_OK(Put("z", "end")); + ASSERT_OK(Flush()); + } + + // If pending output guard does not work correctly, PurgeObsoleteFiles() will + // delete the file that Compaction is trying to create, causing this: error + // db/db_test.cc:975: IO error: + // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory + Compact("a", "b"); +} + + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/filename.h b/db/filename.h index a80703074..87963ea21 100644 --- a/db/filename.h +++ b/db/filename.h @@ -36,9 +36,6 @@ enum FileType { kIdentityFile }; -// map from file number to path ID. -typedef std::unordered_map FileNumToPathIdMap; - // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". 
diff --git a/db/flush_job.cc b/db/flush_job.cc index c477a5e8d..973d86033 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -55,7 +55,6 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, port::Mutex* db_mutex, std::atomic* shutting_down, - FileNumToPathIdMap* pending_outputs, SequenceNumber newest_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, CompressionType output_compression, Statistics* stats) @@ -67,7 +66,6 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, versions_(versions), db_mutex_(db_mutex), shutting_down_(shutting_down), - pending_outputs_(pending_outputs), newest_snapshot_(newest_snapshot), job_context_(job_context), log_buffer_(log_buffer), @@ -107,13 +105,12 @@ Status FlushJob::Run() { } if (!s.ok()) { - cfd_->imm()->RollbackMemtableFlush(mems, file_number, pending_outputs_); + cfd_->imm()->RollbackMemtableFlush(mems, file_number); } else { // Replace immutable memtable with the generated Table s = cfd_->imm()->InstallMemtableFlushResults( cfd_, mutable_cf_options_, mems, versions_, db_mutex_, file_number, - pending_outputs_, &job_context_->memtables_to_free, db_directory_, - log_buffer_); + &job_context_->memtables_to_free, db_directory_, log_buffer_); } return s; @@ -128,7 +125,6 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); *filenumber = meta.fd.GetNumber(); // path 0 for level 0 file. - pending_outputs_->insert({meta.fd.GetNumber(), 0}); const SequenceNumber earliest_seqno_in_memtable = mems[0]->GetFirstSequenceNumber(); @@ -180,15 +176,6 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, // re-acquire the most current version base = cfd_->current(); - // There could be multiple threads writing to its own level-0 file. - // The pending_outputs cannot be cleared here, otherwise this newly - // created file might not be considered as a live-file by another - // compaction thread that is concurrently deleting obselete files. - // The pending_outputs can be cleared only after the new version is - // committed so that other threads can recognize this file as a - // valid one. - // pending_outputs_.erase(meta.number); - // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
int level = 0; diff --git a/db/flush_job.h b/db/flush_job.h index a5a40ce41..86d4aa073 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -55,10 +55,9 @@ class FlushJob { const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, port::Mutex* db_mutex, std::atomic* shutting_down, - FileNumToPathIdMap* pending_outputs, SequenceNumber newest_snapshot, - JobContext* job_context, LogBuffer* log_buffer, - Directory* db_directory, CompressionType output_compression, - Statistics* stats); + SequenceNumber newest_snapshot, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + CompressionType output_compression, Statistics* stats); ~FlushJob() {} Status Run(); @@ -74,7 +73,6 @@ class FlushJob { VersionSet* versions_; port::Mutex* db_mutex_; std::atomic* shutting_down_; - FileNumToPathIdMap* pending_outputs_; SequenceNumber newest_snapshot_; JobContext* job_context_; LogBuffer* log_buffer_; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 9cfe015e1..e39916bd6 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -73,7 +73,6 @@ class FlushJobTest { std::unique_ptr versions_; port::Mutex mutex_; std::atomic shutting_down_; - FileNumToPathIdMap pending_outputs_; std::shared_ptr mock_table_factory_; }; @@ -83,8 +82,8 @@ TEST(FlushJobTest, Empty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - &pending_outputs_, SequenceNumber(), &job_context, nullptr, - nullptr, kNoCompression, nullptr); + SequenceNumber(), &job_context, nullptr, nullptr, + kNoCompression, nullptr); ASSERT_OK(flush_job.Run()); } @@ -108,8 +107,8 @@ TEST(FlushJobTest, NonEmpty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - &pending_outputs_, SequenceNumber(), &job_context, nullptr, - nullptr, kNoCompression, nullptr); + SequenceNumber(), &job_context, nullptr, nullptr, + kNoCompression, nullptr); mutex_.Lock(); ASSERT_OK(flush_job.Run()); mutex_.Unlock(); diff --git a/db/job_context.h b/db/job_context.h index caf28f7d9..d73e817a6 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -58,8 +58,12 @@ struct JobContext { // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. - uint64_t manifest_file_number, pending_manifest_file_number, log_number, - prev_log_number; + uint64_t manifest_file_number; + uint64_t pending_manifest_file_number; + uint64_t log_number; + uint64_t prev_log_number; + + uint64_t min_pending_output = 0; explicit JobContext(bool create_superversion = false) { manifest_file_number = 0; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 0066a68ba..8d568e895 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -144,8 +144,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t file_number, - FileNumToPathIdMap* pending_outputs) { + uint64_t file_number) { assert(!mems.empty()); // If the flush was not successful, then just reset state. 
@@ -159,7 +158,6 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, m->edit_.Clear(); num_flush_not_started_++; } - pending_outputs->erase(file_number); imm_flush_needed.store(true, std::memory_order_release); } @@ -167,9 +165,8 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, Status MemTableList::InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& mems, VersionSet* vset, port::Mutex* mu, - uint64_t file_number, FileNumToPathIdMap* pending_outputs, - autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer) { + uint64_t file_number, autovector* to_delete, + Directory* db_directory, LogBuffer* log_buffer) { mu->AssertHeld(); // flush was sucessful @@ -220,11 +217,6 @@ Status MemTableList::InstallMemtableFlushResults( current_->Remove(m); assert(m->file_number_ > 0); - // pending_outputs can be cleared only after the newly created file - // has been written to a committed version so that other concurrently - // executing compaction threads do not mistakenly assume that this - // file is not live. - pending_outputs->erase(m->file_number_); if (m->Unref() != nullptr) { to_delete->push_back(m); } @@ -237,7 +229,6 @@ Status MemTableList::InstallMemtableFlushResults( m->flush_in_progress_ = false; m->edit_.Clear(); num_flush_not_started_++; - pending_outputs->erase(m->file_number_); m->file_number_ = 0; imm_flush_needed.store(true, std::memory_order_release); } diff --git a/db/memtable_list.h b/db/memtable_list.h index 9f499b834..6cf1737c1 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -108,16 +108,14 @@ class MemTableList { // Reset status of the given memtable list back to pending state so that // they can get picked up again on the next round of flush. void RollbackMemtableFlush(const autovector& mems, - uint64_t file_number, - FileNumToPathIdMap* pending_outputs); + uint64_t file_number); // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, const autovector& m, VersionSet* vset, port::Mutex* mu, - uint64_t file_number, FileNumToPathIdMap* pending_outputs, - autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer); + uint64_t file_number, autovector* to_delete, + Directory* db_directory, LogBuffer* log_buffer); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). 
diff --git a/db/version_set.h b/db/version_set.h index 0ae6f1cfd..f9801c7c7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -530,6 +530,9 @@ class VersionSet { return pending_manifest_file_number_; } + // REQUIRED: mutex locked + uint64_t current_next_file_number() const { return next_file_number_; } + // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_++; } From a0f887c9e463a1cd992403c421eeb9d9011ced83 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 12:07:43 -0800 Subject: [PATCH 430/829] Fix compile --- db/db_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index ee8844c6c..3a7891559 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -183,7 +183,6 @@ class SpecialEnv : public EnvWrapper { non_writeable_rate_ = 0; new_writable_count_ = 0; non_writable_count_ = 0; - periodic_non_writable_ = 0; table_write_callback_ = nullptr; } From 31342c400530698954b78348d573206b90924d67 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 12:41:05 -0800 Subject: [PATCH 431/829] Fix implicit compare --- include/rocksdb/slice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 406a8abb9..05d0f9df6 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -123,7 +123,7 @@ inline bool operator!=(const Slice& x, const Slice& y) { } inline int Slice::compare(const Slice& b) const { - const int min_len = (size_ < b.size_) ? size_ : b.size_; + const size_t min_len = (size_ < b.size_) ? size_ : b.size_; int r = memcmp(data_, b.data_, min_len); if (r == 0) { if (size_ < b.size_) r = -1; From 5c930905301fce2363257a8be895a878182a63f6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 12:57:08 -0800 Subject: [PATCH 432/829] Turn on -Wshadow Summary: Only one more try, I promise. I talked to Jim and he mentioned that if we include our system includes with -isystem rather than with -I, that signals to the compile that those are system includes and thus no warnings are issued. So I turned our glibc includes into system includes and now we no longer get the warning from there, making us shadow-warning-free! Test Plan: compiles with both clang and gcc Reviewers: sdong, yhchiang, rven, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28479 --- .travis.yml | 2 +- Makefile | 2 +- build_tools/fbcode_config.sh | 19 +++++++++---------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index ce43d7030..70e213e02 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make unity && make clean && OPT="-DTRAVIS -Wshadow" make check -j8 +script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check -j8 notifications: email: false diff --git a/Makefile b/Makefile index d06d6f2a1..c2d206e91 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 99215108f..fefd48d59 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -64,10 +64,8 @@ if [ -z "$USE_CLANG" ]; then CXX="$GCC_BASE/bin/g++" CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" - CFLAGS+=" -I $LIBGCC_INCLUDE -I $GLIBC_INCLUDE" - CFLAGS+=" $DEPS_INCLUDE" - CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" - CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" else # clang CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" @@ -77,7 +75,7 @@ else KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" - CFLAGS="-B$BINUTILS -nostdinc -nostdlib" + CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux " CFLAGS+=" -isystem $GLIBC_INCLUDE" @@ -85,13 +83,14 @@ else CFLAGS+=" -isystem $CLANG_INCLUDE" CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " - CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual" - CFLAGS+=" $DEPS_INCLUDE" - CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" - CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" - CXXFLAGS="$CFLAGS -nostdinc++" + CXXFLAGS="-nostdinc++" fi +CFLAGS+=" $DEPS_INCLUDE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" +CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" +CXXFLAGS+=" $CFLAGS" + EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" From 28c82ff1b3275652b2eac66dac3f2a6e44af2cd8 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 14:45:18 -0800 Subject: [PATCH 433/829] CompactFiles, EventListener and GetDatabaseMetaData Summary: This diff adds three sets of APIs to RocksDB. = GetColumnFamilyMetaData = * This APIs allow users to obtain the current state of a RocksDB instance on one column family. * See GetColumnFamilyMetaData in include/rocksdb/db.h = EventListener = * A virtual class that allows users to implement a set of call-back functions which will be called when specific events of a RocksDB instance happens. * To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners = CompactFiles = * CompactFiles API inputs a set of file numbers and an output level, and RocksDB will try to compact those files into the specified level. = Example = * Example code can be found in example/compact_files_example.cc, which implements a simple external compactor using EventListener, GetColumnFamilyMetaData, and CompactFiles API. 
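A rough usage sketch of how these three pieces might fit together; FlushLogger and CompactLevelZero are illustrative names, the calls follow the signatures added in this diff, and example/compact_files_example.cc remains the authoritative example. The listener is registered by adding it to ColumnFamilyOptions::listeners (e.g. options.listeners.emplace_back(new FlushLogger());) before opening the DB:

#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/metadata.h"
#include "rocksdb/options.h"

// Called back by RocksDB after each memtable flush finishes.
class FlushLogger : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* db, const std::string& cf_name,
                        const std::string& file_path,
                        bool triggered_writes_slowdown,
                        bool triggered_writes_stop) override {
    // React to a finished flush, e.g. wake up an external compaction thread.
  }
};

// Compact every L0 file of the default column family into L1.
void CompactLevelZero(rocksdb::DB* db) {
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(db->DefaultColumnFamily(), &cf_meta);

  std::vector<std::string> input_files;
  for (const auto& file : cf_meta.levels[0].files) {
    if (!file.being_compacted) {
      input_files.push_back(file.name);
    }
  }
  if (!input_files.empty()) {
    rocksdb::Status s = db->CompactFiles(rocksdb::CompactionOptions(),
                                         db->DefaultColumnFamily(),
                                         input_files, 1 /* output_level */);
    // Status::Aborted() is returned when an input file is already being
    // compacted by another job.
  }
}

As the column_family.cc hunk below notes, a column family opened with kCompactionStyleNone schedules no background compactions at all and relies on CompactFiles() exclusively.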
Test Plan: listener_test compactor_test example/compact_files_example export ROCKSDB_TESTS=CompactFiles db_test export ROCKSDB_TESTS=MetaData db_test Reviewers: ljin, igor, rven, sdong Reviewed By: sdong Subscribers: MarkCallaghan, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D24705 --- Makefile | 10 +- db/column_family.cc | 34 ++- db/column_family.h | 6 + db/compaction.cc | 32 +++ db/compaction.h | 29 +- db/compaction_picker.cc | 315 ++++++++++++++++++++- db/compaction_picker.h | 98 ++++++- db/db_impl.cc | 225 ++++++++++++++- db/db_impl.h | 34 ++- db/db_impl_readonly.h | 10 + db/db_test.cc | 259 ++++++++++++++++- db/filename.cc | 11 + db/filename.h | 4 + db/flush_job.cc | 9 +- db/flush_job.h | 2 +- db/listener_test.cc | 344 +++++++++++++++++++++++ db/version_set.cc | 91 +++--- db/version_set.h | 5 +- examples/Makefile | 9 +- examples/compact_files_example.cc | 175 ++++++++++++ include/rocksdb/comparator.h | 2 +- include/rocksdb/db.h | 71 +++-- include/rocksdb/immutable_options.h | 4 + include/rocksdb/listener.h | 65 +++++ include/rocksdb/metadata.h | 90 ++++++ include/rocksdb/options.h | 24 +- include/rocksdb/status.h | 14 +- include/rocksdb/utilities/stackable_db.h | 21 ++ util/options.cc | 9 +- util/status.cc | 3 + 30 files changed, 1908 insertions(+), 97 deletions(-) create mode 100644 db/listener_test.cc create mode 100644 examples/compact_files_example.cc create mode 100644 include/rocksdb/listener.h create mode 100644 include/rocksdb/metadata.h diff --git a/Makefile b/Makefile index c2d206e91..b8913108f 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,9 @@ TESTS = \ cuckoo_table_db_test \ write_batch_with_index_test \ flush_job_test \ - wal_manager_test + wal_manager_test \ + listener_test \ + write_batch_with_index_test TOOLS = \ sst_dump \ @@ -502,6 +504,12 @@ cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTH cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/column_family.cc b/db/column_family.cc index eba3c74dd..08ff09866 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -87,6 +87,10 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } +const std::string& ColumnFamilyHandleImpl::GetName() const { + return cfd()->GetName(); +} + const Comparator* ColumnFamilyHandleImpl::user_comparator() const { return cfd()->user_comparator(); } @@ -255,10 +259,23 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, } else if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); - } else { - assert(ioptions_.compaction_style == kCompactionStyleFIFO); + } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { compaction_picker_.reset( new 
FIFOCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleNone) { + compaction_picker_.reset(new NullCompactionPicker( + ioptions_, &internal_comparator_)); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "Column family %s does not use any background compaction. " + "Compactions can only be done via CompactFiles\n", + GetName().c_str()); + } else { + Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log, + "Unable to recognize the specified compaction style %d. " + "Column family %s will use kCompactionStyleLevel.\n", + ioptions_.compaction_style, GetName().c_str()); + compaction_picker_.reset( + new LevelCompactionPicker(ioptions_, &internal_comparator_)); } Log(InfoLogLevel::INFO_LEVEL, @@ -503,6 +520,19 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } +void ColumnFamilyData::NotifyOnFlushCompleted( + DB* db, const std::string& file_path, + bool triggered_flush_slowdown, + bool triggered_flush_stop) { + auto listeners = ioptions()->listeners; + for (auto listener : listeners) { + listener->OnFlushCompleted( + db, GetName(), file_path, + // Use path 0 as fulled memtables are first flushed into path 0. + triggered_flush_slowdown, triggered_flush_stop); + } +} + SuperVersion* ColumnFamilyData::InstallSuperVersion( SuperVersion* new_superversion, port::Mutex* db_mutex) { db_mutex->AssertHeld(); diff --git a/db/column_family.h b/db/column_family.h index 0be47ee84..eef7e93b5 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -52,6 +52,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { virtual const Comparator* user_comparator() const; virtual uint32_t GetID() const; + virtual const std::string& GetName() const override; private: ColumnFamilyData* cfd_; @@ -250,6 +251,11 @@ class ColumnFamilyData { void ResetThreadLocalSuperVersions(); + void NotifyOnFlushCompleted( + DB* db, const std::string& file_path, + bool triggered_flush_slowdown, + bool triggered_flush_stop); + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, diff --git a/db/compaction.cc b/db/compaction.cc index 3f9da1d82..00513f533 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -78,6 +78,38 @@ Compaction::Compaction(int number_levels, int start_level, int out_level, } } +Compaction::Compaction(VersionStorageInfo* vstorage, + const autovector& inputs, + int start_level, int output_level, + uint64_t max_grandparent_overlap_bytes, + const CompactionOptions& options, + bool deletion_compaction) + : start_level_(start_level), + output_level_(output_level), + max_output_file_size_(options.output_file_size_limit), + max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), + input_version_(nullptr), // TODO(yhchiang): set it later + number_levels_(vstorage->NumberLevels()), + cfd_(nullptr), + output_compression_(options.compression), + seek_compaction_(false), + deletion_compaction_(deletion_compaction), + inputs_(inputs), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + is_manual_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + Compaction::~Compaction() { delete edit_; if (input_version_ != nullptr) { diff --git a/db/compaction.h b/db/compaction.h index d8014545b..3a012fb60 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -33,6 +33,13 
@@ class VersionStorageInfo; // A Compaction encapsulates information about a compaction. class Compaction { public: + Compaction(VersionStorageInfo* input_version, + const autovector& inputs, + int start_level, int output_level, + uint64_t max_grandparent_overlap_bytes, + const CompactionOptions& options, + bool deletion_compaction); + // No copying allowed Compaction(const Compaction&) = delete; void operator=(const Compaction&) = delete; @@ -153,6 +160,8 @@ class Compaction { // Was this compaction triggered manually by the client? bool IsManualCompaction() { return is_manual_compaction_; } + void SetOutputPathId(uint32_t path_id) { output_path_id_ = path_id; } + // Return the MutableCFOptions that should be used throughout the compaction // procedure const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; } @@ -164,6 +173,16 @@ class Compaction { void SetInputVersion(Version* input_version); + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool mark_as_compacted); + + // Initialize whether the compaction is producing files at the + // bottommost level. + // + // @see BottomMostLevel() + void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual, + bool level0_only); + private: friend class CompactionPicker; friend class UniversalCompactionPicker; @@ -226,16 +245,6 @@ class Compaction { // records indices for all levels beyond "output_level_". std::vector level_ptrs_; - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool mark_as_compacted); - - // Initialize whether the compaction is producing files at the - // bottommost level. - // - // @see BottomMostLevel() - void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual, - bool level0_only); - // In case of compaction error, reset the nextIndex that is used // to pick up the next file to be compacted from files_by_size_ void ResetNextCompactionIndex(); diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index e2694bcd0..f5207748b 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -185,7 +185,8 @@ bool CompactionPicker::ExpandWhileOverlapping(const std::string& cf_name, } // Returns true if any one of specified files are being compacted -bool CompactionPicker::FilesInCompaction(std::vector& files) { +bool CompactionPicker::FilesInCompaction( + const std::vector& files) { for (unsigned int i = 0; i < files.size(); i++) { if (files[i]->being_compacted) { return true; @@ -194,6 +195,89 @@ bool CompactionPicker::FilesInCompaction(std::vector& files) { return false; } +Compaction* CompactionPicker::FormCompaction( + const CompactionOptions& compact_options, + const autovector& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const { + uint64_t max_grandparent_overlap_bytes = + output_level + 1 < vstorage->NumberLevels() ? + mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) : + std::numeric_limits::max(); + assert(input_files.size()); + auto c = new Compaction(vstorage, input_files, + input_files[0].level, output_level, + max_grandparent_overlap_bytes, + compact_options, false); + c->mutable_cf_options_ = mutable_cf_options; + c->MarkFilesBeingCompacted(true); + + // TODO(yhchiang): complete the SetBottomMostLevel as follows + // If there is no any key of the range in DB that is older than the + // range to compact, it is bottom most. 
For leveled compaction, + // if number-of_level-1 is empty, and output is going to number-of_level-2, + // it is also bottom-most. On the other hand, if number of level=1 ( + // something like universal), the compaction is only "bottom-most" if + // the oldest file is involved. + c->SetupBottomMostLevel( + vstorage, + (output_level == vstorage->NumberLevels() - 1), + (output_level == 0)); + return c; +} + +Status CompactionPicker::GetCompactionInputsFromFileNumbers( + autovector* input_files, + std::unordered_set* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const { + if (input_set->size() == 0U) { + return Status::InvalidArgument( + "Compaction must include at least one file."); + } + assert(input_files); + + autovector matched_input_files; + matched_input_files.resize(vstorage->NumberLevels()); + int first_non_empty_level = -1; + int last_non_empty_level = -1; + // TODO(yhchiang): use a lazy-initialized mapping from + // file_number to FileMetaData in Version. + for (int level = 0; level < vstorage->NumberLevels(); ++level) { + for (auto file : vstorage->LevelFiles(level)) { + auto iter = input_set->find(file->fd.GetNumber()); + if (iter != input_set->end()) { + matched_input_files[level].files.push_back(file); + input_set->erase(iter); + last_non_empty_level = level; + if (first_non_empty_level == -1) { + first_non_empty_level = level; + } + } + } + } + + if (!input_set->empty()) { + std::string message( + "Cannot find matched SST files for the following file numbers:"); + for (auto fn : *input_set) { + message += " "; + message += std::to_string(fn); + } + return Status::InvalidArgument(message); + } + + for (int level = first_non_empty_level; + level <= last_non_empty_level; ++level) { + matched_input_files[level].level = level; + input_files->emplace_back(std::move(matched_input_files[level])); + } + + return Status::OK(); +} + + + // Returns true if any one of the parent files are being compacted bool CompactionPicker::ParentRangeInCompaction(VersionStorageInfo* vstorage, const InternalKey* smallest, @@ -362,6 +446,235 @@ Compaction* CompactionPicker::CompactRange( return c; } +namespace { +// Test whether two files have overlapping key-ranges. +bool HaveOverlappingKeyRanges( + const Comparator* c, + const SstFileMetaData& a, const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} +} // namespace + +Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const { + auto& levels = cf_meta.levels; + auto comparator = icmp_->user_comparator(); + + // TODO(yhchiang): If there is any input files of L1 or up and there + // is at least one L0 files. All L0 files older than the L0 file needs + // to be included. 
Otherwise, it is a false conditoin + + // TODO(yhchiang): add is_adjustable to CompactionOptions + + // the smallest and largest key of the current compaction input + std::string smallestkey; + std::string largestkey; + // a flag for initializing smallest and largest key + bool is_first = false; + const int kNotFound = -1; + + // For each level, it does the following things: + // 1. Find the first and the last compaction input files + // in the current level. + // 2. Include all files between the first and the last + // compaction input files. + // 3. Update the compaction key-range. + // 4. For all remaining levels, include files that have + // overlapping key-range with the compaction key-range. + for (int l = 0; l <= output_level; ++l) { + auto& current_files = levels[l].files; + int first_included = static_cast(current_files.size()); + int last_included = kNotFound; + + // identify the first and the last compaction input files + // in the current level. + for (size_t f = 0; f < current_files.size(); ++f) { + if (input_files->find(TableFileNameToNumber(current_files[f].name)) != + input_files->end()) { + first_included = std::min(first_included, static_cast(f)); + last_included = std::max(last_included, static_cast(f)); + if (is_first == false) { + smallestkey = current_files[f].smallestkey; + largestkey = current_files[f].largestkey; + is_first = true; + } + } + } + if (last_included == kNotFound) { + continue; + } + + if (l != 0) { + // expend the compaction input of the current level if it + // has overlapping key-range with other non-compaction input + // files in the same level. + while (first_included > 0) { + if (comparator->Compare( + current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < 0) { + break; + } + first_included--; + } + + while (last_included < static_cast(current_files.size()) - 1) { + if (comparator->Compare( + current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { + break; + } + last_included++; + } + } + + // include all files between the first and the last compaction input files. + for (int f = first_included; f <= last_included; ++f) { + if (current_files[f].being_compacted) { + return Status::Aborted( + "Necessary compaction input file " + current_files[f].name + + " is currently being compacted."); + } + input_files->insert( + TableFileNameToNumber(current_files[f].name)); + } + + // update smallest and largest key + if (l == 0) { + for (int f = first_included; f <= last_included; ++f) { + if (comparator->Compare( + smallestkey, current_files[f].smallestkey) > 0) { + smallestkey = current_files[f].smallestkey; + } + if (comparator->Compare( + largestkey, current_files[f].largestkey) < 0) { + largestkey = current_files[f].largestkey; + } + } + } else { + if (comparator->Compare( + smallestkey, current_files[first_included].smallestkey) > 0) { + smallestkey = current_files[first_included].smallestkey; + } + if (comparator->Compare( + largestkey, current_files[last_included].largestkey) < 0) { + largestkey = current_files[last_included].largestkey; + } + } + + SstFileMetaData aggregated_file_meta; + aggregated_file_meta.smallestkey = smallestkey; + aggregated_file_meta.largestkey = largestkey; + + // For all lower levels, include all overlapping files. 
+ for (int m = l + 1; m <= output_level; ++m) { + for (auto& next_lv_file : levels[m].files) { + if (HaveOverlappingKeyRanges( + comparator, aggregated_file_meta, next_lv_file)) { + if (next_lv_file.being_compacted) { + return Status::Aborted( + "File " + next_lv_file.name + + " that has overlapping key range with one of the compaction " + " input file is currently being compacted."); + } + input_files->insert( + TableFileNameToNumber(next_lv_file.name)); + } + } + } + } + return Status::OK(); +} + +Status CompactionPicker::SanitizeCompactionInputFiles( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const { + assert(static_cast(cf_meta.levels.size()) - 1 == + cf_meta.levels[cf_meta.levels.size() - 1].level); + if (output_level >= static_cast(cf_meta.levels.size())) { + return Status::InvalidArgument( + "Output level for column family " + cf_meta.name + + " must between [0, " + + std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + + "]."); + } + + if (output_level > MaxOutputLevel()) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm --- " + + std::to_string(MaxOutputLevel())); + } + + if (output_level < 0) { + return Status::InvalidArgument( + "Output level cannot be negative."); + } + + if (input_files->size() == 0) { + return Status::InvalidArgument( + "A compaction must contain at least one file."); + } + + Status s = SanitizeCompactionInputFilesForAllLevels( + input_files, cf_meta, output_level); + + if (!s.ok()) { + return s; + } + + // for all input files, check whether the file number matches + // any currently-existing files. + for (auto file_num : *input_files) { + bool found = false; + for (auto level_meta : cf_meta.levels) { + for (auto file_meta : level_meta.files) { + if (file_num == TableFileNameToNumber(file_meta.name)) { + if (file_meta.being_compacted) { + return Status::Aborted( + "Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); + } + found = true; + break; + } + } + if (found) { + break; + } + } + if (!found) { + return Status::InvalidArgument( + "Specified compaction input file " + + MakeTableFileName("", file_num) + + " does not exist in column family " + cf_meta.name + "."); + } + } + + return Status::OK(); +} + Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { diff --git a/db/compaction_picker.h b/db/compaction_picker.h index d691a765a..d8daed115 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -8,6 +8,11 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include +#include +#include +#include + #include "db/version_set.h" #include "db/compaction.h" #include "rocksdb/status.h" @@ -25,6 +30,7 @@ namespace rocksdb { class LogBuffer; class Compaction; class VersionStorageInfo; +struct CompactionInputFiles; class CompactionPicker { public: @@ -62,6 +68,22 @@ class CompactionPicker { // for compaction input. virtual int MaxInputLevel(int current_num_levels) const = 0; + // The maximum allowed output level. Default value is NumberLevels() - 1. + virtual int MaxOutputLevel() const { + return NumberLevels() - 1; + } + + // Sanitize the input set of compaction input files. 
+ // When the input parameters do not describe a valid compaction, the + // function will try to fix the input_files by adding necessary + // files. If it's not possible to conver an invalid input_files + // into a valid one by adding more files, the function will return a + // non-ok status with specific reason. + Status SanitizeCompactionInputFiles( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; + // Free up the files that participated in a compaction void ReleaseCompactionFiles(Compaction* c, Status status); @@ -69,6 +91,25 @@ class CompactionPicker { // compactions per level void SizeBeingCompacted(std::vector& sizes); + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(const std::vector& files); + + // Takes a list of CompactionInputFiles and returns a Compaction object. + Compaction* FormCompaction( + const CompactionOptions& compact_options, + const autovector& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const; + + // Converts a set of compaction input file numbers into + // a list of CompactionInputFiles. + Status GetCompactionInputsFromFileNumbers( + autovector* input_files, + std::unordered_set* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const; + + protected: int NumberLevels() const { return ioptions_.num_levels; } @@ -98,9 +139,6 @@ class CompactionPicker { bool ExpandWhileOverlapping(const std::string& cf_name, VersionStorageInfo* vstorage, Compaction* c); - // Returns true if any one of the specified files are being compacted - bool FilesInCompaction(std::vector& files); - // Returns true if any one of the parent files are being compacted bool ParentRangeInCompaction(VersionStorageInfo* vstorage, const InternalKey* smallest, @@ -113,11 +151,16 @@ class CompactionPicker { const ImmutableCFOptions& ioptions_; + // A helper function to SanitizeCompactionInputFiles() that + // sanitizes "input_files" by adding necessary files. + virtual Status SanitizeCompactionInputFilesForAllLevels( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; + // record all the ongoing compactions for all levels std::vector> compactions_in_progress_; - - private: const InternalKeyComparator* const icmp_; }; @@ -131,11 +174,16 @@ class UniversalCompactionPicker : public CompactionPicker { VersionStorageInfo* vstorage, LogBuffer* log_buffer) override; - // The maxinum allowed input level. Always return 0. + // The maxinum allowed input level. Always returns 0. virtual int MaxInputLevel(int current_num_levels) const override { return 0; } + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { + return 0; + } + private: // Pick Universal compaction to limit read amplification Compaction* PickCompactionUniversalReadAmp( @@ -197,10 +245,46 @@ class FIFOCompactionPicker : public CompactionPicker { uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) override; - // The maxinum allowed input level. Always return 0. + // The maxinum allowed input level. Always returns 0. virtual int MaxInputLevel(int current_num_levels) const override { return 0; } + + // The maximum allowed output level. Always returns 0. 
+ virtual int MaxOutputLevel() const override { + return 0; + } +}; + +class NullCompactionPicker : public CompactionPicker { + public: + NullCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) : + CompactionPicker(ioptions, icmp) {} + virtual ~NullCompactionPicker() {} + + // Always return "nullptr" + Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override { + return nullptr; + } + + // Always return "nullptr" + Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) override { + return nullptr; + } + + // Given the current number of levels, returns the highest allowed level + // for compaction input. + virtual int MaxInputLevel(int current_num_levels) const { + return current_num_levels - 2; + } }; // Utility function diff --git a/db/db_impl.cc b/db/db_impl.cc index da0603303..0c218fb03 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -213,7 +213,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) #endif // ROCKSDB_LITE bg_work_gate_closed_(false), refitting_level_(false), - opened_successfully_(false) { + opened_successfully_(false), + notifying_events_(0) { env_->GetAbsolutePath(dbname, &db_absolute_path_); // Reserve ten files or so for other uses and give the rest to TableCache. @@ -239,6 +240,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) DBImpl::~DBImpl() { mutex_.Lock(); + if (flush_on_destroy_) { for (auto cfd : *versions_->GetColumnFamilySet()) { if (!cfd->mem()->IsEmpty()) { @@ -254,10 +256,10 @@ DBImpl::~DBImpl() { // Wait for background work to finish shutting_down_.store(true, std::memory_order_release); - while (bg_compaction_scheduled_ || bg_flush_scheduled_) { + while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) { bg_cv_.Wait(); } - + listeners_.clear(); flush_scheduler_.Clear(); if (default_cf_handle_ != nullptr) { @@ -1055,7 +1057,8 @@ Status DBImpl::FlushMemTableToOutputFile( db_directory_.get(), GetCompressionFlush(*cfd->ioptions()), stats_); - Status s = flush_job.Run(); + uint64_t file_number; + Status s = flush_job.Run(&file_number); if (s.ok()) { InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); @@ -1085,9 +1088,42 @@ Status DBImpl::FlushMemTableToOutputFile( bg_error_ = s; } RecordFlushIOStats(); +#ifndef ROCKSDB_LITE + if (s.ok()) { + // may temporarily unlock and lock the mutex. + NotifyOnFlushCompleted(cfd, file_number); + } +#endif // ROCKSDB_LITE return s; } +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, uint64_t file_number) { + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_flush_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + cfd->options()->level0_slowdown_writes_trigger); + bool triggered_flush_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + cfd->options()->level0_stop_writes_trigger); + notifying_events_++; + // release lock while notifying events + mutex_.Unlock(); + // TODO(yhchiang): make db_paths dynamic. 
+ cfd->NotifyOnFlushCompleted( + this, MakeTableFileName(db_options_.db_paths[0].path, file_number), + triggered_flush_slowdown, + triggered_flush_stop); + mutex_.Lock(); + notifying_events_--; + assert(notifying_events_ >= 0); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. +} + Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, @@ -1149,6 +1185,167 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, return s; } +Status DBImpl::CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id) { + MutexLock l(&mutex_); + if (column_family == nullptr) { + return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); + } + + auto cfd = reinterpret_cast(column_family)->cfd(); + assert(cfd); + // TODO(yhchiang): use superversion + cfd->Ref(); + auto version = cfd->current(); + version->Ref(); + auto s = CompactFilesImpl(compact_options, cfd, version, + input_file_names, output_level, output_path_id); + // TODO(yhchiang): unref could move into CompactFilesImpl(). Otherwise, + // FindObsoleteFiles might never able to find any file to delete. + version->Unref(); + // TODO(yhchiang): cfd should be deleted after its last reference. + cfd->Unref(); + return s; +} + +Status DBImpl::CompactFilesImpl( + const CompactionOptions& compact_options, ColumnFamilyData* cfd, + Version* version, const std::vector& input_file_names, + const int output_level, int output_path_id) { + mutex_.AssertHeld(); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + std::unordered_set input_set; + for (auto file_name : input_file_names) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + ColumnFamilyMetaData cf_meta; + // TODO(yhchiang): can directly use version here if none of the + // following functions call is pluggable to external developers. + version->GetColumnFamilyMetaData(&cf_meta); + + if (output_path_id < 0) { + if (db_options_.db_paths.size() == 1U) { + output_path_id = 0; + } else { + return Status::NotSupported( + "Automatic output path selection is not " + "yet supported in CompactFiles()"); + } + } + + Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( + &input_set, cf_meta, output_level); + if (!s.ok()) { + return s; + } + + autovector input_files; + s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, version->storage_info(), compact_options); + if (!s.ok()) { + return s; + } + + for (auto inputs : input_files) { + if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { + return Status::Aborted( + "Some of the necessary compaction input " + "files are already being compacted"); + } + } + + // At this point, CompactFiles will be run. + bg_compaction_scheduled_++; + + unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->FormCompaction( + compact_options, input_files, + output_level, version->storage_info(), + *cfd->GetLatestMutableCFOptions())); + assert(c); + c->SetInputVersion(version); + c->SetOutputPathId(static_cast(output_path_id)); + // deletion compaction currently not allowed in CompactFiles. 
+ assert(!c->IsDeletionCompaction()); + + JobContext job_context(true); + auto yield_callback = [&]() { + return CallFlushDuringCompaction(c->column_family_data(), + *c->mutable_cf_options(), &job_context, + &log_buffer); + }; + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), env_options_, + versions_.get(), &mutex_, &shutting_down_, &pending_outputs_, + &log_buffer, db_directory_.get(), stats_, &snapshots_, + IsSnapshotSupported(), table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + + mutex_.Unlock(); + Status status = compaction_job.Run(); + mutex_.Lock(); + if (status.ok()) { + status = compaction_job.Install(status); + if (status.ok()) { + InstallSuperVersionBackground(c->column_family_data(), &job_context, + *c->mutable_cf_options()); + } + } + c->ReleaseCompactionFiles(s); + c->ReleaseInputs(); + c.reset(); + + if (status.ok()) { + // Done + } else if (status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", + status.ToString().c_str()); + if (db_options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } + + // If !s.ok(), this means that Compaction failed. In that case, we want + // to delete all obsolete files we might have created and we force + // FindObsoleteFiles(). This is because job_context does not + // catch all created files if compaction failed. + // TODO(yhchiang): write an unit-test to make sure files are actually + // deleted after CompactFiles. + FindObsoleteFiles(&job_context, !s.ok()); + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + mutex_.Lock(); + } + + bg_compaction_scheduled_--; + + return status; +} + Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { auto* cfd = reinterpret_cast(column_family)->cfd(); @@ -3112,6 +3309,17 @@ void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { MutexLock l(&mutex_); versions_->GetLiveFilesMetaData(metadata); } + +void DBImpl::GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { + assert(column_family); + auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* sv = GetAndRefSuperVersion(cfd); + sv->current->GetColumnFamilyMetaData(cf_meta); + ReturnAndCleanupSuperVersion(cfd, sv); +} + #endif // ROCKSDB_LITE Status DBImpl::CheckConsistency() { @@ -3362,6 +3570,15 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { impl->opened_successfully_ = true; *dbptr = impl; + // TODO(yhchiang): Add NotifyOnDatabaseOpen() here. + // Since the column-family handles are only available after DB::Open(), + // typically developers will need to pass the returned ColumnFamilyHandles + // to their EventListeners in order to maintain the mapping between + // column-family-name to ColumnFamilyHandle. 
However, some database + // events might happen before the user passing those ColumnFamilyHandle to + // their Listeners. To address this, we should have NotifyOnDatabaseOpen() + // here which passes the created ColumnFamilyHandle to the Listeners + // as the first event after DB::Open(). } else { for (auto h : *handles) { delete h; diff --git a/db/db_impl.h b/db/db_impl.h index a25a82a9a..eda00ab9b 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,13 @@ class DBImpl : public DB { bool reduce_level = false, int target_level = -1, uint32_t target_path_id = 0); + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1); + using DB::SetOptions; Status SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map); @@ -152,6 +160,15 @@ class DBImpl : public DB { virtual Status DeleteFile(std::string name); virtual void GetLiveFilesMetaData(std::vector* metadata); + + // Obtains the meta data of the specified column family of the DB. + // Status::NotFound() will be returned if the current DB does not have + // any column family match the specified name. + // TODO(yhchiang): output parameter is placed in the end in this codebase. + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; + #endif // ROCKSDB_LITE // checks if all live files exist on file system and that their file sizes @@ -211,7 +228,7 @@ class DBImpl : public DB { // REQUIRES: mutex locked // pass the pointer that you got from TEST_BeginWrite() void TEST_EndWrite(void* w); -#endif // NDEBUG +#endif // ROCKSDB_LITE // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. @@ -239,6 +256,8 @@ class DBImpl : public DB { Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, Arena* arena); + void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number); + private: friend class DB; friend class InternalStats; @@ -318,6 +337,13 @@ class DBImpl : public DB { void RecordFlushIOStats(); void RecordCompactionIOStats(); + Status CompactFilesImpl( + const CompactionOptions& compact_options, ColumnFamilyData* cfd, + Version* version, const std::vector& input_file_names, + const int output_level, int output_path_id); + + ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); + void MaybeScheduleFlushOrCompaction(); static void BGWorkCompaction(void* db); static void BGWorkFlush(void* db); @@ -488,6 +514,12 @@ class DBImpl : public DB { // Indicate DB was opened successfully bool opened_successfully_; + // The list of registered event listeners. + std::list listeners_; + + // count how many events are currently being notified. 
+ int notifying_events_; + // No copying allowed DBImpl(const DBImpl&); void operator=(const DBImpl&); diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 9b10b83fb..d84b23f18 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -62,10 +62,20 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + #ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } + virtual Status EnableFileDeletions(bool force) override { return Status::NotSupported("Not supported operation in read only mode."); } diff --git a/db/db_test.cc b/db/db_test.cc index 3a7891559..8b018715c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -15,6 +15,7 @@ #include #include +#include "db/filename.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" @@ -4060,8 +4061,43 @@ TEST(DBTest, UniversalCompactionFourPaths) { Destroy(options); } + #endif +void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { + uint64_t cf_size = 0; + uint64_t cf_csize = 0; + size_t file_count = 0; + for (auto level_meta : cf_meta.levels) { + uint64_t level_size = 0; + uint64_t level_csize = 0; + file_count += level_meta.files.size(); + for (auto file_meta : level_meta.files) { + level_size += file_meta.size; + } + ASSERT_EQ(level_meta.size, level_size); + cf_size += level_size; + cf_csize += level_csize; + } + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); +} + +TEST(DBTest, ColumnFamilyMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + ColumnFamilyMetaData cf_meta; + for (int i = 0; i < 100; ++i) { + GenerateNewFile(&rnd, &key_index); + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta); + } +} + TEST(DBTest, ConvertCompactionStyle) { Random rnd(301); int max_key_level_insert = 200; @@ -4238,7 +4274,7 @@ bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, TEST(DBTest, MinLevelToCompress1) { Options options = CurrentOptions(); - CompressionType type; + CompressionType type = kSnappyCompression; if (!MinLevelToCompress(type, options, -14, -1, 0)) { return; } @@ -4258,7 +4294,7 @@ TEST(DBTest, MinLevelToCompress1) { TEST(DBTest, MinLevelToCompress2) { Options options = CurrentOptions(); - CompressionType type; + CompressionType type = kSnappyCompression; if (!MinLevelToCompress(type, options, 15, -1, 0)) { return; } @@ -7246,6 +7282,15 @@ class ModelDB: public DB { return Status::NotSupported("Not supported operation."); } + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation."); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; } @@ -7314,6 +7359,10 @@ class ModelDB: public DB { virtual ColumnFamilyHandle* DefaultColumnFamily() const { 
return nullptr; } + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) {} + private: class ModelIter: public Iterator { public: @@ -8202,6 +8251,211 @@ TEST(DBTest, RateLimitingTest) { ASSERT_TRUE(ratio < 0.6); } +namespace { + bool HaveOverlappingKeyRanges( + const Comparator* c, + const SstFileMetaData& a, const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; + } + + // Identifies all files between level "min_level" and "max_level" + // which has overlapping key range with "input_file_meta". + void GetOverlappingFileNumbersForLevelCompaction( + const ColumnFamilyMetaData& cf_meta, + const Comparator* comparator, + int min_level, int max_level, + const SstFileMetaData* input_file_meta, + std::set* overlapping_file_names) { + std::set overlapping_files; + overlapping_files.insert(input_file_meta); + for (int m = min_level; m <= max_level; ++m) { + for (auto& file : cf_meta.levels[m].files) { + for (auto* included_file : overlapping_files) { + if (HaveOverlappingKeyRanges( + comparator, *included_file, file)) { + overlapping_files.insert(&file); + overlapping_file_names->insert(file.name); + break; + } + } + } + } + } + + void VerifyCompactionResult( + const ColumnFamilyMetaData& cf_meta, + const std::set& overlapping_file_numbers) { + for (auto& level : cf_meta.levels) { + for (auto& file : level.files) { + assert(overlapping_file_numbers.find(file.name) == + overlapping_file_numbers.end()); + } + } + } + + const SstFileMetaData* PickFileRandomly( + const ColumnFamilyMetaData& cf_meta, + Random* rand, + int* level = nullptr) { + auto file_id = rand->Uniform(static_cast( + cf_meta.file_count)) + 1; + for (auto& level_meta : cf_meta.levels) { + if (file_id <= level_meta.files.size()) { + if (level != nullptr) { + *level = level_meta.level; + } + auto result = rand->Uniform(file_id); + return &(level_meta.files[result]); + } + file_id -= level_meta.files.size(); + } + assert(false); + return nullptr; + } +} // namespace + +TEST(DBTest, CompactFilesOnLevelCompaction) { + const int kKeySize = 16; + const int kValueSize = 984; + const int kEntrySize = kKeySize + kValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.level0_stop_writes_trigger = 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kValueSize))); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + 
dbfull()->TEST_WaitForCompact(); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + int output_level = cf_meta.levels.size() - 1; + for (int file_picked = 5; file_picked > 0; --file_picked) { + std::set overlapping_file_names; + std::vector compaction_input_file_names; + for (int f = 0; f < file_picked; ++f) { + int level; + auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); + compaction_input_file_names.push_back(file_meta->name); + GetOverlappingFileNumbersForLevelCompaction( + cf_meta, options.comparator, level, output_level, + file_meta, &overlapping_file_names); + } + + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, + output_level)); + + // Make sure all overlapping files do not exist after compaction + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult(cf_meta, overlapping_file_names); + } + + // make sure all key-values are still there. + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND"); + } +} + +TEST(DBTest, CompactFilesOnUniversalCompaction) { + const int kKeySize = 16; + const int kValueSize = 984; + const int kEntrySize = kKeySize + kValueSize; + const int kEntriesPerBuffer = 10; + + ChangeCompactOptions(); + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.compression = kNoCompression; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); + Random rnd(301); + for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kValueSize))); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + std::vector compaction_input_file_names; + for (auto file : cf_meta.levels[0].files) { + if (rnd.OneIn(2)) { + compaction_input_file_names.push_back(file.name); + } + } + + if (compaction_input_file_names.size() == 0) { + compaction_input_file_names.push_back( + cf_meta.levels[0].files[0].name); + } + + // expect fail since universal compaction only allow L0 output + ASSERT_TRUE(!dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 1).ok()); + + // expect ok and verify the compacted files no longer exist. + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); + + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult( + cf_meta, + std::set(compaction_input_file_names.begin(), + compaction_input_file_names.end())); + + compaction_input_file_names.clear(); + + // Pick the first and the last file, expect everything is + // compacted into one single file. 
+ compaction_input_file_names.push_back( + cf_meta.levels[0].files[0].name); + compaction_input_file_names.push_back( + cf_meta.levels[0].files[ + cf_meta.levels[0].files.size() - 1].name); + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); + + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + ASSERT_EQ(cf_meta.levels[0].files.size(), 1U); +} + TEST(DBTest, TableOptionsSanitizeTest) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -9079,7 +9333,6 @@ TEST(DBTest, DontDeletePendingOutputs) { Compact("a", "b"); } - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/filename.cc b/db/filename.cc index a8f685296..e5d97bdf2 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -79,6 +79,17 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, "sst"); } +uint64_t TableFileNameToNumber(const std::string& name) { + uint64_t number = 0; + uint64_t base = 1; + int pos = static_cast(name.find_last_of('.')); + while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') { + number += (name[pos] - '0') * base; + base *= 10; + } + return number; +} + std::string TableFileName(const std::vector& db_paths, uint64_t number, uint32_t path_id) { assert(number > 0); diff --git a/db/filename.h b/db/filename.h index 87963ea21..4136ff12e 100644 --- a/db/filename.h +++ b/db/filename.h @@ -52,6 +52,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +// the reverse function of MakeTableFileName +// TODO(yhchiang): could merge this function with ParseFileName() +extern uint64_t TableFileNameToNumber(const std::string& name); + // Return the name of the sstable with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". diff --git a/db/flush_job.cc b/db/flush_job.cc index 973d86033..a3079d2df 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -73,9 +73,9 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, output_compression_(output_compression), stats_(stats) {} -Status FlushJob::Run() { +Status FlushJob::Run(uint64_t* file_number) { // Save the contents of the earliest memtable as a new Table - uint64_t file_number; + uint64_t fn; autovector mems; cfd_->imm()->PickMemtablesToFlush(&mems); if (mems.empty()) { @@ -96,7 +96,7 @@ Status FlushJob::Run() { edit->SetColumnFamily(cfd_->GetID()); // This will release and re-acquire the mutex. - Status s = WriteLevel0Table(mems, edit, &file_number); + Status s = WriteLevel0Table(mems, edit, &fn); if (s.ok() && (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { @@ -113,6 +113,9 @@ Status FlushJob::Run() { &job_context_->memtables_to_free, db_directory_, log_buffer_); } + if (s.ok() && file_number != nullptr) { + *file_number = fn; + } return s; } diff --git a/db/flush_job.h b/db/flush_job.h index 86d4aa073..394a7a45e 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -60,7 +60,7 @@ class FlushJob { CompressionType output_compression, Statistics* stats); ~FlushJob() {} - Status Run(); + Status Run(uint64_t* file_number = nullptr); private: Status WriteLevel0Table(const autovector& mems, VersionEdit* edit, diff --git a/db/listener_test.cc b/db/listener_test.cc new file mode 100644 index 000000000..f39ac93eb --- /dev/null +++ b/db/listener_test.cc @@ -0,0 +1,344 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "util/hash.h" +#include "util/hash_linklist_rep.h" +#include "utilities/merge_operators.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/statistics.h" +#include "util/testharness.h" +#include "util/sync_point.h" +#include "util/testutil.h" + +#ifndef ROCKSDB_LITE + +namespace rocksdb { + +class EventListenerTest { + public: + EventListenerTest() { + dbname_ = test::TmpDir() + "/listener_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~EventListenerTest() { + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + void CreateColumnFamilies(const std::vector& cfs, + const ColumnFamilyOptions* options = nullptr) { + ColumnFamilyOptions cf_opts; + cf_opts = ColumnFamilyOptions(Options()); + int cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + Close(); + Options opts = (options == nullptr) ? 
Options() : *options; + std::vector v_opts(cfs.size(), &opts); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + Status TryReopenWithColumnFamilies( + const std::vector& cfs, + const std::vector& options) { + Close(); + ASSERT_EQ(cfs.size(), options.size()); + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); + } + DBOptions db_opts = DBOptions(*options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopen(Options* options = nullptr) { + Close(); + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts.create_if_missing = true; + } + + return DB::Open(opts, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options* options = nullptr) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + Status Flush(int cf = 0) { + if (cf == 0) { + return db_->Flush(FlushOptions()); + } else { + return db_->Flush(FlushOptions(), handles_[cf]); + } + } + + + DB* db_; + std::string dbname_; + std::vector handles_; +}; + +class TestFlushListener : public EventListener { + public: + void OnFlushCompleted( + DB* db, const std::string& name, + const std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) override { + flushed_dbs_.push_back(db); + flushed_column_family_names_.push_back(name); + if (triggered_writes_slowdown) { + slowdown_count++; + } + if (triggered_writes_stop) { + stop_count++; + } + } + + std::vector flushed_column_family_names_; + std::vector flushed_dbs_; + int slowdown_count; + int stop_count; +}; + +TEST(EventListenerTest, OnSingleDBFlushTest) { + Options options; + TestFlushListener* listener = new TestFlushListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + + ASSERT_OK(Put(1, "pikachu", "pikachu")); + ASSERT_OK(Put(2, "ilya", "ilya")); + ASSERT_OK(Put(3, "muromec", "muromec")); + ASSERT_OK(Put(4, "dobrynia", "dobrynia")); + ASSERT_OK(Put(5, "nikitich", "nikitich")); + ASSERT_OK(Put(6, "alyosha", "alyosha")); + ASSERT_OK(Put(7, "popovich", "popovich")); + for (size_t i = 1; i < 8; ++i) { + Flush(i); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } + + // make sure call-back functions are called in the right order + for (size_t i = 0; i < cf_names.size(); ++i) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } +} + +TEST(EventListenerTest, MultiCF) { + Options options; + TestFlushListener* listener = new TestFlushListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + + ASSERT_OK(Put(1, "pikachu", "pikachu")); + ASSERT_OK(Put(2, 
"ilya", "ilya")); + ASSERT_OK(Put(3, "muromec", "muromec")); + ASSERT_OK(Put(4, "dobrynia", "dobrynia")); + ASSERT_OK(Put(5, "nikitich", "nikitich")); + ASSERT_OK(Put(6, "alyosha", "alyosha")); + ASSERT_OK(Put(7, "popovich", "popovich")); + for (size_t i = 1; i < 8; ++i) { + Flush(i); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } + + // make sure call-back functions are called in the right order + for (size_t i = 0; i < cf_names.size(); i++) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } +} + +TEST(EventListenerTest, MultiDBMultiListeners) { + std::vector listeners; + const int kNumDBs = 5; + const int kNumListeners = 10; + for (int i = 0; i < kNumListeners; ++i) { + listeners.emplace_back(new TestFlushListener()); + } + + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + + Options options; + options.create_if_missing = true; + for (int i = 0; i < kNumListeners; ++i) { + options.listeners.emplace_back(listeners[i]); + } + DBOptions db_opts(options); + ColumnFamilyOptions cf_opts(options); + + std::vector dbs; + std::vector> vec_handles; + + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options)); + DB* db; + std::vector handles; + ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db)); + for (size_t c = 0; c < cf_names.size(); ++c) { + ColumnFamilyHandle* handle; + db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + handles.push_back(handle); + } + + vec_handles.push_back(std::move(handles)); + dbs.push_back(db); + } + + for (int d = 0; d < kNumDBs; ++d) { + for (size_t c = 0; c < cf_names.size(); ++c) { + ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], + cf_names[c], cf_names[c])); + } + } + + for (size_t c = 0; c < cf_names.size(); ++c) { + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); + reinterpret_cast(dbs[d])->TEST_WaitForFlushMemTable(); + } + } + + for (auto* listener : listeners) { + int pos = 0; + for (size_t c = 0; c < cf_names.size(); ++c) { + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]); + ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]); + pos++; + } + } + } + + for (auto handles : vec_handles) { + for (auto h : handles) { + delete h; + } + handles.clear(); + } + vec_handles.clear(); + + for (auto db : dbs) { + delete db; + } +} + +TEST(EventListenerTest, DisableBGCompaction) { + Options options; + TestFlushListener* listener = new TestFlushListener(); + const int kSlowdownTrigger = 5; + const int kStopTrigger = 10; + options.level0_slowdown_writes_trigger = kSlowdownTrigger; + options.level0_stop_writes_trigger = kStopTrigger; + options.listeners.emplace_back(listener); + // BG compaction is disabled. Number of L0 files will simply keeps + // increasing in this test. + options.compaction_style = kCompactionStyleNone; + options.compression = kNoCompression; + options.write_buffer_size = 100000; // Small write buffer + + CreateAndReopenWithCF({"pikachu"}, &options); + WriteOptions wopts; + wopts.timeout_hint_us = 100000; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); + // keep writing until writes are forced to stop. 
+ for (int i = 0; static_cast(cf_meta.file_count) < kStopTrigger; ++i) { + Put(1, std::to_string(i), std::string(100000, 'x'), wopts); + db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); + } + ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger); + ASSERT_GE(listener->stop_count, 1); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + diff --git a/db/version_set.cc b/db/version_set.cc index cdca14177..1c34b56a5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -14,13 +14,14 @@ #endif #include +#include #include #include #include #include #include #include -#include +#include #include "db/filename.h" #include "db/log_reader.h" @@ -599,6 +600,49 @@ size_t Version::GetMemoryUsageByTableReaders() { return total_usage; } +void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { + assert(cf_meta); + assert(cfd_); + + cf_meta->name = cfd_->GetName(); + cf_meta->size = 0; + cf_meta->file_count = 0; + cf_meta->levels.clear(); + + auto* ioptions = cfd_->ioptions(); + auto* vstorage = storage_info(); + + for (int level = 0; level < cfd_->NumberLevels(); level++) { + uint64_t level_size = 0; + cf_meta->file_count += vstorage->LevelFiles(level).size(); + std::vector files; + for (const auto& file : vstorage->LevelFiles(level)) { + uint32_t path_id = file->fd.GetPathId(); + std::string file_path; + if (path_id < ioptions->db_paths.size()) { + file_path = ioptions->db_paths[path_id].path; + } else { + assert(!ioptions->db_paths.empty()); + file_path = ioptions->db_paths.back().path; + } + files.emplace_back( + MakeTableFileName("", file->fd.GetNumber()), + file_path, + file->fd.GetFileSize(), + file->smallest_seqno, + file->largest_seqno, + file->smallest.user_key().ToString(), + file->largest.user_key().ToString(), + file->being_compacted); + level_size += file->fd.GetFileSize(); + } + cf_meta->levels.emplace_back( + level, level_size, std::move(files)); + cf_meta->size += level_size; + } +} + + uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be not accurate when: // (1) there is merge keys @@ -2645,41 +2689,22 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { c->column_family_data()->GetName().c_str()); } - // verify files in level - int level = c->level(); - for (int i = 0; i < c->num_input_files(0); i++) { - uint64_t number = c->input(0, i)->fd.GetNumber(); - - // look for this file in the current version - bool found = false; - for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { - FileMetaData* f = vstorage->files_[level][j]; - if (f->fd.GetNumber() == number) { - found = true; - break; + for (int input = 0; input < c->num_input_levels(); ++input) { + int level = c->level(input); + for (int i = 0; i < c->num_input_files(input); ++i) { + uint64_t number = c->input(input, i)->fd.GetNumber(); + bool found = false; + for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { + FileMetaData* f = vstorage->files_[level][j]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } } - } - if (!found) { - return false; // input files non existant in current version - } - } - // verify level+1 files - level++; - for (int i = 0; i < c->num_input_files(1); i++) { - uint64_t number = c->input(1, i)->fd.GetNumber(); - - // look for this file in the current version - bool found = false; - for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { - FileMetaData* f = vstorage->files_[level][j]; - 
if (f->fd.GetNumber() == number) { - found = true; - break; + if (!found) { + return false; // input files non existent in current version } } - if (!found) { - return false; // input files non existant in current version - } } #endif return true; // everything good diff --git a/db/version_set.h b/db/version_set.h index f9801c7c7..e0d166818 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -420,6 +420,8 @@ class Version { VersionSet* version_set() { return vset_; } + void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + private: friend class VersionSet; @@ -598,7 +600,7 @@ class VersionSet { Status GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** metadata, ColumnFamilyData** cfd); - void GetLiveFilesMetaData(std::vector* metadata); + void GetLiveFilesMetaData(std::vector *metadata); void GetObsoleteFiles(std::vector* files); @@ -609,6 +611,7 @@ class VersionSet { struct ManifestWriter; friend class Version; + friend class DBImpl; struct LogReporter : public log::Reader::Reporter { Status* status; diff --git a/examples/Makefile b/examples/Makefile index 97a4b2850..7807289ae 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ include ../build_config.mk .PHONY: main clean -all: simple_example column_families_example +all: simple_example column_families_example compact_files_example simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -10,5 +10,8 @@ simple_example: simple_example.cc column_families_example: column_families_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -clean: simple_example column_families_example - rm -rf ./simple_example ./column_families_example +compact_files_example: compact_files_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +clean: simple_example column_families_example compact_files_example + rm -rf ./simple_example ./column_families_example ./compact_files_example diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc new file mode 100644 index 000000000..3e7638b7e --- /dev/null +++ b/examples/compact_files_example.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// An example code demonstrating how to use CompactFiles, EventListener, +// and GetColumnFamilyMetaData APIs to implement custom compaction algorithm. + +#include +#include +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" + +using namespace rocksdb; +std::string kDBPath = "/tmp/rocksdb_compact_files_example"; +class CompactionTask; + +// This is an example interface of external-compaction algorithm. +// Compaction algorithm can be implemented outside the core-RocksDB +// code by using the pluggable compaction APIs that RocksDb provides. +class Compactor : public EventListener { + public: + // Picks and returns a compaction task given the specified DB + // and column family. It is the caller's responsibility to + // destroy the returned CompactionTask. Returns "nullptr" + // if it cannot find a proper compaction task. 
+ virtual CompactionTask* PickCompaction( + DB* db, const std::string& cf_name) = 0; + + // Schedule and run the specified compaction task in background. + virtual void ScheduleCompaction(CompactionTask *task) = 0; +}; + +// Example structure that describes a compaction task. +struct CompactionTask { + CompactionTask( + DB* db, Compactor* compactor, + const std::string& column_family_name, + const std::vector& input_file_names, + const int output_level, + const CompactionOptions& compact_options, + bool retry_on_fail) + : db(db), + compactor(compactor), + column_family_name(column_family_name), + input_file_names(input_file_names), + output_level(output_level), + compact_options(compact_options), + retry_on_fail(false) {} + DB* db; + Compactor* compactor; + const std::string& column_family_name; + std::vector input_file_names; + int output_level; + CompactionOptions compact_options; + bool retry_on_fail; +}; + +// A simple compaction algorithm that always compacts everything +// to the highest level whenever possible. +class FullCompactor : public Compactor { + public: + explicit FullCompactor(const Options options) : options_(options) { + compact_options_.compression = options_.compression; + compact_options_.output_file_size_limit = + options_.target_file_size_base; + } + + // When flush happens, it determins whether to trigger compaction. + // If triggered_writes_stop is true, it will also set the retry + // flag of compaction-task to true. + void OnFlushCompleted( + DB* db, const std::string& cf_name, + const std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) override { + CompactionTask* task = PickCompaction(db, cf_name); + if (task != nullptr) { + if (triggered_writes_stop) { + task->retry_on_fail = true; + } + // Schedule compaction in a different thread. + ScheduleCompaction(task); + } + } + + // Always pick a compaction which includes all files whenever possible. + CompactionTask* PickCompaction( + DB* db, const std::string& cf_name) override { + ColumnFamilyMetaData cf_meta; + db->GetColumnFamilyMetaData(&cf_meta); + + std::vector input_file_names; + for (auto level : cf_meta.levels) { + for (auto file : level.files) { + if (file.being_compacted) { + return nullptr; + } + input_file_names.push_back(file.name); + } + } + return new CompactionTask( + db, this, cf_name, input_file_names, + options_.num_levels - 1, compact_options_, false); + } + + // Schedule the specified compaction task in background. + void ScheduleCompaction(CompactionTask* task) override { + options_.env->Schedule(&FullCompactor::CompactFiles, task); + } + + static void CompactFiles(void* arg) { + CompactionTask* task = reinterpret_cast(arg); + assert(task); + assert(task->db); + Status s = task->db->CompactFiles( + task->compact_options, + task->input_file_names, + task->output_level); + printf("CompactFiles() finished with status %s\n", s.ToString().c_str()); + if (!s.ok() && !s.IsIOError() && task->retry_on_fail) { + // If a compaction task with its retry_on_fail=true failed, + // try to schedule another compaction in case the reason + // is not an IO error. + CompactionTask* new_task = task->compactor->PickCompaction( + task->db, task->column_family_name); + task->compactor->ScheduleCompaction(new_task); + } + // release the task + delete task; + } + + private: + Options options_; + CompactionOptions compact_options_; +}; + +int main() { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. 
+ options.compaction_style = kCompactionStyleNone; + // Small slowdown and stop trigger for experimental purpose. + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 5; + options.IncreaseParallelism(5); + options.listeners.emplace_back(new FullCompactor(options)); + + DB* db = nullptr; + DestroyDB(kDBPath, options); + Status s = DB::Open(options, kDBPath, &db); + assert(s.ok()); + assert(db); + + // if background compaction is not working, write will stall + // because of options.level0_stop_writes_trigger + for (int i = 1000; i < 99999; ++i) { + db->Put(WriteOptions(), std::to_string(i), + std::string(500, 'a' + (i % 26))); + } + + // verify the values are still there + std::string value; + for (int i = 1000; i < 99999; ++i) { + db->Get(ReadOptions(), std::to_string(i), + &value); + assert(value == std::string(500, 'a' + (i % 26))); + } + + // close the db. + delete db; + + return 0; +} diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 8e7366752..5b7dc1021 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -63,7 +63,7 @@ class Comparator { extern const Comparator* BytewiseComparator(); // Return a builtin comparator that uses reverse lexicographic byte-wise -// ordering. +// ordering. extern const Comparator* ReverseBytewiseComparator(); } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 65b517f54..3025d7ebc 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -15,19 +15,34 @@ #include #include #include +#include "rocksdb/metadata.h" #include "rocksdb/version.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" +#include "rocksdb/listener.h" namespace rocksdb { +struct Options; +struct DBOptions; +struct ColumnFamilyOptions; +struct ReadOptions; +struct WriteOptions; +struct FlushOptions; +struct CompactionOptions; +struct TableProperties; +class WriteBatch; +class Env; +class EventListener; + using std::unique_ptr; class ColumnFamilyHandle { public: virtual ~ColumnFamilyHandle() {} + virtual const std::string& GetName() const = 0; }; extern const std::string kDefaultColumnFamilyName; @@ -44,27 +59,6 @@ struct ColumnFamilyDescriptor { static const int kMajorVersion = __ROCKSDB_MAJOR__; static const int kMinorVersion = __ROCKSDB_MINOR__; -struct Options; -struct ReadOptions; -struct WriteOptions; -struct FlushOptions; -struct TableProperties; -class WriteBatch; -class Env; - -// Metadata associated with each SST file. -struct LiveFileMetaData { - std::string column_family_name; // Name of the column family - std::string db_path; - std::string name; // Name of the file - int level; // Level at which this file resides. - size_t size; // File size in bytes. - std::string smallestkey; // Smallest user defined key in the file. - std::string largestkey; // Largest user defined key in the file. - SequenceNumber smallest_seqno; // smallest seqno in file - SequenceNumber largest_seqno; // largest seqno in file -}; - // Abstract handle to particular state of a DB. // A Snapshot is an immutable object and can therefore be safely // accessed from multiple threads without any external synchronization. @@ -370,6 +364,26 @@ class DB { return SetOptions(DefaultColumnFamily(), new_options); } + // CompactFiles() inputs a list of files specified by file numbers + // and compacts them to the specified level. 
Note that the behavior
+  // is different from CompactRange in that CompactFiles() will
+  // perform the compaction job using the CURRENT thread.
+  //
+  // @see GetDataBaseMetaData
+  // @see GetColumnFamilyMetaData
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) = 0;
+
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) {
+    return CompactFiles(compact_options, DefaultColumnFamily(),
+                        input_file_names, output_level, output_path_id);
+  }
   // Number of levels used for this DB.
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
@@ -476,6 +490,21 @@ class DB {
   // and end key
   virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {}
 
+  // Obtains the metadata of the specified column family of the DB.
+  // Status::NotFound() will be returned if the current DB does not have
+  // any column family matching the specified name.
+  //
+  // If cf_name is not specified, then the metadata of the default
+  // column family will be returned.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) {}
+
+  // Get the metadata of the default column family.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
+  }
 #endif  // ROCKSDB_LITE
 
   // Sets the globally unique ID created at database creation time by invoking
diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h
index 49a136c07..02bd006f3 100644
--- a/include/rocksdb/immutable_options.h
+++ b/include/rocksdb/immutable_options.h
@@ -90,6 +90,10 @@ struct ImmutableCFOptions {
   Options::AccessHint access_hint_on_compaction_start;
 
   int num_levels;
+
+  // A vector of EventListeners whose call-back functions will be called
+  // when a specific RocksDB event happens.
+  std::vector<std::shared_ptr<EventListener>> listeners;
 };
 
 }  // namespace rocksdb
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
new file mode 100644
index 000000000..33e5fc51f
--- /dev/null
+++ b/include/rocksdb/listener.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class DB;
+class Status;
+
+// EventListener class contains a set of call-back functions that will
+// be called when a specific RocksDB event, such as a flush, happens.
+// It can be used as a building block for developing custom features
+// such as stats collectors or an external compaction algorithm.
+//
+// Note that call-back functions should not run for an extended period of
+// time before the function returns, otherwise RocksDB may be blocked.
+// For example, it is not suggested to do DB::CompactFiles() (as it may
+// run for a long while) or issue many DB::Put() calls (as Put may be
+// blocked in certain cases) in the same thread in the EventListener
+// callback.  However, doing DB::CompactFiles() and DB::Put() in another
+// thread is considered safe.
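As a rough usage sketch of the two DB APIs declared above, an application can read the column family metadata, pick the level-0 files that are not already being compacted, and hand their names to CompactFiles(). The database path, the choice of L0 input and bottommost output level, and the minimal error handling below are illustrative assumptions, not part of the patch:

#include <cstdio>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/compact_files_usage_sketch", &db);
  if (!s.ok()) {
    fprintf(stderr, "Open failed: %s\n", s.ToString().c_str());
    return 1;
  }

  // Inspect the default column family and collect the names of the
  // level-0 files that are not already being compacted.
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(&cf_meta);

  std::vector<std::string> input_file_names;
  for (const auto& file : cf_meta.levels[0].files) {
    if (!file.being_compacted) {
      input_file_names.push_back(file.name);
    }
  }

  if (!input_file_names.empty()) {
    // Compact the selected files to the bottommost level on the calling
    // thread.  CompactionOptions controls the output compression and the
    // output file size.
    rocksdb::CompactionOptions compact_options;
    compact_options.compression = rocksdb::kNoCompression;
    s = db->CompactFiles(compact_options, input_file_names,
                         options.num_levels - 1);
    if (!s.ok()) {
      // The file list can become stale if a flush or another compaction
      // changed the LSM tree between the two calls.
      fprintf(stderr, "CompactFiles failed: %s\n", s.ToString().c_str());
    }
  }

  delete db;
  return 0;
}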
+// +// [Threading] All EventListener callback will be called using the +// actual thread that involves in that specific event. For example, it +// is the RocksDB background flush thread that does the actual flush to +// call EventListener::OnFlushCompleted(). +class EventListener { + public: + // A call-back function to RocksDB which will be called whenever a + // registered RocksDB flushes a file. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just flushed + // a memtable to disk. + // @param column_family_id the id of the flushed column family. + // @param file_path the path to the newly created file. + // @param triggered_writes_slowdown true when rocksdb is currently + // slowing-down all writes to prevent creating too many Level 0 + // files as compaction seems not able to catch up the write request + // speed. This indicates that there're too many files in Level 0. + // @param triggered_writes_stop true when rocksdb is currently blocking + // any writes to prevent creating more L0 files. This indicates that + // there're too many files in level 0. Compactions should try to + // compact L0 files down to lower levels as soon as possible. + virtual void OnFlushCompleted( + DB* db, const std::string& column_family_name, + const std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) {} +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h new file mode 100644 index 000000000..96f70ed85 --- /dev/null +++ b/include/rocksdb/metadata.h @@ -0,0 +1,90 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "rocksdb/types.h" + +#pragma once + +namespace rocksdb { +struct ColumnFamilyMetaData; +struct LevelMetaData; +struct SstFileMetaData; + +// The metadata that describes a column family. +struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), name("") {} + ColumnFamilyMetaData(const std::string& name, uint64_t size, + const std::vector&& levels) : + size(size), name(name), levels(levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. + size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector levels; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int level, uint64_t size, + const std::vector&& files) : + level(level), size(size), + files(files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector files; +}; + +// The metadata that describes a SST file. 
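A minimal sketch of defining an EventListener like the one declared above and registering it through the new listeners option: the FlushCounter class name, its counting logic, and the database path are illustrative assumptions, not part of the patch.

#include <atomic>
#include <cstdio>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/options.h"

// Counts completed flushes.  OnFlushCompleted() runs on the RocksDB
// flush thread, so keep the work done here short and thread-safe.
class FlushCounter : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* db, const std::string& cf_name,
                        const std::string& file_path,
                        bool triggered_writes_slowdown,
                        bool triggered_writes_stop) override {
    flushes_.fetch_add(1);
  }

  int flushes() const { return flushes_.load(); }

 private:
  std::atomic<int> flushes_{0};
};

int main() {
  auto listener = std::make_shared<FlushCounter>();

  rocksdb::Options options;
  options.create_if_missing = true;
  options.listeners.emplace_back(listener);  // registered per column family

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/listener_usage_sketch", &db);
  if (!s.ok()) {
    return 1;
  }

  db->Put(rocksdb::WriteOptions(), "key", "value");
  db->Flush(rocksdb::FlushOptions());  // triggers OnFlushCompleted()

  printf("flushes observed: %d\n", listener->flushes());
  delete db;
  return 0;
}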
+struct SstFileMetaData { + SstFileMetaData() {} + SstFileMetaData(const std::string& file_name, + const std::string& path, uint64_t size, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno, + const std::string& smallestkey, + const std::string& largestkey, + bool being_compacted) : + size(size), name(file_name), + db_path(path), smallest_seqno(smallest_seqno), largest_seqno(largest_seqno), + smallestkey(smallestkey), largestkey(largestkey), + being_compacted(being_compacted) {} + + // File size in bytes. + uint64_t size; + // The name of the file. + std::string name; + // The full path where the file locates. + std::string db_path; + + SequenceNumber smallest_seqno; // Smallest sequence number in file. + SequenceNumber largest_seqno; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + bool being_compacted; // true if the file is currently being compacted. +}; + +// The full set of metadata associated with each SST file. +struct LiveFileMetaData : SstFileMetaData { + std::string column_family_name; // Name of the column family + int level; // Level at which this file resides. +}; + + + +} // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index b3ce77255..1656c5c41 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -10,13 +10,16 @@ #define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ #include +#include #include #include #include +#include #include #include #include "rocksdb/version.h" +#include "rocksdb/listener.h" #include "rocksdb/universal_compaction.h" namespace rocksdb { @@ -55,7 +58,9 @@ enum CompressionType : char { enum CompactionStyle : char { kCompactionStyleLevel = 0x0, // level based compaction style kCompactionStyleUniversal = 0x1, // Universal compaction style - kCompactionStyleFIFO = 0x2, // FIFO compaction style + kCompactionStyleFIFO = 0x2, // FIFO compaction style + kCompactionStyleNone = 0x3, // Disable background compaction. Compaction + // jobs are submitted via CompactFiles() }; @@ -586,6 +591,10 @@ struct ColumnFamilyOptions { // Default: 2 uint32_t min_partial_merge_operands; + // A vector of EventListeners which call-back functions will be called + // when specific RocksDB event happens. + std::vector> listeners; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -1067,6 +1076,19 @@ extern Options GetOptions(size_t total_write_buffer_limit, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); +// CompactionOptions are used in CompactFiles() call. +struct CompactionOptions { + // Compaction output compression type + // Default: snappy + CompressionType compression; + // Compaction will create files of size `output_file_size_limit`. 
+ // Default: MAX, which means that compaction will create a single file + uint64_t output_file_size_limit; + + CompactionOptions() + : compression(kSnappyCompression), + output_file_size_limit(std::numeric_limits::max()) {} +}; } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 4be30c1f4..177d705f3 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -61,6 +61,9 @@ class Status { static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIncomplete, msg, msg2); } + static Status ShutdownInProgress() { + return Status(kShutdownInProgress); + } static Status ShutdownInProgress(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kShutdownInProgress, msg, msg2); @@ -71,6 +74,12 @@ class Status { static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kTimedOut, msg, msg2); } + static Status Aborted() { + return Status(kAborted); + } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } // Returns true iff the status indicates success. bool ok() const { return code() == kOk; } @@ -101,6 +110,8 @@ class Status { bool IsTimedOut() const { return code() == kTimedOut; } + bool IsAborted() const { return code() == kAborted; } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; @@ -115,7 +126,8 @@ class Status { kMergeInProgress = 6, kIncomplete = 7, kShutdownInProgress = 8, - kTimedOut = 9 + kTimedOut = 9, + kAborted = 10 }; Code code() const { diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 50c6a6484..7bdf9928e 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -133,6 +133,17 @@ class StackableDB : public DB { target_level, target_path_id); } + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return db_->CompactFiles( + compact_options, column_family, input_file_names, + output_level, output_path_id); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -170,6 +181,8 @@ class StackableDB : public DB { return db_->Flush(fopts, column_family); } +#ifndef ROCKSDB_LITE + virtual Status DisableFileDeletions() override { return db_->DisableFileDeletions(); } @@ -183,6 +196,14 @@ class StackableDB : public DB { db_->GetLiveFilesMetaData(metadata); } + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle *column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + +#endif // ROCKSDB_LITE + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, bool flush_memtable = true) override { return db_->GetLiveFiles(vec, mfs, flush_memtable); diff --git a/util/options.cc b/util/options.cc index 03ffb0a6d..bdcdcdf2b 100644 --- a/util/options.cc +++ b/util/options.cc @@ -64,7 +64,8 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) compression_per_level(options.compression_per_level), compression_opts(options.compression_opts), access_hint_on_compaction_start(options.access_hint_on_compaction_start), - 
num_levels(options.num_levels) {} + num_levels(options.num_levels), + listeners(options.listeners) {} ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), @@ -112,7 +113,8 @@ ColumnFamilyOptions::ColumnFamilyOptions() memtable_prefix_bloom_huge_page_tlb_size(0), bloom_locality(0), max_successive_merges(0), - min_partial_merge_operands(2) { + min_partial_merge_operands(2), + listeners() { assert(memtable_factory.get() != nullptr); } @@ -172,7 +174,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) options.memtable_prefix_bloom_huge_page_tlb_size), bloom_locality(options.bloom_locality), max_successive_merges(options.max_successive_merges), - min_partial_merge_operands(options.min_partial_merge_operands) { + min_partial_merge_operands(options.min_partial_merge_operands), + listeners(options.listeners) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { diff --git a/util/status.cc b/util/status.cc index 8eca3a5a8..fa8e18acf 100644 --- a/util/status.cc +++ b/util/status.cc @@ -70,6 +70,9 @@ std::string Status::ToString() const { case kTimedOut: type = "Operation timed out: "; break; + case kAborted: + type = "Operation aborted: "; + break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast(code())); From 844786189632d888c3250e069e61efa03933edaa Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 14:57:51 -0800 Subject: [PATCH 434/829] Fixed -WShadow errors in db/db_test.cc and include/rocksdb/metadata.h Summary: Fixed -WShadow errors in db/db_test.cc and include/rocksdb/metadata.h Test Plan: make --- db/db_test.cc | 16 ++++++++-------- include/rocksdb/metadata.h | 36 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 8b018715c..4ae34ff9c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8333,9 +8333,9 @@ namespace { } // namespace TEST(DBTest, CompactFilesOnLevelCompaction) { - const int kKeySize = 16; - const int kValueSize = 984; - const int kEntrySize = kKeySize + kValueSize; + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; const int kEntriesPerBuffer = 100; Options options; options.create_if_missing = true; @@ -8351,7 +8351,7 @@ TEST(DBTest, CompactFilesOnLevelCompaction) { Random rnd(301); for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kTestValueSize))); } dbfull()->TEST_WaitForFlushMemTable(handles_[1]); dbfull()->TEST_WaitForCompact(); @@ -8388,9 +8388,9 @@ TEST(DBTest, CompactFilesOnLevelCompaction) { } TEST(DBTest, CompactFilesOnUniversalCompaction) { - const int kKeySize = 16; - const int kValueSize = 984; - const int kEntrySize = kKeySize + kValueSize; + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; const int kEntriesPerBuffer = 10; ChangeCompactOptions(); @@ -8405,7 +8405,7 @@ TEST(DBTest, CompactFilesOnUniversalCompaction) { ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); Random rnd(301); for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kValueSize))); + ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kTestValueSize))); } 
dbfull()->TEST_WaitForFlushMemTable(handles_[1]); dbfull()->TEST_WaitForCompact(); diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 96f70ed85..e026fa96e 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -19,9 +19,9 @@ struct SstFileMetaData; // The metadata that describes a column family. struct ColumnFamilyMetaData { ColumnFamilyMetaData() : size(0), name("") {} - ColumnFamilyMetaData(const std::string& name, uint64_t size, - const std::vector&& levels) : - size(size), name(name), levels(levels) {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector&& _levels) : + size(_size), name(_name), levels(_levels) {} // The size of this column family in bytes, which is equal to the sum of // the file size of its "levels". @@ -36,10 +36,10 @@ struct ColumnFamilyMetaData { // The metadata that describes a level. struct LevelMetaData { - LevelMetaData(int level, uint64_t size, - const std::vector&& files) : - level(level), size(size), - files(files) {} + LevelMetaData(int _level, uint64_t _size, + const std::vector&& _files) : + level(_level), size(_size), + files(_files) {} // The level which this meta data describes. const int level; @@ -53,17 +53,17 @@ struct LevelMetaData { // The metadata that describes a SST file. struct SstFileMetaData { SstFileMetaData() {} - SstFileMetaData(const std::string& file_name, - const std::string& path, uint64_t size, - SequenceNumber smallest_seqno, - SequenceNumber largest_seqno, - const std::string& smallestkey, - const std::string& largestkey, - bool being_compacted) : - size(size), name(file_name), - db_path(path), smallest_seqno(smallest_seqno), largest_seqno(largest_seqno), - smallestkey(smallestkey), largestkey(largestkey), - being_compacted(being_compacted) {} + SstFileMetaData(const std::string& _file_name, + const std::string& _path, uint64_t _size, + SequenceNumber _smallest_seqno, + SequenceNumber _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, + bool _being_compacted) : + size(_size), name(_file_name), + db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), + smallestkey(_smallestkey), largestkey(_largestkey), + being_compacted(_being_compacted) {} // File size in bytes. uint64_t size; From 68effa0348b3368a83ae67ae2245ed9fa2912d18 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 15:04:30 -0800 Subject: [PATCH 435/829] Fix -Wshadow for tools Summary: Previously I made `make check` work with -Wshadow, but there are some tools that are not compiled using `make check`. 
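The renames in this patch and the previous one follow a single pattern: a constructor parameter that shares its name with the member it initializes triggers -Wshadow, and the fix is to prefix the parameter with an underscore. A minimal illustration (the structs below are an editorial example under that assumption, not code from the patch):

#include <cstdint>

// Before: the constructor parameter `size` shadows the member `size`;
// gcc and clang report this under -Wshadow.
struct FileInfoBefore {
  explicit FileInfoBefore(uint64_t size) : size(size) {}
  uint64_t size;
};

// After: prefixing the parameter with an underscore, as done throughout
// these patches, silences the warning without changing behavior.
struct FileInfoAfter {
  explicit FileInfoAfter(uint64_t _size) : size(_size) {}
  uint64_t size;
};

int main() {
  FileInfoBefore before(1);
  FileInfoAfter after(2);
  return (before.size == 1 && after.size == 2) ? 0 : 1;
}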
Test Plan: make all Reviewers: yhchiang, rven, ljin, sdong Reviewed By: ljin, sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28497 --- db/perf_context_test.cc | 38 ++++++++++++++++++++----------------- table/table_reader_bench.cc | 2 +- tools/blob_store_bench.cc | 24 +++++++++++++---------- tools/db_stress.cc | 7 ++----- tools/sst_dump.cc | 1 - util/cache_bench.cc | 6 ++---- 6 files changed, 40 insertions(+), 38 deletions(-) diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 9d34409c3..6669aaec3 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -88,22 +88,27 @@ TEST(PerfContextTest, SeekIntoDeletion) { hist_get_time.Add(elapsed_nanos); } - std::cout << "Get uesr key comparison: \n" << hist_get.ToString() + std::cout << "Get user key comparison: \n" << hist_get.ToString() << "Get time: \n" << hist_get_time.ToString(); - HistogramImpl hist_seek_to_first; - std::unique_ptr iter(db->NewIterator(read_options)); + { + HistogramImpl hist_seek_to_first; + std::unique_ptr iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->SeekToFirst(); - hist_seek_to_first.Add(perf_context.user_key_comparison_count); - auto elapsed_nanos = timer.ElapsedNanos(); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->SeekToFirst(); + hist_seek_to_first.Add(perf_context.user_key_comparison_count); + auto elapsed_nanos = timer.ElapsedNanos(); - std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString() - << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n" - << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n" - << "elapsed: " << elapsed_nanos << "\n"; + std::cout << "SeekToFirst uesr key comparison: \n" + << hist_seek_to_first.ToString() + << "ikey skipped: " << perf_context.internal_key_skipped_count + << "\n" + << "idelete skipped: " + << perf_context.internal_delete_skipped_count << "\n" + << "elapsed: " << elapsed_nanos << "\n"; + } HistogramImpl hist_seek; for (int i = 0; i < FLAGS_total_keys; ++i) { @@ -224,7 +229,6 @@ void ProfileQueries(bool enabled_time = false) { std::string key = "k" + std::to_string(i); std::string value = "v" + std::to_string(i); - std::vector keys = {Slice(key)}; std::vector values; perf_context.Reset(); @@ -239,7 +243,7 @@ void ProfileQueries(bool enabled_time = false) { std::string key = "k" + std::to_string(i); std::string value = "v" + std::to_string(i); - std::vector keys = {Slice(key)}; + std::vector multiget_keys = {Slice(key)}; std::vector values; perf_context.Reset(); @@ -252,7 +256,7 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(perf_context.user_key_comparison_count); perf_context.Reset(); - db->MultiGet(read_options, keys, &values); + db->MultiGet(read_options, multiget_keys, &values); hist_mget_snapshot.Add(perf_context.get_snapshot_time); hist_mget_memtable.Add(perf_context.get_from_memtable_time); hist_mget_files.Add(perf_context.get_from_output_files_time); @@ -329,7 +333,7 @@ void ProfileQueries(bool enabled_time = false) { std::string key = "k" + std::to_string(i); std::string value = "v" + std::to_string(i); - std::vector keys = {Slice(key)}; + std::vector multiget_keys = {Slice(key)}; std::vector values; perf_context.Reset(); @@ -342,7 +346,7 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(perf_context.user_key_comparison_count); perf_context.Reset(); - db->MultiGet(read_options, keys, &values); + 
db->MultiGet(read_options, multiget_keys, &values); hist_mget_snapshot.Add(perf_context.get_snapshot_time); hist_mget_memtable.Add(perf_context.get_from_memtable_time); hist_mget_files.Add(perf_context.get_from_output_files_time); diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index e6960f751..ea722a8bf 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -115,7 +115,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, unique_ptr table_reader; unique_ptr raf; if (!through_db) { - Status s = env->NewRandomAccessFile(file_name, &raf, env_options); + s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc index 60a0b84a6..7b820a178 100644 --- a/tools/blob_store_bench.cc +++ b/tools/blob_store_bench.cc @@ -59,11 +59,13 @@ struct Result { writes = reads = deletes = data_read = data_written = 0; } - Result (uint32_t writes, uint32_t reads, uint32_t deletes, - uint64_t data_written, uint64_t data_read) : - writes(writes), reads(reads), deletes(deletes), - data_written(data_written), data_read(data_read) {} - + Result(uint32_t _writes, uint32_t _reads, uint32_t _deletes, + uint64_t _data_written, uint64_t _data_read) + : writes(_writes), + reads(_reads), + deletes(_deletes), + data_written(_data_written), + data_read(_data_read) {} }; namespace { @@ -81,11 +83,13 @@ struct WorkerThread { Result result; atomic stopped; - WorkerThread(uint64_t data_size_from, uint64_t data_size_to, - double read_ratio, uint64_t working_set_size) : - data_size_from(data_size_from), data_size_to(data_size_to), - read_ratio(read_ratio), working_set_size(working_set_size), - stopped(false) {} + WorkerThread(uint64_t _data_size_from, uint64_t _data_size_to, + double _read_ratio, uint64_t _working_set_size) + : data_size_from(_data_size_from), + data_size_to(_data_size_to), + read_ratio(_read_ratio), + working_set_size(_working_set_size), + stopped(false) {} WorkerThread(const WorkerThread& wt) : data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 42d0fb534..a0d6ea723 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -751,11 +751,8 @@ struct ThreadState { SharedState* shared; Stats stats; - ThreadState(uint32_t index, SharedState *shared) - : tid(index), - rand(1000 + index + shared->GetSeed()), - shared(shared) { - } + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rand(1000 + index + shared->GetSeed()), shared(_shared) {} }; } // namespace diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 6c496e8dd..03da2b840 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -378,7 +378,6 @@ int main(int argc, char** argv) { } rocksdb::SstFileReader reader(filename, verify_checksum, output_hex); - rocksdb::Status st; // scan all files in give file path. 
if (command == "" || command == "scan" || command == "check") { st = reader.ReadSequential(command != "check", diff --git a/util/cache_bench.cc b/util/cache_bench.cc index 3d006ecf8..92df77267 100644 --- a/util/cache_bench.cc +++ b/util/cache_bench.cc @@ -122,10 +122,8 @@ struct ThreadState { Random rnd; SharedState* shared; - ThreadState(uint32_t index, SharedState *shared) - : tid(index), - rnd(1000 + index), - shared(shared) {} + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} }; } // namespace From 642ac9d8ab56826d634c243059c39b5a0dbe3206 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 15:08:12 -0800 Subject: [PATCH 436/829] Fixed compile error in db/compaction.cc and db/compaction_picker.cc Summary: Fixed compile error in db/compaction.cc and db/compaction_picker.cc Test Plan: make --- db/compaction.cc | 2 +- db/compaction_picker.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index 00513f533..7a8881479 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -89,7 +89,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage, max_output_file_size_(options.output_file_size_limit), max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), input_version_(nullptr), // TODO(yhchiang): set it later - number_levels_(vstorage->NumberLevels()), + number_levels_(vstorage->num_levels()), cfd_(nullptr), output_compression_(options.compression), seek_compaction_(false), diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index f5207748b..04d04dc16 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -201,7 +201,7 @@ Compaction* CompactionPicker::FormCompaction( int output_level, VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options) const { uint64_t max_grandparent_overlap_bytes = - output_level + 1 < vstorage->NumberLevels() ? + output_level + 1 < vstorage->num_levels() ? mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) : std::numeric_limits::max(); assert(input_files.size()); @@ -221,7 +221,7 @@ Compaction* CompactionPicker::FormCompaction( // the oldest file is involved. c->SetupBottomMostLevel( vstorage, - (output_level == vstorage->NumberLevels() - 1), + (output_level == vstorage->num_levels() - 1), (output_level == 0)); return c; } @@ -238,12 +238,12 @@ Status CompactionPicker::GetCompactionInputsFromFileNumbers( assert(input_files); autovector matched_input_files; - matched_input_files.resize(vstorage->NumberLevels()); + matched_input_files.resize(vstorage->num_levels()); int first_non_empty_level = -1; int last_non_empty_level = -1; // TODO(yhchiang): use a lazy-initialized mapping from // file_number to FileMetaData in Version. 
- for (int level = 0; level < vstorage->NumberLevels(); ++level) { + for (int level = 0; level < vstorage->num_levels(); ++level) { for (auto file : vstorage->LevelFiles(level)) { auto iter = input_set->find(file->fd.GetNumber()); if (iter != input_set->end()) { From b622ba5d6b28c37118002bb82615ed5613c98063 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 15:11:36 -0800 Subject: [PATCH 437/829] Fixed compile error in db/flush_job.cc Summary: Fixed compile error in db/flush_job.cc Test Plan: make --- db/flush_job.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/flush_job.cc b/db/flush_job.cc index a3079d2df..74daf240b 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -105,11 +105,11 @@ Status FlushJob::Run(uint64_t* file_number) { } if (!s.ok()) { - cfd_->imm()->RollbackMemtableFlush(mems, file_number); + cfd_->imm()->RollbackMemtableFlush(mems, fn); } else { // Replace immutable memtable with the generated Table s = cfd_->imm()->InstallMemtableFlushResults( - cfd_, mutable_cf_options_, mems, versions_, db_mutex_, file_number, + cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn, &job_context_->memtables_to_free, db_directory_, log_buffer_); } From b8b39034292c3cec82d9af61982b4b12614b1758 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 15:13:01 -0800 Subject: [PATCH 438/829] Fixed compile error in db/db_impl.cc Summary: Fixed compile error in db/db_impl.cc Test Plan: make --- db/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 0c218fb03..9db1da3b6 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1287,7 +1287,7 @@ Status DBImpl::CompactFilesImpl( }; CompactionJob compaction_job( c.get(), db_options_, *c->mutable_cf_options(), env_options_, - versions_.get(), &mutex_, &shutting_down_, &pending_outputs_, + versions_.get(), &mutex_, &shutting_down_, &log_buffer, db_directory_.get(), stats_, &snapshots_, IsSnapshotSupported(), table_cache_, std::move(yield_callback)); compaction_job.Prepare(); From 344edbb044ff5c08a43e4a6e9344c5c861552c0e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 7 Nov 2014 15:22:10 -0800 Subject: [PATCH 439/829] Fixed the shadowing in db/compaction.cc and include/rocksdb/db.h Summary: Fixed the shadowing in db/compaction.cc and include/rocksdb/db.h Test Plan: make --- db/compaction.cc | 26 +++++++++++++------------- include/rocksdb/db.h | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index 7a8881479..98de352bc 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -79,22 +79,22 @@ Compaction::Compaction(int number_levels, int start_level, int out_level, } Compaction::Compaction(VersionStorageInfo* vstorage, - const autovector& inputs, - int start_level, int output_level, - uint64_t max_grandparent_overlap_bytes, - const CompactionOptions& options, - bool deletion_compaction) - : start_level_(start_level), - output_level_(output_level), - max_output_file_size_(options.output_file_size_limit), - max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), - input_version_(nullptr), // TODO(yhchiang): set it later + const autovector& _inputs, + int _start_level, int _output_level, + uint64_t _max_grandparent_overlap_bytes, + const CompactionOptions& _options, + bool _deletion_compaction) + : start_level_(_start_level), + output_level_(_output_level), + max_output_file_size_(_options.output_file_size_limit), + 
max_grandparent_overlap_bytes_(_max_grandparent_overlap_bytes), + input_version_(nullptr), number_levels_(vstorage->num_levels()), cfd_(nullptr), - output_compression_(options.compression), + output_compression_(_options.compression), seek_compaction_(false), - deletion_compaction_(deletion_compaction), - inputs_(inputs), + deletion_compaction_(_deletion_compaction), + inputs_(_inputs), grandparent_index_(0), seen_key_(false), overlapped_bytes_(0), diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 3025d7ebc..04460ad9e 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -501,7 +501,7 @@ class DB { ColumnFamilyMetaData* metadata) {} // Get the metadata of the default column family. - virtual void GetColumnFamilyMetaData( + void GetColumnFamilyMetaData( ColumnFamilyMetaData* metadata) { GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } From e3d3567b5b4feb034ae14e23d9802431f2626ee5 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 7 Nov 2014 15:44:12 -0800 Subject: [PATCH 440/829] Get rid of mutex in CompactionJob's state Summary: Based on @sdong's feedback in the diff, we shouldn't keep db_mutex in CompactionJob's state. This diff removes db_mutex from CompactionJob state, by making next_file_number_ atomic. That way we only need to pass the lock to InstallCompactionResults() because of LogAndApply() Test Plan: make check Reviewers: ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: sdong, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28491 --- db/compaction_job.cc | 57 ++++++++++---------------------------------- db/compaction_job.h | 13 +++++----- db/db_impl.cc | 28 +++++++++++----------- db/version_set.cc | 28 ++++++++++++---------- db/version_set.h | 16 ++++++------- 5 files changed, 55 insertions(+), 87 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 04c351d77..3395085a9 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -71,7 +71,6 @@ struct CompactionJob::CompactionState { SequenceNumber smallest_seqno, largest_seqno; }; std::vector outputs; - std::list allocated_file_numbers; // State kept for output being generated std::unique_ptr outfile; @@ -204,10 +203,10 @@ struct CompactionJob::CompactionState { CompactionJob::CompactionJob( Compaction* compaction, const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, - VersionSet* versions, port::Mutex* db_mutex, - std::atomic* shutting_down, LogBuffer* log_buffer, - Directory* db_directory, Statistics* stats, SnapshotList* snapshots, - bool is_snapshot_supported, std::shared_ptr table_cache, + VersionSet* versions, std::atomic* shutting_down, + LogBuffer* log_buffer, Directory* db_directory, Statistics* stats, + SnapshotList* snapshots, bool is_snapshot_supported, + std::shared_ptr table_cache, std::function yield_callback) : compact_(new CompactionState(compaction)), compaction_stats_(1), @@ -216,7 +215,6 @@ CompactionJob::CompactionJob( env_options_(env_options), env_(db_options.env), versions_(versions), - db_mutex_(db_mutex), shutting_down_(shutting_down), log_buffer_(log_buffer), db_directory_(db_directory), @@ -227,7 +225,6 @@ CompactionJob::CompactionJob( yield_callback_(std::move(yield_callback)) {} void CompactionJob::Prepare() { - db_mutex_->AssertHeld(); compact_->CleanupBatchBuffer(); compact_->CleanupMergedBuffer(); @@ -267,9 +264,6 @@ void CompactionJob::Prepare() { // Is this compaction producing files at the bottommost level? 
bottommost_level_ = compact_->compaction->BottomMostLevel(); - - // Allocate the output file numbers before we release the lock - AllocateCompactionOutputFileNumbers(); } Status CompactionJob::Run() { @@ -461,14 +455,14 @@ Status CompactionJob::Run() { return status; } -Status CompactionJob::Install(Status status) { - db_mutex_->AssertHeld(); +Status CompactionJob::Install(Status status, port::Mutex* db_mutex) { + db_mutex->AssertHeld(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), compaction_stats_); if (status.ok()) { - status = InstallCompactionResults(); + status = InstallCompactionResults(db_mutex); } VersionStorageInfo::LevelSummaryStorage tmp; const auto& stats = compaction_stats_; @@ -496,19 +490,6 @@ Status CompactionJob::Install(Status status) { return status; } -// Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1 (at least one) -// Insert them into pending_outputs so that they do not get deleted. -void CompactionJob::AllocateCompactionOutputFileNumbers() { - db_mutex_->AssertHeld(); - assert(compact_->builder == nullptr); - int filesNeeded = compact_->compaction->num_input_files(1); - for (int i = 0; i < std::max(filesNeeded, 1); i++) { - uint64_t file_number = versions_->NewFileNumber(); - compact_->allocated_file_numbers.push_back(file_number); - } -} - Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input, bool is_compaction_v2) { @@ -958,8 +939,8 @@ Status CompactionJob::FinishCompactionOutputFile(Iterator* input) { return s; } -Status CompactionJob::InstallCompactionResults() { - db_mutex_->AssertHeld(); +Status CompactionJob::InstallCompactionResults(port::Mutex* db_mutex) { + db_mutex->AssertHeld(); // paranoia: verify that the files that we started with // still exist in the current version and in the same original level. @@ -995,7 +976,7 @@ Status CompactionJob::InstallCompactionResults() { } return versions_->LogAndApply( compact_->compaction->column_family_data(), mutable_cf_options_, - compact_->compaction->edit(), db_mutex_, db_directory_); + compact_->compaction->edit(), db_mutex, db_directory_); } // Given a sequence number, return the sequence number of the @@ -1036,21 +1017,8 @@ void CompactionJob::RecordCompactionIOStats() { Status CompactionJob::OpenCompactionOutputFile() { assert(compact_ != nullptr); assert(compact_->builder == nullptr); - uint64_t file_number; - // If we have not yet exhausted the pre-allocated file numbers, - // then use the one from the front. Otherwise, we have to acquire - // the heavyweight lock and allocate a new file number. - if (!compact_->allocated_file_numbers.empty()) { - file_number = compact_->allocated_file_numbers.front(); - compact_->allocated_file_numbers.pop_front(); - } else { - db_mutex_->Lock(); - // TODO(icanadi) make Versions::next_file_number_ atomic and remove db_lock - // around here. Once we do that, AllocateCompactionOutputFileNumbers() will - // not be needed. 
- file_number = versions_->NewFileNumber(); - db_mutex_->Unlock(); - } + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); // Make the output file std::string fname = TableFileName(db_options_.db_paths, file_number, compact_->compaction->GetOutputPathId()); @@ -1087,7 +1055,6 @@ Status CompactionJob::OpenCompactionOutputFile() { } void CompactionJob::CleanupCompaction(Status status) { - db_mutex_->AssertHeld(); if (compact_->builder != nullptr) { // May happen if we get a shutdown call in the middle of compaction compact_->builder->Abandon(); diff --git a/db/compaction_job.h b/db/compaction_job.h index 45d438156..e993ea675 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -56,10 +56,10 @@ class CompactionJob { CompactionJob(Compaction* compaction, const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, - port::Mutex* db_mutex, std::atomic* shutting_down, - LogBuffer* log_buffer, Directory* db_directory, - Statistics* stats, SnapshotList* snapshot_list, - bool is_snapshot_supported, std::shared_ptr table_cache, + std::atomic* shutting_down, LogBuffer* log_buffer, + Directory* db_directory, Statistics* stats, + SnapshotList* snapshot_list, bool is_snapshot_supported, + std::shared_ptr table_cache, std::function yield_callback); ~CompactionJob() { assert(compact_ == nullptr); } @@ -75,7 +75,7 @@ class CompactionJob { Status Run(); // REQUIRED: mutex held // status is the return of Run() - Status Install(Status status); + Status Install(Status status, port::Mutex* db_mutex); private: void AllocateCompactionOutputFileNumbers(); @@ -86,7 +86,7 @@ class CompactionJob { // Call compaction_filter_v2->Filter() on kv-pairs in compact void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2); Status FinishCompactionOutputFile(Iterator* input); - Status InstallCompactionResults(); + Status InstallCompactionResults(port::Mutex* db_mutex); SequenceNumber findEarliestVisibleSnapshot( SequenceNumber in, const std::vector& snapshots, SequenceNumber* prev_snapshot); @@ -111,7 +111,6 @@ class CompactionJob { const EnvOptions& env_options_; Env* env_; VersionSet* versions_; - port::Mutex* db_mutex_; std::atomic* shutting_down_; LogBuffer* log_buffer_; Directory* db_directory_; diff --git a/db/db_impl.cc b/db/db_impl.cc index 9db1da3b6..8ac509249 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -836,7 +836,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. - versions_->MarkFileNumberUsed(log_number); + versions_->MarkFileNumberUsedDuringRecovery(log_number); // Open the log file std::string fname = LogFileName(db_options_.wal_dir, log_number); unique_ptr file; @@ -970,7 +970,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); + versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); status = versions_->LogAndApply( cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); if (!status.ok()) { @@ -1285,18 +1285,18 @@ Status DBImpl::CompactFilesImpl( *c->mutable_cf_options(), &job_context, &log_buffer); }; - CompactionJob compaction_job( - c.get(), db_options_, *c->mutable_cf_options(), env_options_, - versions_.get(), &mutex_, &shutting_down_, - &log_buffer, db_directory_.get(), stats_, &snapshots_, - IsSnapshotSupported(), table_cache_, std::move(yield_callback)); + CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), + env_options_, versions_.get(), &shutting_down_, + &log_buffer, db_directory_.get(), stats_, + &snapshots_, IsSnapshotSupported(), table_cache_, + std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); Status status = compaction_job.Run(); mutex_.Lock(); if (status.ok()) { - status = compaction_job.Install(status); + status = compaction_job.Install(status, &mutex_); if (status.ok()) { InstallSuperVersionBackground(c->column_family_data(), &job_context, *c->mutable_cf_options()); @@ -2061,16 +2061,16 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, *c->mutable_cf_options(), job_context, log_buffer); }; - CompactionJob compaction_job( - c.get(), db_options_, *c->mutable_cf_options(), env_options_, - versions_.get(), &mutex_, &shutting_down_, log_buffer, - db_directory_.get(), stats_, &snapshots_, IsSnapshotSupported(), - table_cache_, std::move(yield_callback)); + CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), + env_options_, versions_.get(), &shutting_down_, + log_buffer, db_directory_.get(), stats_, + &snapshots_, IsSnapshotSupported(), + table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); status = compaction_job.Run(); mutex_.Lock(); - status = compaction_job.Install(status); + status = compaction_job.Install(status, &mutex_); if (status.ok()) { InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); diff --git a/db/version_set.cc b/db/version_set.cc index 1c34b56a5..b2b63eb33 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1613,7 +1613,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!descriptor_log_ || manifest_file_size_ > db_options_->max_manifest_file_size) { pending_manifest_file_number_ = NewFileNumber(); - batch_edits.back()->SetNextFile(next_file_number_); + batch_edits.back()->SetNextFile(next_file_number_.load()); new_descriptor_log = true; } else { pending_manifest_file_number_ = manifest_file_number_; @@ -1814,7 +1814,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { assert(edit->IsColumnFamilyManipulation()); - edit->SetNextFile(next_file_number_); + edit->SetNextFile(next_file_number_.load()); edit->SetLastSequence(last_sequence_); if (edit->is_column_family_drop_) { // if we drop column family, we have to make sure to save max column family, @@ -1831,13 +1831,13 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, if (edit->has_log_number_) { assert(edit->log_number_ >= cfd->GetLogNumber()); - assert(edit->log_number_ < next_file_number_); + assert(edit->log_number_ < next_file_number_.load()); } if 
(!edit->has_prev_log_number_) { edit->SetPrevLogNumber(prev_log_number_); } - edit->SetNextFile(next_file_number_); + edit->SetNextFile(next_file_number_.load()); edit->SetLastSequence(last_sequence_); builder->Apply(edit); @@ -2064,8 +2064,8 @@ Status VersionSet::Recover( column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); + MarkFileNumberUsedDuringRecovery(previous_log_number); + MarkFileNumberUsedDuringRecovery(log_number); } // there were some column families in the MANIFEST that weren't specified @@ -2105,7 +2105,7 @@ Status VersionSet::Recover( } manifest_file_size_ = current_manifest_file_size; - next_file_number_ = next_file + 1; + next_file_number_.store(next_file + 1); last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; @@ -2116,7 +2116,7 @@ Status VersionSet::Recover( "prev_log_number is %lu," "max_column_family is %u\n", manifest_filename.c_str(), (unsigned long)manifest_file_number_, - (unsigned long)next_file_number_, (unsigned long)last_sequence_, + (unsigned long)next_file_number_.load(), (unsigned long)last_sequence_, (unsigned long)log_number, (unsigned long)prev_log_number_, column_family_set_->GetMaxColumnFamily()); @@ -2452,14 +2452,14 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, delete v; } - next_file_number_ = next_file + 1; + next_file_number_.store(next_file + 1); last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; printf( "next_file_number %lu last_sequence " "%lu prev_log_number %lu max_column_family %u\n", - (unsigned long)next_file_number_, (unsigned long)last_sequence, + (unsigned long)next_file_number_.load(), (unsigned long)last_sequence, (unsigned long)previous_log_number, column_family_set_->GetMaxColumnFamily()); } @@ -2468,9 +2468,11 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } #endif // ROCKSDB_LITE -void VersionSet::MarkFileNumberUsed(uint64_t number) { - if (next_file_number_ <= number) { - next_file_number_ = number + 1; +void VersionSet::MarkFileNumberUsedDuringRecovery(uint64_t number) { + // only called during recovery which is single threaded, so this works because + // there can't be concurrent calls + if (next_file_number_.load(std::memory_order_relaxed) <= number) { + next_file_number_.store(number + 1, std::memory_order_relaxed); } } diff --git a/db/version_set.h b/db/version_set.h index e0d166818..0be8c4e1b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -532,19 +532,18 @@ class VersionSet { return pending_manifest_file_number_; } - // REQUIRED: mutex locked - uint64_t current_next_file_number() const { return next_file_number_; } + uint64_t current_next_file_number() const { return next_file_number_.load(); } // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1) + 1; } // Arrange to reuse "file_number" unless a newer file number has // already been allocated. // REQUIRES: "file_number" was returned by a call to NewFileNumber(). void ReuseLogFileNumber(uint64_t file_number) { - if (next_file_number_ == file_number + 1) { - next_file_number_ = file_number; - } + auto expected = file_number + 1; + std::atomic_compare_exchange_strong(&next_file_number_, &expected, + file_number); } // Return the last sequence number. @@ -559,7 +558,8 @@ class VersionSet { } // Mark the specified file number as used. 
- void MarkFileNumberUsed(uint64_t number); + // REQUIRED: this is only called during single-threaded recovery + void MarkFileNumberUsedDuringRecovery(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. @@ -636,7 +636,7 @@ class VersionSet { Env* const env_; const std::string dbname_; const DBOptions* const db_options_; - uint64_t next_file_number_; + std::atomic next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; std::atomic last_sequence_; From 543df158c56eea062786a12b0ece14d50f154472 Mon Sep 17 00:00:00 2001 From: Federico Piccinini Date: Fri, 7 Nov 2014 17:23:58 -0800 Subject: [PATCH 441/829] Expose sst_dump functionality as library call. Summary: Refactor sst_dump to follow the same structure as ldb. Introduce a SSTDump interface. Test Plan: built sst_dump and tried it manually. Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28485 --- include/rocksdb/sst_dump_tool.h | 17 ++ tools/sst_dump.cc | 416 +------------------------------ util/sst_dump_tool.cc | 426 ++++++++++++++++++++++++++++++++ 3 files changed, 447 insertions(+), 412 deletions(-) create mode 100644 include/rocksdb/sst_dump_tool.h create mode 100644 util/sst_dump_tool.cc diff --git a/include/rocksdb/sst_dump_tool.h b/include/rocksdb/sst_dump_tool.h new file mode 100644 index 000000000..e3ee2a9c8 --- /dev/null +++ b/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE +#pragma once + +namespace rocksdb { + +class SSTDumpTool { + public: + void Run(int argc, char** argv); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 03da2b840..7a83b60b3 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -4,418 +4,10 @@ // of patent rights can be found in the PATENTS file in the same directory. 
// -#include -#include -#include -#include - -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" -#include "table/meta_blocks.h" -#include "table/block.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/ldb_cmd.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class SstFileReader { - public: - explicit SstFileReader(const std::string& file_name, - bool verify_checksum, - bool output_hex); - - Status ReadSequential(bool print_kv, - uint64_t read_num, - bool has_from, - const std::string& from_key, - bool has_to, - const std::string& to_key); - - Status ReadTableProperties( - std::shared_ptr* table_properties); - uint64_t GetReadNumber() { return read_num_; } - TableProperties* GetInitTableProperties() { return table_properties_.get(); } - - private: - Status NewTableReader(const std::string& file_path); - Status ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, uint64_t file_size); - Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); - Status SetOldTableOptions(); - - std::string file_name_; - uint64_t read_num_; - bool verify_checksum_; - bool output_hex_; - EnvOptions soptions_; - - Status init_result_; - unique_ptr table_reader_; - unique_ptr file_; - // options_ and internal_comparator_ will also be used in - // ReadSequential internally (specifically, seek-related operations) - Options options_; - const ImmutableCFOptions ioptions_; - InternalKeyComparator internal_comparator_; - unique_ptr table_properties_; -}; - -SstFileReader::SstFileReader(const std::string& file_path, - bool verify_checksum, - bool output_hex) - :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex), ioptions_(options_), - internal_comparator_(BytewiseComparator()) { - fprintf(stdout, "Process %s\n", file_path.c_str()); - - init_result_ = NewTableReader(file_name_); -} - -extern uint64_t kBlockBasedTableMagicNumber; -extern uint64_t kLegacyBlockBasedTableMagicNumber; -extern uint64_t kPlainTableMagicNumber; -extern uint64_t kLegacyPlainTableMagicNumber; - -Status SstFileReader::NewTableReader(const std::string& file_path) { - uint64_t magic_number; - - // read table magic number - Footer footer; - - unique_ptr file; - uint64_t file_size; - Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); - if (s.ok()) { - s = options_.env->GetFileSize(file_path, &file_size); - } - if (s.ok()) { - s = ReadFooterFromFile(file_.get(), file_size, &footer); - } - if (s.ok()) { - magic_number = footer.table_magic_number(); - } - - if (s.ok()) { - if (magic_number == kPlainTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber) { - soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file_, soptions_); - } - options_.comparator = &internal_comparator_; - // For old sst format, ReadTableProperties might fail but file can be read - if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) { - SetTableOptionsByMagicNumber(magic_number); - } else { - SetOldTableOptions(); - } - } - - if (s.ok()) { - s = options_.table_factory->NewTableReader( - ioptions_, soptions_, 
internal_comparator_, std::move(file_), file_size, - &table_reader_); - } - return s; -} - -Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, - uint64_t file_size) { - TableProperties* table_properties = nullptr; - Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, - options_.env, options_.info_log.get(), - &table_properties); - if (s.ok()) { - table_properties_.reset(table_properties); - } else { - fprintf(stdout, "Not able to read table properties\n"); - } - return s; -} - -Status SstFileReader::SetTableOptionsByMagicNumber( - uint64_t table_magic_number) { - assert(table_properties_); - if (table_magic_number == kBlockBasedTableMagicNumber || - table_magic_number == kLegacyBlockBasedTableMagicNumber) { - options_.table_factory = std::make_shared(); - fprintf(stdout, "Sst file format: block-based\n"); - auto& props = table_properties_->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - auto index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - if (index_type_on_file == - BlockBasedTableOptions::IndexType::kHashSearch) { - options_.prefix_extractor.reset(NewNoopTransform()); - } - } - } else if (table_magic_number == kPlainTableMagicNumber || - table_magic_number == kLegacyPlainTableMagicNumber) { - options_.allow_mmap_reads = true; - - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = kPlainTableVariableLength; - plain_table_options.bloom_bits_per_key = 0; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 1; - plain_table_options.huge_page_tlb_size = 0; - plain_table_options.encoding_type = kPlain; - plain_table_options.full_scan_mode = true; - - options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); - fprintf(stdout, "Sst file format: plain table\n"); - } else { - char error_msg_buffer[80]; - snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, - "Unsupported table magic number --- %lx", - (long)table_magic_number); - return Status::InvalidArgument(error_msg_buffer); - } - - return Status::OK(); -} - -Status SstFileReader::SetOldTableOptions() { - assert(table_properties_ == nullptr); - options_.table_factory = std::make_shared(); - fprintf(stdout, "Sst file format: block-based(old version)\n"); - - return Status::OK(); -} - -Status SstFileReader::ReadSequential(bool print_kv, - uint64_t read_num, - bool has_from, - const std::string& from_key, - bool has_to, - const std::string& to_key) { - if (!table_reader_) { - return init_result_; - } - - Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, - false)); - uint64_t i = 0; - if (has_from) { - InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); - iter->Seek(ikey.Encode()); - } else { - iter->SeekToFirst(); - } - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - Slice value = iter->value(); - ++i; - if (read_num > 0 && i > read_num) - break; - - ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - std::cerr << "Internal Key [" - << key.ToString(true /* in hex*/) - << "] parse error!\n"; - continue; - } - - // If end marker was specified, we stop before it - if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { - break; - } - - if (print_kv) { - fprintf(stdout, "%s => %s\n", - ikey.DebugString(output_hex_).c_str(), - value.ToString(output_hex_).c_str()); - } - } - - read_num_ += i; - - Status ret = 
iter->status(); - delete iter; - return ret; -} - -Status SstFileReader::ReadTableProperties( - std::shared_ptr* table_properties) { - if (!table_reader_) { - return init_result_; - } - - *table_properties = table_reader_->GetTableProperties(); - return init_result_; -} - -} // namespace rocksdb - -static void print_help() { - fprintf(stderr, - "sst_dump [--command=check|scan|none] [--verify_checksum] " - "--file=data_dir_OR_sst_file" - " [--output_hex]" - " [--input_key_hex]" - " [--from=]" - " [--to=]" - " [--read_num=NUM]" - " [--show_properties]\n"); -} - -namespace { -string HexToString(const string& str) { - string parsed; - if (str[0] != '0' || str[1] != 'x') { - fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", - str.c_str()); - throw "Invalid hex input"; - } - - for (unsigned int i = 2; i < str.length();) { - int c; - sscanf(str.c_str() + i, "%2X", &c); - parsed.push_back(c); - i += 2; - } - return parsed; -} -} // namespace +#include "rocksdb/sst_dump_tool.h" int main(int argc, char** argv) { - const char* dir_or_file = nullptr; - uint64_t read_num = -1; - std::string command; - - char junk; - uint64_t n; - bool verify_checksum = false; - bool output_hex = false; - bool input_key_hex = false; - bool has_from = false; - bool has_to = false; - bool show_properties = false; - std::string from_key; - std::string to_key; - for (int i = 1; i < argc; i++) { - if (strncmp(argv[i], "--file=", 7) == 0) { - dir_or_file = argv[i] + 7; - } else if (strcmp(argv[i], "--output_hex") == 0) { - output_hex = true; - } else if (strcmp(argv[i], "--input_key_hex") == 0) { - input_key_hex = true; - } else if (sscanf(argv[i], - "--read_num=%lu%c", - (unsigned long*)&n, &junk) == 1) { - read_num = n; - } else if (strcmp(argv[i], "--verify_checksum") == 0) { - verify_checksum = true; - } else if (strncmp(argv[i], "--command=", 10) == 0) { - command = argv[i] + 10; - } else if (strncmp(argv[i], "--from=", 7) == 0) { - from_key = argv[i] + 7; - has_from = true; - } else if (strncmp(argv[i], "--to=", 5) == 0) { - to_key = argv[i] + 5; - has_to = true; - } else if (strcmp(argv[i], "--show_properties") == 0) { - show_properties = true; - } else { - print_help(); - exit(1); - } - } - - if (input_key_hex) { - if (has_from) { - from_key = HexToString(from_key); - } - if (has_to) { - to_key = HexToString(to_key); - } - } - - if (dir_or_file == nullptr) { - print_help(); - exit(1); - } - - std::vector filenames; - rocksdb::Env* env = rocksdb::Env::Default(); - rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); - bool dir = true; - if (!st.ok()) { - filenames.clear(); - filenames.push_back(dir_or_file); - dir = false; - } - - fprintf(stdout, "from [%s] to [%s]\n", - rocksdb::Slice(from_key).ToString(true).c_str(), - rocksdb::Slice(to_key).ToString(true).c_str()); - - uint64_t total_read = 0; - for (size_t i = 0; i < filenames.size(); i++) { - std::string filename = filenames.at(i); - if (filename.length() <= 4 || - filename.rfind(".sst") != filename.length() - 4) { - // ignore - continue; - } - if (dir) { - filename = std::string(dir_or_file) + "/" + filename; - } - rocksdb::SstFileReader reader(filename, verify_checksum, - output_hex); - // scan all files in give file path. - if (command == "" || command == "scan" || command == "check") { - st = reader.ReadSequential(command != "check", - read_num > 0 ? 
(read_num - total_read) : - read_num, - has_from, from_key, has_to, to_key); - if (!st.ok()) { - fprintf(stderr, "%s: %s\n", filename.c_str(), - st.ToString().c_str()); - } - total_read += reader.GetReadNumber(); - if (read_num > 0 && total_read > read_num) { - break; - } - } - if (show_properties) { - const rocksdb::TableProperties* table_properties; - - std::shared_ptr - table_properties_from_reader; - st = reader.ReadTableProperties(&table_properties_from_reader); - if (!st.ok()) { - fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); - fprintf(stderr, "Try to use initial table properties\n"); - table_properties = reader.GetInitTableProperties(); - } else { - table_properties = table_properties_from_reader.get(); - } - if (table_properties != nullptr) { - fprintf(stdout, - "Table Properties:\n" - "------------------------------\n" - " %s", - table_properties->ToString("\n ", ": ").c_str()); - fprintf(stdout, "# deleted keys: %zd\n", - rocksdb::GetDeletedKeys( - table_properties->user_collected_properties)); - } - } - } + rocksdb::SSTDumpTool tool; + tool.Run(argc, argv); + return 0; } diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc new file mode 100644 index 000000000..925dadc7d --- /dev/null +++ b/util/sst_dump_tool.cc @@ -0,0 +1,426 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/sst_dump_tool.h" + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, + bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + private: + Status NewTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFile* file, uint64_t file_size); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + const ImmutableCFOptions ioptions_; + InternalKeyComparator internal_comparator_; + unique_ptr 
table_properties_; +}; + +SstFileReader::SstFileReader(const std::string& file_path, + bool verify_checksum, + bool output_hex) + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), ioptions_(options_), + internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kLegacyBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; +extern uint64_t kLegacyPlainTableMagicNumber; + +Status SstFileReader::NewTableReader(const std::string& file_path) { + uint64_t magic_number; + + // read table magic number + Footer footer; + + unique_ptr file; + uint64_t file_size; + Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + if (s.ok()) { + s = options_.env->GetFileSize(file_path, &file_size); + } + if (s.ok()) { + s = ReadFooterFromFile(file_.get(), file_size, &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + } + options_.comparator = &internal_comparator_; + // For old sst format, ReadTableProperties might fail but file can be read + if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) { + SetTableOptionsByMagicNumber(magic_number); + } else { + SetOldTableOptions(); + } + } + + if (s.ok()) { + s = options_.table_factory->NewTableReader( + ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, + &table_reader_); + } + return s; +} + +Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size) { + TableProperties* table_properties = nullptr; + Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, + options_.env, options_.info_log.get(), + &table_properties); + if (s.ok()) { + table_properties_.reset(table_properties); + } else { + fprintf(stdout, "Not able to read table properties\n"); + } + return s; +} + +Status SstFileReader::SetTableOptionsByMagicNumber( + uint64_t table_magic_number) { + assert(table_properties_); + if (table_magic_number == kBlockBasedTableMagicNumber || + table_magic_number == kLegacyBlockBasedTableMagicNumber) { + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based\n"); + auto& props = table_properties_->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + auto index_type_on_file = static_cast( + DecodeFixed32(pos->second.c_str())); + if (index_type_on_file == + BlockBasedTableOptions::IndexType::kHashSearch) { + options_.prefix_extractor.reset(NewNoopTransform()); + } + } + } else if (table_magic_number == kPlainTableMagicNumber || + table_magic_number == kLegacyPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 1; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + plain_table_options.full_scan_mode = true; + + options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); + 
fprintf(stdout, "Sst file format: plain table\n"); + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileReader::SetOldTableOptions() { + assert(table_properties_ == nullptr); + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based(old version)\n"); + + return Status::OK(); +} + +Status SstFileReader::ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key) { + if (!table_reader_) { + return init_result_; + } + + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, + false)); + uint64_t i = 0; + if (has_from) { + InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) + break; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + std::cerr << "Internal Key [" + << key.ToString(true /* in hex*/) + << "] parse error!\n"; + continue; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } + } + + read_num_ += i; + + Status ret = iter->status(); + delete iter; + return ret; +} + +Status SstFileReader::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} + +namespace { + +void print_help() { + fprintf(stderr, + "sst_dump [--command=check|scan|none] [--verify_checksum] " + "--file=data_dir_OR_sst_file" + " [--output_hex]" + " [--input_key_hex]" + " [--from=]" + " [--to=]" + " [--read_num=NUM]" + " [--show_properties]\n"); +} + +string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. 
Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; +} + +} // namespace + +void SSTDumpTool::Run(int argc, char** argv) { + const char* dir_or_file = nullptr; + uint64_t read_num = -1; + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + bool show_properties = false; + std::string from_key; + std::string to_key; + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], + "--read_num=%lu%c", + (unsigned long*)&n, &junk) == 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 5) == 0) { + to_key = argv[i] + 5; + has_to = true; + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { + print_help(); + exit(1); + } + } + + if (input_key_hex) { + if (has_from) { + from_key = HexToString(from_key); + } + if (has_to) { + to_key = HexToString(to_key); + } + } + + if (dir_or_file == nullptr) { + print_help(); + exit(1); + } + + std::vector filenames; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok()) { + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); + + uint64_t total_read = 0; + for (size_t i = 0; i < filenames.size(); i++) { + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + // ignore + continue; + } + if (dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + rocksdb::SstFileReader reader(filename, verify_checksum, + output_hex); + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = reader.ReadSequential(command != "check", + read_num > 0 ? 
(read_num - total_read) : + read_num, + has_from, from_key, has_to, to_key); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + st.ToString().c_str()); + } + total_read += reader.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + if (show_properties) { + const rocksdb::TableProperties* table_properties; + + std::shared_ptr + table_properties_from_reader; + st = reader.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + fprintf(stderr, "Try to use initial table properties\n"); + table_properties = reader.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if (table_properties != nullptr) { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", + table_properties->ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %zd\n", + rocksdb::GetDeletedKeys( + table_properties->user_collected_properties)); + } + } + } +} +} // namespace rocksdb From e7620536cf67683a257efee6892e255a6f15fe80 Mon Sep 17 00:00:00 2001 From: liuchang Date: Sat, 8 Nov 2014 11:35:10 -0800 Subject: [PATCH 442/829] fix make static_lib error --- util/sst_dump_tool.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index 925dadc7d..f5cc50fa8 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -416,7 +416,7 @@ void SSTDumpTool::Run(int argc, char** argv) { "------------------------------\n" " %s", table_properties->ToString("\n ", ": ").c_str()); - fprintf(stdout, "# deleted keys: %zd\n", + fprintf(stdout, "# deleted keys: %" PRIu64 "\n", rocksdb::GetDeletedKeys( table_properties->user_collected_properties)); } From 01a770637966ddf60fcae27511829e1c34c699ee Mon Sep 17 00:00:00 2001 From: liuchang Date: Sat, 8 Nov 2014 11:51:40 -0800 Subject: [PATCH 443/829] remove unused target --- examples/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Makefile b/examples/Makefile index 7807289ae..ce43785e0 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,6 +1,6 @@ include ../build_config.mk -.PHONY: main clean +.PHONY: clean all: simple_example column_families_example compact_files_example From 00211f9c5b938cc0acf93062e15586a88004ee3c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sat, 8 Nov 2014 13:01:31 -0800 Subject: [PATCH 444/829] Fix SIGSEGV in db_stresS --- tools/db_stress.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index a0d6ea723..33c72cd48 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -752,7 +752,7 @@ struct ThreadState { Stats stats; ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rand(1000 + index + shared->GetSeed()), shared(_shared) {} + : tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {} }; } // namespace From 8fb4751d504379c8316fada9d647cce328daf37c Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Thu, 23 Oct 2014 16:19:38 +0100 Subject: [PATCH 445/829] Iterator support for Write Batches --- java/Makefile | 2 +- java/org/rocksdb/WriteBatch.java | 48 ++++++++- java/rocksjni/portal.h | 74 ++++++++++++++ java/rocksjni/write_batch.cc | 44 +++++++++ java/rocksjni/writebatchhandlerjnicallback.cc | 98 +++++++++++++++++++ java/rocksjni/writebatchhandlerjnicallback.h | 47 +++++++++ 6 files changed, 310 insertions(+), 3 deletions(-) create mode 100644 
java/rocksjni/writebatchhandlerjnicallback.cc create mode 100644 java/rocksjni/writebatchhandlerjnicallback.h diff --git a/java/Makefile b/java/Makefile index a6d3c95f3..2b3e904bb 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,4 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice +NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatch.Handler org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 118695512..68049aded 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -5,8 +5,6 @@ package org.rocksdb; -import java.util.*; - /** * WriteBatch holds a collection of updates to apply atomically to a DB. * @@ -105,6 +103,13 @@ public class WriteBatch extends RocksObject { putLogData(blob, blob.length); } + /** + * Support for iterating over the contents of a batch. 
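+   *
+   * <p>For each record in the batch the corresponding Handler callback is
+   * invoked: {@code put}, {@code merge}, {@code delete} or {@code logData};
+   * iteration halts early once the handler's {@code shouldContinue}
+   * returns false.
+   *
+   * <p>A minimal sketch of a handler (illustrative only):
+   * <pre>
+   *   batch.iterate(new WriteBatch.Handler() {
+   *     public void put(byte[] key, byte[] value) { }
+   *     public void merge(byte[] key, byte[] value) { }
+   *     public void delete(byte[] key) { }
+   *     public void logData(byte[] blob) { }
+   *   });
+   * </pre>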
+ */ + public void iterate(Handler handler) { + iterate(handler.nativeHandle_); + } + /** * Clear all updates buffered in this batch */ @@ -133,7 +138,46 @@ public class WriteBatch extends RocksObject { private native void remove(byte[] key, int keyLen, long cfHandle); private native void putLogData(byte[] blob, int blobLen); + private native void iterate(long handlerHandle); private native void disposeInternal(long handle); + + /** + * Handler callback for iterating over the contents of a batch. + */ + public static abstract class Handler extends RocksObject { + public Handler() { + super(); + createNewHandler0(); + } + + public abstract void put(byte[] key, byte[] value); + public abstract void merge(byte[] key, byte[] value); + public abstract void delete(byte[] key); + public abstract void logData(byte[] blob); + + /** + * shouldContinue is called by the underlying iterator + * (WriteBatch::Iterate.If it returns false, + * iteration is halted. Otherwise, it continues + * iterating. The default implementation always + * returns true. + */ + public boolean shouldContinue() { + return true; + } + + /** + * Deletes underlying C++ handler pointer. + */ + @Override + protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void createNewHandler0(); + private native void disposeInternal(long handle); + } } /** diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 03c15cb24..54b3b2766 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -20,6 +20,7 @@ #include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" #include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/writebatchhandlerjnicallback.h" namespace rocksdb { @@ -288,6 +289,79 @@ class WriteBatchJni { } }; +class WriteBatchHandlerJni { + public: + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/WriteBatch$Handler"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the java method `put` of org.rocksdb.WriteBatch.Handler. + static jmethodID getPutMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "put", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `merge` of org.rocksdb.WriteBatch.Handler. + static jmethodID getMergeMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "merge", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `delete` of org.rocksdb.WriteBatch.Handler. + static jmethodID getDeleteMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "delete", "([B)V"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `logData` of org.rocksdb.WriteBatch.Handler. + static jmethodID getLogDataMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "logData", "([B)V"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `shouldContinue` of org.rocksdb.WriteBatch.Handler. + static jmethodID getContinueMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "shouldContinue", "()Z"); + assert(mid != nullptr); + return mid; + } + + // Get the pointer to rocksdb::WriteBatchHandlerJniCallback of the specified + // org.rocksdb.WriteBatchHandler. 
+ static rocksdb::WriteBatchHandlerJniCallback* getHandle( + JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteBatchHandlerJniCallback pointer to the java side. + static void setHandle( + JNIEnv* env, jobject jobj, + const rocksdb::WriteBatchHandlerJniCallback* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + class HistogramDataJni { public: static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) { diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index aea85fab9..57f4cb136 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -8,13 +8,16 @@ #include #include "include/org_rocksdb_WriteBatch.h" +#include "include/org_rocksdb_WriteBatch_Handler.h" #include "include/org_rocksdb_WriteBatchInternal.h" #include "include/org_rocksdb_WriteBatchTest.h" #include "rocksjni/portal.h" +#include "rocksjni/writebatchhandlerjnicallback.h" #include "rocksdb/db.h" #include "rocksdb/immutable_options.h" #include "db/memtable.h" #include "rocksdb/write_batch.h" +#include "rocksdb/status.h" #include "db/write_batch_internal.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -224,6 +227,25 @@ void Java_org_rocksdb_WriteBatch_putLogData( env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); } +/* + * Class: org_rocksdb_WriteBatch + * Method: iterate + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_iterate( + JNIEnv* env , jobject jobj, jlong handlerHandle) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + rocksdb::Status s = wb->Iterate( + reinterpret_cast(handlerHandle)); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + /* * Class: org_rocksdb_WriteBatch * Method: disposeInternal @@ -276,6 +298,28 @@ void Java_org_rocksdb_WriteBatchInternal_append( rocksdb::WriteBatchInternal::Append(wb1, wb2); } +/* + * Class: org_rocksdb_WriteBatch_Handler + * Method: createNewHandler0 + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( + JNIEnv* env, jobject jobj) { + const rocksdb::WriteBatchHandlerJniCallback* h = + new rocksdb::WriteBatchHandlerJniCallback(env, jobj); + rocksdb::WriteBatchHandlerJni::setHandle(env, jobj, h); +} + +/* + * Class: org_rocksdb_WriteBatch_Handler + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); +} + /* * Class: org_rocksdb_WriteBatchTest * Method: getContents diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc new file mode 100644 index 000000000..475ab18f1 --- /dev/null +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::Comparator. 
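+//
+// The callback below keeps a JNI global reference to the Java
+// WriteBatch.Handler object and caches the method ids of its put, merge,
+// delete, logData and shouldContinue methods. Each native callback attaches
+// the current thread to the JVM (a no-op if already attached) and forwards
+// the Slice arguments to Java as byte arrays.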
+ +#include "rocksjni/writebatchhandlerjnicallback.h" +#include "rocksjni/portal.h" + +namespace rocksdb { +WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( + JNIEnv* env, jobject jWriteBatchHandler) { + + // Note: WriteBatchHandler methods may be accessed by multiple threads, + // so we ref the jvm not the env + const jint rs = env->GetJavaVM(&m_jvm); + assert(rs == JNI_OK); + + // Note: we want to access the Java WriteBatchHandler instance + // across multiple method calls, so we create a global ref + m_jWriteBatchHandler = env->NewGlobalRef(jWriteBatchHandler); + + m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env); + m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env); + m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env); + m_jLogDataMethodId = WriteBatchHandlerJni::getLogDataMethodId(env); + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); +} + +/** + * Attach/Get a JNIEnv for the current native thread + */ +JNIEnv* WriteBatchHandlerJniCallback::getJniEnv() const { + JNIEnv *env; + jint rs = m_jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); + assert(rs == JNI_OK); + return env; +} + +void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { + getJniEnv()->CallVoidMethod( + m_jWriteBatchHandler, + m_jPutMethodId, + sliceToJArray(key), + sliceToJArray(value)); +} + +void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { + getJniEnv()->CallVoidMethod( + m_jWriteBatchHandler, + m_jMergeMethodId, + sliceToJArray(key), + sliceToJArray(value)); +} + +void WriteBatchHandlerJniCallback::Delete(const Slice& key) { + getJniEnv()->CallVoidMethod( + m_jWriteBatchHandler, + m_jDeleteMethodId, + sliceToJArray(key)); +} + +void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { + getJniEnv()->CallVoidMethod( + m_jWriteBatchHandler, + m_jLogDataMethodId, + sliceToJArray(blob)); +} + +bool WriteBatchHandlerJniCallback::Continue() { + jboolean jContinue = getJniEnv()->CallBooleanMethod( + m_jWriteBatchHandler, + m_jContinueMethodId); + + return static_cast(jContinue == JNI_TRUE); +} + +jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { + jbyteArray ja = getJniEnv()->NewByteArray(s.size()); + getJniEnv()->SetByteArrayRegion( + ja, 0, s.size(), + reinterpret_cast(s.data())); + return ja; +} + +WriteBatchHandlerJniCallback::~WriteBatchHandlerJniCallback() { + JNIEnv* m_env = getJniEnv(); + + m_env->DeleteGlobalRef(m_jWriteBatchHandler); + + // Note: do not need to explicitly detach, as this function is effectively + // called from the Java class's disposeInternal method, and so already + // has an attached thread, getJniEnv above is just a no-op Attach to get + // the env jvm->DetachCurrentThread(); +} +} // namespace rocksdb diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h new file mode 100644 index 000000000..69f68a533 --- /dev/null +++ b/java/rocksjni/writebatchhandlerjnicallback.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::WriteBatch::Handler. 
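+//
+// A WriteBatchHandlerJniCallback is created from the Java side by
+// WriteBatch.Handler#createNewHandler0 and deleted via disposeInternal, so
+// the native object is expected to live as long as its Java counterpart.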
+ +#ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ +#define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ + +#include +#include "rocksdb/write_batch.h" + +namespace rocksdb { +/** + * This class acts as a bridge between C++ + * and Java. The methods in this class will be + * called back from the RocksDB storage engine (C++) + * we then callback to the appropriate Java method + * this enables Write Batch Handlers to be implemented in Java. + */ +class WriteBatchHandlerJniCallback : public WriteBatch::Handler { + public: + WriteBatchHandlerJniCallback( + JNIEnv* env, jobject jWriteBackHandler); + ~WriteBatchHandlerJniCallback(); + void Put(const Slice& key, const Slice& value); + void Merge(const Slice& key, const Slice& value); + void Delete(const Slice& key); + void LogData(const Slice& blob); + bool Continue(); + + private: + JavaVM* m_jvm; + jobject m_jWriteBatchHandler; + JNIEnv* getJniEnv() const; + jbyteArray sliceToJArray(const Slice& s); + jmethodID m_jPutMethodId; + jmethodID m_jMergeMethodId; + jmethodID m_jDeleteMethodId; + jmethodID m_jLogDataMethodId; + jmethodID m_jContinueMethodId; +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ From eeb9cf6c42a1547f84a1cb282de99c3770d4bbec Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Wed, 5 Nov 2014 16:51:38 +0000 Subject: [PATCH 446/829] Test for WriteBatchHandler --- java/Makefile | 1 + .../rocksdb/test/WriteBatchHandlerTest.java | 162 ++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 java/org/rocksdb/test/WriteBatchHandlerTest.java diff --git a/java/Makefile b/java/Makefile index 2b3e904bb..14469849b 100644 --- a/java/Makefile +++ b/java/Makefile @@ -56,6 +56,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DirectComparatorTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.WriteBatchHandlerTest @rm -rf /tmp/rocksdbjni_* db_bench: java diff --git a/java/org/rocksdb/test/WriteBatchHandlerTest.java b/java/org/rocksdb/test/WriteBatchHandlerTest.java new file mode 100644 index 000000000..ccf9b164a --- /dev/null +++ b/java/org/rocksdb/test/WriteBatchHandlerTest.java @@ -0,0 +1,162 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
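+
+// The test below builds a WriteBatch from a fixed sequence of put, merge,
+// delete and putLogData events, replays the batch through a capturing
+// WriteBatch.Handler via iterate(), and checks that the captured events
+// match the original sequence in order and content.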
+ +package org.rocksdb.test; + +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.WriteBatch; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class WriteBatchHandlerTest { + static { + RocksDB.loadLibrary(); + } + + public static void main(final String[] args) throws IOException, RocksDBException { + + // setup test data + final List>> testEvents = new ArrayList<>(); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k0".getBytes(), null))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k1".getBytes(), "v1".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k2".getBytes(), "v2".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k3".getBytes(), "v3".getBytes()))); + testEvents.add(new Tuple<>(Action.LOG, + new Tuple(null, "log1".getBytes()))); + testEvents.add(new Tuple<>(Action.MERGE, + new Tuple<>("k2".getBytes(), "v22".getBytes()))); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k3".getBytes(), null))); + + // load test data to the write batch + final WriteBatch batch = new WriteBatch(); + for(final Tuple> testEvent : testEvents) { + final Tuple data = testEvent.value; + switch(testEvent.key) { + + case PUT: + batch.put(data.key, data.value); + break; + + case MERGE: + batch.merge(data.key, data.value); + break; + + case DELETE: + batch.remove(data.key); + break; + + case LOG: + batch.putLogData(data.value); + break; + } + } + + // attempt to read test data back from the WriteBatch by iterating with a handler + final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler(); + batch.iterate(handler); + + // compare the results to the test data + final List>> actualEvents = handler.getEvents(); + assert(testEvents.size() == actualEvents.size()); + + for(int i = 0; i < testEvents.size(); i++) { + assert(equals(testEvents.get(i), actualEvents.get(i))); + } + + System.out.println("Passed WriteBatchHandler Test"); + } + + private static boolean equals(final Tuple> expected, + final Tuple> actual) { + if(!expected.key.equals(actual.key)) { + return false; + } + + final Tuple expectedData = expected.value; + final Tuple actualData = actual.value; + + if(equals(expectedData.key, actualData.key)) { + return equals(expectedData.value, actualData.value); + } else { + return false; + } + } + + private static boolean equals(byte[] expected, byte[] actual) { + if(expected != null) { + return Arrays.equals(expected, actual); + } else { + return actual == null; + } + } + + private static class Tuple { + public final K key; + public final V value; + + public Tuple(final K key, final V value) { + this.key = key; + this.value = value; + } + } + + /** + * Enumeration of Write Batch + * event actions + */ + private enum Action { + PUT, + MERGE, + DELETE, + LOG + } + + /** + * A simple WriteBatch Handler which adds a record + * of each event that it receives to a list + */ + private static class CapturingWriteBatchHandler extends WriteBatch.Handler { + + private final List>> events = new ArrayList<>(); + + /** + * Returns a copy of the current events list + * + * @return a list of the events which have happened upto now + */ + public List>> getEvents() { + return new ArrayList<>(events); + } + + @Override + public void put(final byte[] key, final byte[] value) { + events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); + } + + @Override + public void merge(final byte[] key, final byte[] value) { + events.add(new 
Tuple<>(Action.MERGE, new Tuple<>(key, value))); + } + + @Override + public void delete(final byte[] key) { + events.add(new Tuple<>(Action.DELETE, new Tuple(key, null))); + } + + @Override + public void logData(final byte[] blob) { + events.add(new Tuple<>(Action.LOG, new Tuple(null, blob))); + } + } +} From d904fbbb0b59c3bd168c1a6d20d4e630a6c4361e Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Wed, 5 Nov 2014 18:56:45 +0000 Subject: [PATCH 447/829] Addresed comments from code review https://reviews.facebook.net/D27567 --- java/org/rocksdb/WriteBatch.java | 11 ++- java/rocksjni/write_batch.cc | 2 +- java/rocksjni/writebatchhandlerjnicallback.cc | 78 ++++++++++--------- java/rocksjni/writebatchhandlerjnicallback.h | 7 +- 4 files changed, 54 insertions(+), 44 deletions(-) diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 68049aded..19984b16c 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -105,8 +105,13 @@ public class WriteBatch extends RocksObject { /** * Support for iterating over the contents of a batch. + * + * @param handler A handler that is called back for each + * update present in the batch + * + * @throws RocksDBException If we cannot iterate over the batch */ - public void iterate(Handler handler) { + public void iterate(Handler handler) throws RocksDBException { iterate(handler.nativeHandle_); } @@ -138,7 +143,7 @@ public class WriteBatch extends RocksObject { private native void remove(byte[] key, int keyLen, long cfHandle); private native void putLogData(byte[] blob, int blobLen); - private native void iterate(long handlerHandle); + private native void iterate(long handlerHandle) throws RocksDBException; private native void disposeInternal(long handle); /** @@ -157,7 +162,7 @@ public class WriteBatch extends RocksObject { /** * shouldContinue is called by the underlying iterator - * (WriteBatch::Iterate.If it returns false, + * WriteBatch::Iterate. If it returns false, * iteration is halted. Otherwise, it continues * iterating. The default implementation always * returns true. 
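The shouldContinue() hook documented above is the Java counterpart of WriteBatch::Handler::Continue() on the native side, which WriteBatch::Iterate() consults before delivering each record. A short sketch of using it from C++ to stop iteration early (illustrative only; BoundedHandler is a made-up name):

#include "rocksdb/write_batch.h"

// Stops WriteBatch::Iterate() after a fixed number of Put/Delete records;
// returning false from Continue() halts the iteration.
class BoundedHandler : public rocksdb::WriteBatch::Handler {
 public:
  explicit BoundedHandler(int limit) : remaining_(limit) {}
  void Put(const rocksdb::Slice&, const rocksdb::Slice&) override { --remaining_; }
  void Delete(const rocksdb::Slice&) override { --remaining_; }
  bool Continue() override { return remaining_ > 0; }

 private:
  int remaining_;
};

A Java handler achieves the same effect by overriding shouldContinue(); WriteBatchHandlerJniCallback::Continue() relays the boolean it returns.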
diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 57f4cb136..1abd8c0de 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -317,7 +317,7 @@ void Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( */ void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( JNIEnv* env, jobject jobj, jlong handle) { - delete reinterpret_cast(handle); + delete reinterpret_cast(handle); } /* diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc index 475ab18f1..22f5117b3 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -11,12 +11,8 @@ namespace rocksdb { WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( - JNIEnv* env, jobject jWriteBatchHandler) { - - // Note: WriteBatchHandler methods may be accessed by multiple threads, - // so we ref the jvm not the env - const jint rs = env->GetJavaVM(&m_jvm); - assert(rs == JNI_OK); + JNIEnv* env, jobject jWriteBatchHandler) + : m_env(env) { // Note: we want to access the Java WriteBatchHandler instance // across multiple method calls, so we create a global ref @@ -29,70 +25,80 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); } -/** - * Attach/Get a JNIEnv for the current native thread - */ -JNIEnv* WriteBatchHandlerJniCallback::getJniEnv() const { - JNIEnv *env; - jint rs = m_jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); - assert(rs == JNI_OK); - return env; -} - void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { - getJniEnv()->CallVoidMethod( + const jbyteArray j_key = sliceToJArray(key); + const jbyteArray j_value = sliceToJArray(value); + + m_env->CallVoidMethod( m_jWriteBatchHandler, m_jPutMethodId, - sliceToJArray(key), - sliceToJArray(value)); + j_key, + j_value); + + m_env->DeleteLocalRef(j_value); + m_env->DeleteLocalRef(j_key); } void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { - getJniEnv()->CallVoidMethod( + const jbyteArray j_key = sliceToJArray(key); + const jbyteArray j_value = sliceToJArray(value); + + m_env->CallVoidMethod( m_jWriteBatchHandler, m_jMergeMethodId, - sliceToJArray(key), - sliceToJArray(value)); + j_key, + j_value); + + m_env->DeleteLocalRef(j_value); + m_env->DeleteLocalRef(j_key); } void WriteBatchHandlerJniCallback::Delete(const Slice& key) { - getJniEnv()->CallVoidMethod( + const jbyteArray j_key = sliceToJArray(key); + + m_env->CallVoidMethod( m_jWriteBatchHandler, m_jDeleteMethodId, - sliceToJArray(key)); + j_key); + + m_env->DeleteLocalRef(j_key); } void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { - getJniEnv()->CallVoidMethod( + const jbyteArray j_blob = sliceToJArray(blob); + + m_env->CallVoidMethod( m_jWriteBatchHandler, m_jLogDataMethodId, - sliceToJArray(blob)); + j_blob); + + m_env->DeleteLocalRef(j_blob); } bool WriteBatchHandlerJniCallback::Continue() { - jboolean jContinue = getJniEnv()->CallBooleanMethod( + jboolean jContinue = m_env->CallBooleanMethod( m_jWriteBatchHandler, m_jContinueMethodId); return static_cast(jContinue == JNI_TRUE); } +/* + * Creates a Java Byte Array from the data in a Slice + * + * When calling this function + * you must remember to call env->DeleteLocalRef + * on the result after you have finished with it + */ jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { - jbyteArray ja = getJniEnv()->NewByteArray(s.size()); - 
getJniEnv()->SetByteArrayRegion( + jbyteArray ja = m_env->NewByteArray(s.size()); + m_env->SetByteArrayRegion( ja, 0, s.size(), reinterpret_cast(s.data())); return ja; } WriteBatchHandlerJniCallback::~WriteBatchHandlerJniCallback() { - JNIEnv* m_env = getJniEnv(); - m_env->DeleteGlobalRef(m_jWriteBatchHandler); - - // Note: do not need to explicitly detach, as this function is effectively - // called from the Java class's disposeInternal method, and so already - // has an attached thread, getJniEnv above is just a no-op Attach to get - // the env jvm->DetachCurrentThread(); } } // namespace rocksdb diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h index 69f68a533..9a2a47e80 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.h +++ b/java/rocksjni/writebatchhandlerjnicallback.h @@ -17,8 +17,8 @@ namespace rocksdb { * This class acts as a bridge between C++ * and Java. The methods in this class will be * called back from the RocksDB storage engine (C++) - * we then callback to the appropriate Java method - * this enables Write Batch Handlers to be implemented in Java. + * which calls the appropriate Java method. + * This enables Write Batch Handlers to be implemented in Java. */ class WriteBatchHandlerJniCallback : public WriteBatch::Handler { public: @@ -32,9 +32,8 @@ class WriteBatchHandlerJniCallback : public WriteBatch::Handler { bool Continue(); private: - JavaVM* m_jvm; + JNIEnv* m_env; jobject m_jWriteBatchHandler; - JNIEnv* getJniEnv() const; jbyteArray sliceToJArray(const Slice& s); jmethodID m_jPutMethodId; jmethodID m_jMergeMethodId; From bcdb9671c06e99f89319baa078a6163bd3969be2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sun, 9 Nov 2014 13:01:50 -0500 Subject: [PATCH 448/829] Fix build --- util/sst_dump_tool.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index f5cc50fa8..46fc10d79 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -6,6 +6,10 @@ #include "rocksdb/sst_dump_tool.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -88,10 +92,10 @@ SstFileReader::SstFileReader(const std::string& file_path, init_result_ = NewTableReader(file_name_); } -extern uint64_t kBlockBasedTableMagicNumber; -extern uint64_t kLegacyBlockBasedTableMagicNumber; -extern uint64_t kPlainTableMagicNumber; -extern uint64_t kLegacyPlainTableMagicNumber; +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; Status SstFileReader::NewTableReader(const std::string& file_path) { uint64_t magic_number; From 4a3bd2bad252951c5fe9b3d256d810e235edcd2e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 10 Nov 2014 11:57:58 -0800 Subject: [PATCH 449/829] Optimize usage of Status in CompactionJob Summary: Based on @ljin feedback Test Plan: compiles Reviewers: ljin, yhchiang, sdong Reviewed By: sdong Subscribers: ljin, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28515 --- db/compaction_job.cc | 13 ++++++------- db/compaction_job.h | 4 ++-- db/db_impl.cc | 10 ++++------ 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 3395085a9..91bea0601 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -455,14 +455,14 @@ Status CompactionJob::Run() { return status; } 
-Status CompactionJob::Install(Status status, port::Mutex* db_mutex) { +void CompactionJob::Install(Status* status, port::Mutex* db_mutex) { db_mutex->AssertHeld(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), compaction_stats_); - if (status.ok()) { - status = InstallCompactionResults(db_mutex); + if (status->ok()) { + *status = InstallCompactionResults(db_mutex); } VersionStorageInfo::LevelSummaryStorage tmp; const auto& stats = compaction_stats_; @@ -483,11 +483,10 @@ Status CompactionJob::Install(Status status, port::Mutex* db_mutex) { (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / static_cast(stats.bytes_readn), stats.bytes_written / static_cast(stats.bytes_readn), - status.ToString().c_str(), stats.num_input_records, + status->ToString().c_str(), stats.num_input_records, stats.num_dropped_records); - CleanupCompaction(status); - return status; + CleanupCompaction(*status); } Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, @@ -1054,7 +1053,7 @@ Status CompactionJob::OpenCompactionOutputFile() { return s; } -void CompactionJob::CleanupCompaction(Status status) { +void CompactionJob::CleanupCompaction(const Status& status) { if (compact_->builder != nullptr) { // May happen if we get a shutdown call in the middle of compaction compact_->builder->Abandon(); diff --git a/db/compaction_job.h b/db/compaction_job.h index e993ea675..4ce440a36 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -75,7 +75,7 @@ class CompactionJob { Status Run(); // REQUIRED: mutex held // status is the return of Run() - Status Install(Status status, port::Mutex* db_mutex); + void Install(Status* status, port::Mutex* db_mutex); private: void AllocateCompactionOutputFileNumbers(); @@ -92,7 +92,7 @@ class CompactionJob { SequenceNumber* prev_snapshot); void RecordCompactionIOStats(); Status OpenCompactionOutputFile(); - void CleanupCompaction(Status status); + void CleanupCompaction(const Status& status); // CompactionJob state struct CompactionState; diff --git a/db/db_impl.cc b/db/db_impl.cc index 8ac509249..893dfdee7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1295,12 +1295,10 @@ Status DBImpl::CompactFilesImpl( mutex_.Unlock(); Status status = compaction_job.Run(); mutex_.Lock(); + compaction_job.Install(&status, &mutex_); if (status.ok()) { - status = compaction_job.Install(status, &mutex_); - if (status.ok()) { - InstallSuperVersionBackground(c->column_family_data(), &job_context, - *c->mutable_cf_options()); - } + InstallSuperVersionBackground(c->column_family_data(), &job_context, + *c->mutable_cf_options()); } c->ReleaseCompactionFiles(s); c->ReleaseInputs(); @@ -2070,7 +2068,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, mutex_.Unlock(); status = compaction_job.Run(); mutex_.Lock(); - status = compaction_job.Install(status, &mutex_); + compaction_job.Install(&status, &mutex_); if (status.ok()) { InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); From dd726a59efc51b4bd4566fb6c578634a34306747 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 10 Nov 2014 11:01:28 -0800 Subject: [PATCH 450/829] Bump Version Number to 3.8 Summary: As tittle. Test Plan: Not needed. 
Reviewers: ljin, igor, yhchiang, rven, fpi Reviewed By: fpi Subscribers: leveldb, fpi, dhruba Differential Revision: https://reviews.facebook.net/D28629 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index bef989661..fde546c89 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 7 +#define ROCKSDB_MINOR 8 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From c7ee9c3ab72c8bd18832e07e1bb5cc3583754036 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 10 Nov 2014 17:39:38 -0500 Subject: [PATCH 451/829] Fix -Wnon-virtual-dtor errors Summary: This breaks mongo+rocks build https://mci.10gen.com/task_log_raw/rocksdb_ubuntu1404_rocksdb_c6e8e3d868660dc66b3bbd438cdc135df6356c5a_14_11_10_21_36_10_compile_ubuntu1404_rocksdb/0?type=T Test Plan: m check + -Wnon-virtual-dtor Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28653 --- Makefile | 2 +- db/wal_manager.h | 4 ++-- include/rocksdb/listener.h | 1 + util/arena.h | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index b8913108f..ff8dcd3c3 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ install: @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow -Wnon-virtual-dtor CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual diff --git a/db/wal_manager.h b/db/wal_manager.h index 493c426e3..fc04863b2 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -38,9 +38,9 @@ class WalManager { env_(db_options.env), purge_wal_files_last_run_(0) {} - virtual Status GetSortedWalFiles(VectorLogPtr& files); + Status GetSortedWalFiles(VectorLogPtr& files); - virtual Status GetUpdatesSince( + Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options, VersionSet* version_set); diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 33e5fc51f..4ad1ae04b 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -58,6 +58,7 @@ class EventListener { const std::string& file_path, bool triggered_writes_slowdown, bool triggered_writes_stop) {} + virtual ~EventListener() {} }; } // namespace rocksdb diff --git a/util/arena.h b/util/arena.h index 0855c205c..dfd8e2b24 100644 --- a/util/arena.h +++ b/util/arena.h @@ -69,7 +69,7 @@ class Arena { // If an allocation is too big, we'll allocate an irregular block with the // same size of that allocation. - virtual size_t IrregularBlockNum() const { return irregular_block_num; } + size_t IrregularBlockNum() const { return irregular_block_num; } size_t BlockSize() const { return kBlockSize; } From d88568c68db147e172e3d7fb5e3654f5c3760c61 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 10 Nov 2014 17:41:51 -0500 Subject: [PATCH 452/829] Move -Wnon-virtual-dtor to c++ flags --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ff8dcd3c3..55053be13 100644 --- a/Makefile +++ b/Makefile @@ -69,9 +69,9 @@ install: @[ ! 
-e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib #------------------------------------------------- -WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow -Wnon-virtual-dtor +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual +CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor LDFLAGS += $(PLATFORM_LDFLAGS) From 35c8c814e895ee36218e1dc7b06bbdad2cef132e Mon Sep 17 00:00:00 2001 From: Tomislav Novak Date: Fri, 26 Sep 2014 14:20:24 -0700 Subject: [PATCH 453/829] Make ForwardIterator::status() more efficient Summary: In D19581 I made `ForwardIterator::status()` check all child iterators, including immutable ones. It's, however, not necessary to do it every time -- it'll suffice to check only when they're used and their status could change. This diff: * introduces `immutable_status_` which is updated by `Seek()` and `Next()` * removes special handling of `kIncomplete` status in those methods Test Plan: * `db_test` * hacked ReadSequential in db_bench.cc to check `status()` in addition to validity: ``` $ ./db_bench -use_existing_db -benchmarks readseq -disable_auto_compactions \ -use_tailing_iterator # without this patch Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 [...] DB path: [/dev/shm/rocksdbtest/dbbench] readseq : 0.562 micros/op 1778103 ops/sec; 98.4 MB/s $ ./db_bench -use_existing_db -benchmarks readseq -disable_auto_compactions \ -use_tailing_iterator # with the patch readseq : 0.433 micros/op 2311363 ops/sec; 127.8 MB/s ``` Reviewers: igor, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D24063 --- db/db_bench.cc | 5 +++- db/forward_iterator.cc | 55 ++++++++++++------------------------------ db/forward_iterator.h | 1 + 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 79572e875..a11b9cb5d 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2220,7 +2220,10 @@ class Benchmark { } void ReadSequential(ThreadState* thread, DB* db) { - Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + ReadOptions options(FLAGS_verify_checksum, true); + options.tailing = FLAGS_use_tailing_iterator; + + Iterator* iter = db->NewIterator(options); int64_t i = 0; int64_t bytes = 0; for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 635678160..a9a98073b 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -125,6 +125,8 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, sv_(current_sv), mutable_iter_(nullptr), current_(nullptr), + status_(Status::OK()), + immutable_status_(Status::OK()), valid_(false), is_prev_set_(false), is_prev_inclusive_(false) { @@ -177,7 +179,7 @@ void ForwardIterator::SeekToFirst() { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { RebuildIterators(true); - } else if (status_.IsIncomplete()) { + } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } SeekInternal(Slice(), true); @@ -187,7 +189,7 @@ void ForwardIterator::Seek(const Slice& internal_key) { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { RebuildIterators(true); - } else if 
(status_.IsIncomplete()) { + } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } SeekInternal(internal_key, false); @@ -205,13 +207,16 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, // if it turns to need to seek immutable often. We probably want to have // an option to turn it off. if (seek_to_first || NeedToSeekImmutable(internal_key)) { + immutable_status_ = Status::OK(); { auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator())); immutable_min_heap_.swap(tmp); } for (auto* m : imm_iters_) { seek_to_first ? m->SeekToFirst() : m->Seek(internal_key); - if (m->Valid()) { + if (!m->status().ok()) { + immutable_status_ = m->status(); + } else if (m->Valid()) { immutable_min_heap_.push(m); } } @@ -235,13 +240,8 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, l0_iters_[i]->Seek(internal_key); } - if (l0_iters_[i]->status().IsIncomplete()) { - // if any of the immutable iterators is incomplete (no-io option was - // used), we are unable to reliably find the smallest key - assert(read_options_.read_tier == kBlockCacheTier); - status_ = l0_iters_[i]->status(); - valid_ = false; - return; + if (!l0_iters_[i]->status().ok()) { + immutable_status_ = l0_iters_[i]->status(); } else if (l0_iters_[i]->Valid()) { immutable_min_heap_.push(l0_iters_[i]); } @@ -311,12 +311,8 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, seek_to_first ? level_iters_[level - 1]->SeekToFirst() : level_iters_[level - 1]->Seek(internal_key); - if (level_iters_[level - 1]->status().IsIncomplete()) { - // see above - assert(read_options_.read_tier == kBlockCacheTier); - status_ = level_iters_[level - 1]->status(); - valid_ = false; - return; + if (!level_iters_[level - 1]->status().ok()) { + immutable_status_ = level_iters_[level - 1]->status(); } else if (level_iters_[level - 1]->Valid()) { immutable_min_heap_.push(level_iters_[level - 1]); } @@ -371,11 +367,8 @@ void ForwardIterator::Next() { current_->Next(); if (current_ != mutable_iter_) { - if (current_->status().IsIncomplete()) { - assert(read_options_.read_tier == kBlockCacheTier); - status_ = current_->status(); - valid_ = false; - return; + if (!current_->status().ok()) { + immutable_status_ = current_->status(); } else if (current_->Valid()) { immutable_min_heap_.push(current_); } @@ -401,23 +394,7 @@ Status ForwardIterator::status() const { return mutable_iter_->status(); } - for (auto *it : imm_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - for (auto *it : l0_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - for (auto *it : level_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - - return Status::OK(); + return immutable_status_; } void ForwardIterator::RebuildIterators(bool refresh_sv) { @@ -511,7 +488,7 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { // 'target' belongs to that interval (immutable_min_heap_.top() is already // at the correct position). 
- if (!valid_ || !current_ || !is_prev_set_) { + if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) { return true; } Slice prev_key = prev_key_.GetKey(); diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 537dc1352..ccc23ebaa 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -97,6 +97,7 @@ class ForwardIterator : public Iterator { Iterator* current_; // internal iterator status Status status_; + Status immutable_status_; bool valid_; IterKey prev_key_; From 8e5547f64f5d0581027093d0d4b5fc18e43a5d49 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 9 Nov 2014 20:08:35 +0100 Subject: [PATCH 454/829] [RocksJava] Makefile restructured --- java/Makefile | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/java/Makefile b/java/Makefile index 14469849b..37d704428 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,4 +1,38 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.DBOptions org.rocksdb.WriteBatch org.rocksdb.WriteBatch.Handler org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.ComparatorOptions org.rocksdb.AbstractComparator org.rocksdb.Comparator org.rocksdb.DirectComparator org.rocksdb.AbstractSlice org.rocksdb.Slice org.rocksdb.DirectSlice org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig org.rocksdb.ColumnFamilyHandle org.rocksdb.MergeOperator org.rocksdb.StringAppendOperator +NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractSlice\ + org.rocksdb.BackupableDB\ + org.rocksdb.BackupableDBOptions\ + org.rocksdb.BlockBasedTableConfig\ + org.rocksdb.BloomFilter\ + org.rocksdb.ColumnFamilyHandle\ + org.rocksdb.Comparator\ + org.rocksdb.ComparatorOptions\ + org.rocksdb.DBOptions\ + org.rocksdb.DirectComparator\ + org.rocksdb.DirectSlice\ + org.rocksdb.Filter\ + org.rocksdb.GenericRateLimiterConfig\ + org.rocksdb.HashLinkedListMemTableConfig\ + org.rocksdb.HashSkipListMemTableConfig\ + org.rocksdb.MergeOperator\ + org.rocksdb.Options\ + org.rocksdb.PlainTableConfig\ + org.rocksdb.ReadOptions\ + org.rocksdb.RestoreBackupableDB\ + org.rocksdb.RestoreOptions\ + org.rocksdb.RocksDB\ + org.rocksdb.RocksEnv\ + org.rocksdb.RocksIterator\ + org.rocksdb.SkipListMemTableConfig\ + org.rocksdb.Slice\ + org.rocksdb.Statistics\ + org.rocksdb.VectorMemTableConfig\ + org.rocksdb.StringAppendOperator\ + org.rocksdb.WriteBatch\ + org.rocksdb.WriteBatch.Handler\ + org.rocksdb.WriteBatchInternal\ + org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteOptions\ ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -43,6 +77,7 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DBOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest + java -ea 
-Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FlushTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest From fc6fcbab9ef40af4167cac4240452fe19721aa27 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 9 Nov 2014 20:09:39 +0100 Subject: [PATCH 455/829] [RocksJava] Flush functionality RocksJava now supports also flush functionality of RocksDB. --- java/Makefile | 1 + java/org/rocksdb/FlushOptions.java | 51 +++++++++++++++++ java/org/rocksdb/RocksDB.java | 39 ++++++++++++- java/org/rocksdb/test/FlushTest.java | 47 ++++++++++++++++ java/rocksjni/options.cc | 82 +++++++++++++++++++++++----- java/rocksjni/portal.h | 28 ++++++++++ java/rocksjni/rocksjni.cc | 45 +++++++++++++++ 7 files changed, 277 insertions(+), 16 deletions(-) create mode 100644 java/org/rocksdb/FlushOptions.java create mode 100644 java/org/rocksdb/test/FlushTest.java diff --git a/java/Makefile b/java/Makefile index 37d704428..9fd714ee9 100644 --- a/java/Makefile +++ b/java/Makefile @@ -10,6 +10,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.DBOptions\ org.rocksdb.DirectComparator\ org.rocksdb.DirectSlice\ + org.rocksdb.FlushOptions\ org.rocksdb.Filter\ org.rocksdb.GenericRateLimiterConfig\ org.rocksdb.HashLinkedListMemTableConfig\ diff --git a/java/org/rocksdb/FlushOptions.java b/java/org/rocksdb/FlushOptions.java new file mode 100644 index 000000000..e481c7664 --- /dev/null +++ b/java/org/rocksdb/FlushOptions.java @@ -0,0 +1,51 @@ +package org.rocksdb; + +/** + * FlushOptions to be passed to flush operations of + * {@link org.rocksdb.RocksDB}. + */ +public class FlushOptions extends RocksObject { + + /** + * Construct a new instance of FlushOptions. + */ + public FlushOptions(){ + super(); + newFlushOptions(); + } + + /** + * Set if the flush operation shall block until it terminates. + * + * @param waitForFlush boolean value indicating if the flush + * operations waits for termination of the flush process. + * + * @return instance of current FlushOptions. + */ + public FlushOptions setWaitForFlush(boolean waitForFlush) { + assert(isInitialized()); + waitForFlush(nativeHandle_); + return this; + } + + /** + * Wait for flush to finished. + * + * @return boolean value indicating if the flush operation + * waits for termination of the flush process. + */ + public boolean waitForFlush() { + assert(isInitialized()); + return waitForFlush(nativeHandle_); + } + + @Override protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void newFlushOptions(); + private native void disposeInternal(long handle); + private native void setWaitForFlush(long handle, + boolean wait); + private native boolean waitForFlush(long handle); +} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 40680e438..8efdaea1f 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1085,6 +1085,40 @@ public class RocksDB extends RocksObject { dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_); } + /** + *

<p>Flush all memory table data.</p> + * + * <p>Note: it must be ensured that the FlushOptions instance + * is not GC'ed before this method finishes. If the wait parameter is + * set to false, flush processing is asynchronous.</p>
            + * + * @param flushOptions {@link org.rocksdb.FlushOptions} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void flush(FlushOptions flushOptions) + throws RocksDBException { + flush(nativeHandle_, flushOptions.nativeHandle_); + } + + /** + *

<p>Flush all memory table data.</p> + * + * <p>Note: it must be ensured that the FlushOptions instance + * is not GC'ed before this method finishes. If the wait parameter is + * set to false, flush processing is asynchronous.</p>
            + * + * @param flushOptions {@link org.rocksdb.FlushOptions} instance. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void flush(FlushOptions flushOptions, + ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { + flush(nativeHandle_, flushOptions.nativeHandle_, + columnFamilyHandle.nativeHandle_); + } + /** * Private constructor. */ @@ -1197,10 +1231,13 @@ public class RocksDB extends RocksObject { protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); private native void disposeInternal(long handle); - private native long createColumnFamily(long handle, long opt_handle, String name) throws RocksDBException; private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; + private native void flush(long handle, long flushOptHandle) + throws RocksDBException; + private native void flush(long handle, long flushOptHandle, + long cfHandle) throws RocksDBException; protected Options options_; } diff --git a/java/org/rocksdb/test/FlushTest.java b/java/org/rocksdb/test/FlushTest.java new file mode 100644 index 000000000..1742be67f --- /dev/null +++ b/java/org/rocksdb/test/FlushTest.java @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb.test; + +import org.rocksdb.*; + +public class FlushTest { + + static final String db_path = "/tmp/rocksdbjni_flush_test"; + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) { + RocksDB db = null; + Options options = new Options(); + WriteOptions wOpt = new WriteOptions(); + FlushOptions flushOptions = new FlushOptions(); + + try { + // Setup options + options.setCreateIfMissing(true); + options.setMaxWriteBufferNumber(10); + options.setMinWriteBufferNumberToMerge(10); + flushOptions.setWaitForFlush(true); + wOpt.setDisableWAL(true); + db = RocksDB.open(options, db_path); + + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assert(db.getProperty("rocksdb.num-entries-active-mem-table").equals("4")); + db.flush(flushOptions); + assert(db.getProperty("rocksdb.num-entries-active-mem-table").equals("0")); + } catch (RocksDBException e) { + assert(false); + } + + db.close(); + options.dispose(); + wOpt.dispose(); + flushOptions.dispose(); + } +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index ee0255d80..de614594f 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -18,9 +18,11 @@ #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" +#include "include/org_rocksdb_FlushOptions.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" + #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" @@ -3607,6 +3609,32 @@ jboolean Java_org_rocksdb_ReadOptions_tailing( return reinterpret_cast(jhandle)->tailing; } +/* + * Class: org_rocksdb_ReadOptions + * Method: setSnapshot + * Signature: (JJ)V + */ +void 
Java_org_rocksdb_ReadOptions_setSnapshot( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) { + reinterpret_cast(jhandle)->snapshot = + reinterpret_cast(jsnapshot); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: snapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_snapshot( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto& snapshot = + reinterpret_cast(jhandle)->snapshot; + return reinterpret_cast(snapshot); +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::ComparatorOptions + /* * Class: org_rocksdb_ComparatorOptions * Method: newComparatorOptions @@ -3651,25 +3679,49 @@ void Java_org_rocksdb_ComparatorOptions_disposeInternal( rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); } +///////////////////////////////////////////////////////////////////// +// rocksdb::FlushOptions + /* - * Class: org_rocksdb_ReadOptions - * Method: setSnapshot - * Signature: (JJ)V + * Class: org_rocksdb_FlushOptions + * Method: newFlushOptions + * Signature: ()V */ -void Java_org_rocksdb_ReadOptions_setSnapshot( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) { - reinterpret_cast(jhandle)->snapshot = - reinterpret_cast(jsnapshot); +void Java_org_rocksdb_FlushOptions_newFlushOptions( + JNIEnv* env, jobject jobj) { + auto flush_opt = new rocksdb::FlushOptions(); + rocksdb::FlushOptionsJni::setHandle(env, jobj, flush_opt); } /* - * Class: org_rocksdb_ReadOptions - * Method: snapshot - * Signature: (J)J + * Class: org_rocksdb_FlushOptions + * Method: setWaitForFlush + * Signature: (JZ)V */ -jlong Java_org_rocksdb_ReadOptions_snapshot( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto& snapshot = - reinterpret_cast(jhandle)->snapshot; - return reinterpret_cast(snapshot); +void Java_org_rocksdb_FlushOptions_setWaitForFlush( + JNIEnv * env, jobject jobj, jlong jhandle, jboolean jwait) { + reinterpret_cast(jhandle) + ->wait = static_cast(jwait); +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: waitForFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_FlushOptions_waitForFlush( + JNIEnv * env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->wait; +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_FlushOptions_disposeInternal( + JNIEnv * env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::FlushOptionsJni::setHandle(env, jobj, nullptr); } diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 54b3b2766..3a5641d46 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -505,6 +505,34 @@ class ColumnFamilyHandleJni { } }; +class FlushOptionsJni { + public: + // Get the java class id of org.rocksdb.FlushOptions. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/FlushOptions"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the field id of the member variable of org.rocksdb.FlushOptions + // that stores the pointer to rocksdb::FlushOptions. + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Pass the FlushOptions pointer to the java side. 
+ static void setHandle( + JNIEnv* env, jobject jobj, + const rocksdb::FlushOptions* op) { + env->SetLongField( + jobj, getHandleFieldID(env), + reinterpret_cast(op)); + } +}; + class ComparatorOptionsJni { public: // Get the java class id of org.rocksdb.ComparatorOptions. diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index b17f9bab7..3b00cbe42 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1255,3 +1255,48 @@ jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I( return env->NewStringUTF(property_value.data()); } + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Flush + +void rocksdb_flush_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::FlushOptions& flush_options, + rocksdb::ColumnFamilyHandle* column_family_handle) { + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Flush(flush_options, column_family_handle); + } else { + s = db->Flush(flush_options); + } + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: flush + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_flush__JJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jflush_options) { + auto db = reinterpret_cast(jdb_handle); + auto flush_options = reinterpret_cast(jflush_options); + rocksdb_flush_helper(env, db, *flush_options, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: flush + * Signature: (JJJ)V + */ +void Java_org_rocksdb_RocksDB_flush__JJJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jflush_options, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto flush_options = reinterpret_cast(jflush_options); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_flush_helper(env, db, *flush_options, cf_handle); +} + From 113796c49318eb230fc5451fb106be5f0733cbec Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 11 Nov 2014 06:58:47 -0800 Subject: [PATCH 456/829] Fix NewFileNumber() Summary: I mistakenly changed the behavior to ++next_file_number_ instead of next_file_number_++, as it should have been: https://github.com/facebook/rocksdb/blob/344edbb044ff5c08a43e4a6e9344c5c861552c0e/db/version_set.h#L539 Test Plan: none. not sure if this would break anything. It's just different behavior, so I'd rather not risk Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28557 --- db/db_impl.cc | 2 -- db/version_set.h | 11 +---------- include/rocksdb/options.h | 11 ++++------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 893dfdee7..ee0f954fd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3003,8 +3003,6 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, if (!s.ok()) { // how do we fail if we're not creating new log? assert(creating_new_log); - // Avoid chewing through file number space in a tight loop. 
- versions_->ReuseLogFileNumber(new_log_number); assert(!new_mem); assert(!new_log); return s; diff --git a/db/version_set.h b/db/version_set.h index 0be8c4e1b..3c4eff353 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -535,16 +535,7 @@ class VersionSet { uint64_t current_next_file_number() const { return next_file_number_.load(); } // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_.fetch_add(1) + 1; } - - // Arrange to reuse "file_number" unless a newer file number has - // already been allocated. - // REQUIRES: "file_number" was returned by a call to NewFileNumber(). - void ReuseLogFileNumber(uint64_t file_number) { - auto expected = file_number + 1; - std::atomic_compare_exchange_strong(&next_file_number_, &expected, - file_number); - } + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } // Return the last sequence number. uint64_t LastSequence() const { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 1656c5c41..e22ee03eb 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -625,14 +625,11 @@ struct DBOptions { // Default: false bool error_if_exists; - // If true, the implementation will do aggressive checking of the - // data it is processing and will stop early if it detects any - // errors. This may have unforeseen ramifications: for example, a - // corruption of one DB entry may cause a large number of entries to - // become unreadable or for the entire DB to become unopenable. - // If any of the writes to the database fails (Put, Delete, Merge, Write), - // the database will switch to read-only mode and fail all other + // If true, RocksDB will aggressively check consistency of the data. + // Also, if any of the writes to the database fails (Put, Delete, Merge, + // Write), the database will switch to read-only mode and fail all other // Write operations. + // In most cases you want this to be set to true. // Default: true bool paranoid_checks; From 767777c2bd7bf4be1968dbc35452e556e781ad5f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 11 Nov 2014 16:47:22 -0500 Subject: [PATCH 457/829] Turn on -Wshorten-64-to-32 and fix all the errors Summary: We need to turn on -Wshorten-64-to-32 for mobile. See D1671432 (internal phabricator) for details. This diff turns on the warning flag and fixes all the errors. There were also some interesting errors that I might call bugs, especially in plain table. Going forward, I think it makes sense to have this flag turned on and be very very careful when converting 64-bit to 32-bit variables. 
Test Plan: compiles Reviewers: ljin, rven, yhchiang, sdong Reviewed By: yhchiang Subscribers: bobbaldwin, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28689 --- Makefile | 1 - build_tools/build_detect_platform | 8 ++ db/c.cc | 12 ++- db/c_test.c | 2 +- db/column_family_test.cc | 12 +-- db/compaction.cc | 14 +-- db/compaction.h | 12 +-- db/compaction_job.cc | 15 ++-- db/compaction_picker.cc | 10 +-- db/compaction_picker_test.cc | 10 +-- db/comparator_db_test.cc | 8 +- db/corruption_test.cc | 15 ++-- db/cuckoo_table_db_test.cc | 2 +- db/db_bench.cc | 8 +- db/db_impl.cc | 5 +- db/db_iter_test.cc | 2 +- db/db_test.cc | 27 +++--- db/dbformat.cc | 3 +- db/dbformat.h | 16 ++-- db/file_indexer.cc | 32 +++---- db/file_indexer.h | 14 +-- db/file_indexer_test.cc | 11 ++- db/flush_job.cc | 6 +- db/forward_iterator.cc | 9 +- db/listener_test.cc | 6 +- db/log_and_apply_bench.cc | 9 +- db/log_test.cc | 6 +- db/memtable.cc | 22 ++--- db/merge_test.cc | 54 ++++++------ db/plain_table_db_test.cc | 2 +- db/prefix_test.cc | 6 +- db/skiplist_test.cc | 9 +- db/version_edit.h | 6 +- db/version_edit_test.cc | 3 +- db/version_set.cc | 50 ++++++----- db/version_set.h | 2 +- db/write_batch.cc | 2 +- include/rocksdb/env.h | 12 +-- java/rocksjni/iterator.cc | 15 ++-- java/rocksjni/restorejni.cc | 4 +- java/rocksjni/rocksjni.cc | 29 ++++--- java/rocksjni/slice.cc | 4 +- java/rocksjni/write_batch.cc | 7 +- port/port_posix.h | 86 ++++++++++--------- table/block.cc | 3 +- table/block.h | 3 +- table/block_based_filter_block.cc | 9 +- table/block_based_table_builder.cc | 8 +- table/block_builder.cc | 10 +-- table/block_hash_index.cc | 2 +- table/block_hash_index_test.cc | 4 +- table/block_prefix_index.cc | 2 +- table/block_test.cc | 2 +- table/cuckoo_table_builder.cc | 13 ++- table/cuckoo_table_builder.h | 8 +- table/cuckoo_table_builder_test.cc | 39 +++++---- table/cuckoo_table_factory.h | 2 +- table/cuckoo_table_reader.cc | 9 +- table/cuckoo_table_reader_test.cc | 4 +- table/format.cc | 2 +- table/full_filter_block_test.cc | 4 +- table/merger_test.cc | 11 +-- table/plain_table_builder.cc | 16 ++-- table/plain_table_builder.h | 2 +- table/plain_table_index.cc | 19 ++-- table/plain_table_index.h | 14 +-- table/plain_table_key_coding.cc | 19 ++-- table/plain_table_reader.cc | 11 +-- table/plain_table_reader.h | 4 +- table/table_test.cc | 2 +- tools/blob_store_bench.cc | 2 +- tools/db_stress.cc | 30 ++++--- util/auto_roll_logger.cc | 3 +- util/auto_roll_logger_test.cc | 2 +- util/benchharness.cc | 3 +- util/benchharness_test.cc | 14 +-- util/blob_store.cc | 4 +- util/bloom.cc | 14 +-- util/bloom_test.cc | 3 +- util/cache.cc | 2 +- util/cache_test.cc | 4 +- util/coding.h | 10 +-- util/crc32c.cc | 6 +- util/dynamic_bloom_test.cc | 18 ++-- util/env_posix.cc | 6 +- util/hash.cc | 2 +- util/hash_cuckoo_rep.cc | 7 +- util/hash_linklist_rep.cc | 3 +- util/hash_skiplist_rep.cc | 3 +- util/ldb_cmd.cc | 5 +- util/mock_env.cc | 14 ++- util/murmurhash.h | 2 +- util/mutable_cf_options.cc | 2 +- util/mutable_cf_options.h | 2 +- util/options_builder.cc | 8 +- util/options_helper.cc | 10 +-- util/rate_limiter.cc | 3 +- util/rate_limiter_test.cc | 11 ++- util/status.cc | 4 +- utilities/backupable/backupable_db.cc | 7 +- utilities/geodb/geodb_impl.cc | 6 +- utilities/redis/redis_list_iterator.h | 14 +-- utilities/redis/redis_lists_test.cc | 6 +- utilities/spatialdb/utils.h | 2 +- utilities/ttl/ttl_test.cc | 45 +++++----- .../write_batch_with_index_test.cc | 2 +- 106 files changed, 584 insertions(+), 505 deletions(-) 
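Most of the changes that follow apply one mechanical pattern: wherever a 64-bit value (size_t, uint64_t) was silently assigned to a 32-bit variable, the narrowing is now spelled out with an explicit cast, or the variable's type is widened. A minimal illustration of what -Wshorten-64-to-32 (a Clang warning, combined here with -Werror) rejects and how it is silenced (hypothetical code, not taken from the diff):

#include <cstdint>
#include <vector>

int CountFiles(const std::vector<uint64_t>& file_numbers) {
  // int n = file_numbers.size();                 // warns: size_t (64-bit) -> int (32-bit)
  int n = static_cast<int>(file_numbers.size());  // explicit narrowing, warning silenced
  return n;
}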
diff --git a/Makefile b/Makefile index 55053be13..e5d823f41 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,6 @@ TESTS = \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ cuckoo_table_db_test \ - write_batch_with_index_test \ flush_job_test \ wal_manager_test \ listener_test \ diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index ec243f2be..7abccc8cc 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -284,6 +284,14 @@ EOF fi fi +# Test whether -Wshorten-64-to-32 is available +$CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null <(n), &success, + &new_value_len); new_value->assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { @@ -417,7 +415,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator { size_t new_value_len; char* tmp_new_value = (*partial_merge_)( state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], - operand_count, &success, &new_value_len); + static_cast(operand_count), &success, &new_value_len); new_value->assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { @@ -2041,7 +2039,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level int rocksdb_livefiles_count( const rocksdb_livefiles_t* lf) { - return lf->rep.size(); + return static_cast(lf->rep.size()); } const char* rocksdb_livefiles_name( diff --git a/db/c_test.c b/db/c_test.c index c17267114..4f296f9bd 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -132,7 +132,7 @@ static void CmpDestroy(void* arg) { } static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, size_t blen) { - int n = (alen < blen) ? alen : blen; + size_t n = (alen < blen) ? alen : blen; int r = memcmp(a, b, n); if (r == 0) { if (alen < blen) r = -1; diff --git a/db/column_family_test.cc b/db/column_family_test.cc index b96e66829..69f21a580 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -133,7 +133,7 @@ class ColumnFamilyTest { void CreateColumnFamilies( const std::vector& cfs, const std::vector options = {}) { - int cfi = handles_.size(); + int cfi = static_cast(handles_.size()); handles_.resize(cfi + cfs.size()); names_.resize(cfi + cfs.size()); for (size_t i = 0; i < cfs.size(); ++i) { @@ -231,7 +231,7 @@ class ColumnFamilyTest { snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); result += buf; if (f > 0) { - last_non_zero_offset = result.size(); + last_non_zero_offset = static_cast(result.size()); } } result.resize(last_non_zero_offset); @@ -287,8 +287,8 @@ class ColumnFamilyTest { assert(num_per_cf.size() == handles_.size()); for (size_t i = 0; i < num_per_cf.size(); ++i) { - ASSERT_EQ(num_per_cf[i], - GetProperty(i, "rocksdb.num-immutable-mem-table")); + ASSERT_EQ(num_per_cf[i], GetProperty(static_cast(i), + "rocksdb.num-immutable-mem-table")); } } @@ -916,11 +916,11 @@ TEST(ColumnFamilyTest, DontRollEmptyLogs) { CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); for (size_t i = 0; i < handles_.size(); ++i) { - PutRandomData(i, 10, 100); + PutRandomData(static_cast(i), 10, 100); } int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); // this will trigger the flushes - for (size_t i = 0; i <= 4; ++i) { + for (int i = 0; i <= 4; ++i) { ASSERT_OK(Flush(i)); } diff --git a/db/compaction.cc b/db/compaction.cc index 98de352bc..a29b386b7 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -124,9 +124,9 @@ Compaction::~Compaction() { void Compaction::GenerateFileLevels() { input_levels_.resize(num_input_levels()); - for (int which = 0; which < num_input_levels(); which++) { - DoGenerateLevelFilesBrief( - &input_levels_[which], inputs_[which].files, &arena_); + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); } } @@ -144,7 +144,7 @@ bool Compaction::IsTrivialMove() const { } void Compaction::AddInputDeletions(VersionEdit* out_edit) { - for (int which = 0; which < num_input_levels(); which++) { + for (size_t which = 0; which < num_input_levels(); which++) { for (size_t i = 0; i < inputs_[which].size(); i++) { out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); } @@ -207,7 +207,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) { // Mark (or clear) each file that is being compacted void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { - for (int i = 0; i < num_input_levels(); i++) { + for (size_t i = 0; i < num_input_levels(); i++) { for (unsigned int j = 0; j < inputs_[i].size(); j++) { assert(mark_as_compacted ? 
!inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted); @@ -293,7 +293,7 @@ void Compaction::Summary(char* output, int len) { return; } - for (int level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { if (level_iter > 0) { write += snprintf(output + write, len - write, "], ["); if (write < 0 || write >= len) { @@ -317,7 +317,7 @@ uint64_t Compaction::OutputFilePreallocationSize( if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { preallocation_size = mutable_options.MaxFileSizeForLevel(output_level()); } else { - for (int level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { for (const auto& f : inputs_[level_iter].files) { preallocation_size += f->fd.GetFileSize(); } diff --git a/db/compaction.h b/db/compaction.h index 3a012fb60..b17a4a91b 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -23,7 +23,7 @@ struct CompactionInputFiles { inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } - inline FileMetaData* operator[](int i) const { return files[i]; } + inline FileMetaData* operator[](size_t i) const { return files[i]; } }; class Version; @@ -48,7 +48,7 @@ class Compaction { // Returns the level associated to the specified compaction input level. // If compaction_input_level is not specified, then input_level is set to 0. - int level(int compaction_input_level = 0) const { + int level(size_t compaction_input_level = 0) const { return inputs_[compaction_input_level].level; } @@ -56,7 +56,7 @@ class Compaction { int output_level() const { return output_level_; } // Returns the number of input levels in this compaction. - int num_input_levels() const { return inputs_.size(); } + size_t num_input_levels() const { return inputs_.size(); } // Return the object that holds the edits to the descriptor done // by this compaction. @@ -66,7 +66,7 @@ class Compaction { // compaction input level. // The function will return 0 if when "compaction_input_level" < 0 // or "compaction_input_level" >= "num_input_levels()". - int num_input_files(size_t compaction_input_level) const { + size_t num_input_files(size_t compaction_input_level) const { if (compaction_input_level < inputs_.size()) { return inputs_[compaction_input_level].size(); } @@ -83,7 +83,7 @@ class Compaction { // specified compaction input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and // < "input_levels()" - FileMetaData* input(size_t compaction_input_level, int i) const { + FileMetaData* input(size_t compaction_input_level, size_t i) const { assert(compaction_input_level < inputs_.size()); return inputs_[compaction_input_level][i]; } @@ -98,7 +98,7 @@ class Compaction { } // Returns the LevelFilesBrief of the specified compaction input level. 
- LevelFilesBrief* input_levels(int compaction_input_level) { + LevelFilesBrief* input_levels(size_t compaction_input_level) { return &input_levels_[compaction_input_level]; } diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 6f92a7d03..d816b68dd 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -415,32 +415,33 @@ Status CompactionJob::Run() { } compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros; - compaction_stats_.files_in_leveln = compact_->compaction->num_input_files(0); + compaction_stats_.files_in_leveln = + static_cast(compact_->compaction->num_input_files(0)); compaction_stats_.files_in_levelnp1 = - compact_->compaction->num_input_files(1); + static_cast(compact_->compaction->num_input_files(1)); MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); - int num_output_files = compact_->outputs.size(); + size_t num_output_files = compact_->outputs.size(); if (compact_->builder != nullptr) { // An error occurred so ignore the last output. assert(num_output_files > 0); --num_output_files; } - compaction_stats_.files_out_levelnp1 = num_output_files; + compaction_stats_.files_out_levelnp1 = static_cast(num_output_files); - for (int i = 0; i < compact_->compaction->num_input_files(0); i++) { + for (size_t i = 0; i < compact_->compaction->num_input_files(0); i++) { compaction_stats_.bytes_readn += compact_->compaction->input(0, i)->fd.GetFileSize(); compaction_stats_.num_input_records += static_cast(compact_->compaction->input(0, i)->num_entries); } - for (int i = 0; i < compact_->compaction->num_input_files(1); i++) { + for (size_t i = 0; i < compact_->compaction->num_input_files(1); i++) { compaction_stats_.bytes_readnp1 += compact_->compaction->input(1, i)->fd.GetFileSize(); } - for (int i = 0; i < num_output_files; i++) { + for (size_t i = 0; i < num_output_files; i++) { compaction_stats_.bytes_written += compact_->outputs[i].file_size; } if (compact_->num_input_records > compact_->num_output_records) { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 04d04dc16..20d0e2c74 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -46,7 +46,7 @@ CompressionType GetCompressionType( // If the use has specified a different compression level for each level, // then pick the compression for that level. if (!ioptions.compression_per_level.empty()) { - const int n = ioptions.compression_per_level.size() - 1; + const int n = static_cast(ioptions.compression_per_level.size()) - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file @@ -75,7 +75,7 @@ void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { uint64_t total = 0; for (auto c : compactions_in_progress_[level]) { assert(c->level() == level); - for (int i = 0; i < c->num_input_files(0); i++) { + for (size_t i = 0; i < c->num_input_files(0); i++) { total += c->input(0, i)->compensated_file_size; } } @@ -870,7 +870,8 @@ Compaction* UniversalCompactionPicker::PickCompaction( // If max read amplification is exceeding configured limits, then force // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. 
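The compaction hunks above all apply the same fix: loop indices are widened to size_t so they match the unsigned values now returned by num_input_levels() and num_input_files(), and results are narrowed back to a 32-bit type only at the boundary where a legacy int field is filled in. A minimal sketch of that pattern, using hypothetical InputLevel/Stats types rather than the RocksDB classes:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-ins for the structures touched in the hunks above.
    struct InputLevel { std::vector<uint64_t> file_sizes; };
    struct Stats { int files_in_leveln; };  // legacy 32-bit field

    uint64_t TotalInputBytes(const std::vector<InputLevel>& inputs, Stats* stats) {
      uint64_t total = 0;
      // Index with size_t so the comparison against size() stays same-signed.
      for (size_t which = 0; which < inputs.size(); ++which) {
        for (size_t i = 0; i < inputs[which].file_sizes.size(); ++i) {
          total += inputs[which].file_sizes[i];
        }
      }
      // Narrow only where the 32-bit field requires it, assuming the count
      // fits in an int (the same assumption the patch makes).
      stats->files_in_leveln =
          static_cast<int>(inputs.empty() ? 0 : inputs[0].file_sizes.size());
      return total;
    }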
- unsigned int num_files = level_files.size() - + unsigned int num_files = + static_cast(level_files.size()) - mutable_cf_options.level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( cf_name, mutable_cf_options, vstorage, score, UINT_MAX, @@ -1074,8 +1075,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( if (ratio_to_compress >= 0) { uint64_t total_size = vstorage->NumLevelBytes(kLevel0); uint64_t older_file_size = 0; - for (unsigned int i = files.size() - 1; - i >= first_index_after; i--) { + for (size_t i = files.size() - 1; i >= first_index_after; i--) { older_file_size += files[i]->fd.GetFileSize(); if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { enable_compression = false; diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index a041b20c4..2396d7f85 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -109,7 +109,7 @@ TEST(CompactionPickerTest, Level0Trigger) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name, mutable_cf_options, &vstorage, &log_buffer)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(2, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); } @@ -121,7 +121,7 @@ TEST(CompactionPickerTest, Level1Trigger) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name, mutable_cf_options, &vstorage, &log_buffer)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); } @@ -136,8 +136,8 @@ TEST(CompactionPickerTest, Level1Trigger2) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name, mutable_cf_options, &vstorage, &log_buffer)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1, compaction->num_input_files(0)); - ASSERT_EQ(2, compaction->num_input_files(1)); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); @@ -164,7 +164,7 @@ TEST(CompactionPickerTest, LevelMaxScore) { std::unique_ptr compaction(level_compaction_picker.PickCompaction( cf_name, mutable_cf_options, &vstorage, &log_buffer)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); } diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 548c495cb..e0f842730 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -82,7 +82,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, } int type = rnd->Uniform(2); - int index = rnd->Uniform(source_strings.size()); + int index = rnd->Uniform(static_cast(source_strings.size())); auto& key = source_strings[index]; switch (type) { case 0: @@ -124,7 +124,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, break; case 2: { // Seek to random key - auto key_idx = rnd->Uniform(source_strings.size()); + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); auto key = source_strings[key_idx]; iter->Seek(key); result_iter->Seek(key); @@ -150,7 +150,7 @@ void DoRandomIteraratorTest(DB* 
db, std::vector source_strings, break; default: { assert(type == 5); - auto key_idx = rnd->Uniform(source_strings.size()); + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); auto key = source_strings[key_idx]; std::string result; auto status = db->Get(ReadOptions(), key, &result); @@ -325,7 +325,7 @@ TEST(ComparatorDBTest, SimpleSuffixReverseComparator) { source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); } for (int j = 0; j < 20; j++) { - int prefix_index = rnd.Uniform(source_prefixes.size()); + int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); std::string key = source_prefixes[prefix_index] + test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); source_strings.push_back(key); diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 4fcea0d5a..e73725a63 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -115,8 +115,8 @@ class CorruptionTest { continue; } missed += (key - next_expected); - next_expected = key + 1; - if (iter->value() != Value(key, &value_space)) { + next_expected = static_cast(key + 1); + if (iter->value() != Value(static_cast(key), &value_space)) { bad_values++; } else { correct++; @@ -143,14 +143,14 @@ class CorruptionTest { if (-offset > sbuf.st_size) { offset = 0; } else { - offset = sbuf.st_size + offset; + offset = static_cast(sbuf.st_size + offset); } } if (offset > sbuf.st_size) { - offset = sbuf.st_size; + offset = static_cast(sbuf.st_size); } if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = sbuf.st_size - offset; + bytes_to_corrupt = static_cast(sbuf.st_size - offset); } // Do it @@ -177,7 +177,7 @@ class CorruptionTest { type == filetype && static_cast(number) > picked_number) { // Pick latest file fname = dbname_ + "/" + filenames[i]; - picked_number = number; + picked_number = static_cast(number); } } ASSERT_TRUE(!fname.empty()) << filetype; @@ -246,7 +246,8 @@ TEST(CorruptionTest, RecoverWriteError) { TEST(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); + const int num = + static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; Status s; bool failed = false; diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 4beee59e4..4fff07c46 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -92,7 +92,7 @@ class CuckooTableDBTest { // Return spread of files per level std::string FilesPerLevel() { std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < db_->NumberLevels(); level++) { int f = NumTableFilesAtLevel(level); char buf[100]; diff --git a/db/db_bench.cc b/db/db_bench.cc index a11b9cb5d..c66a1fc1c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -251,7 +251,8 @@ DEFINE_int32(universal_compression_size_percent, -1, DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" "data. 
Negative means use default settings."); -DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, +DEFINE_int32(block_size, + static_cast(rocksdb::BlockBasedTableOptions().block_size), "Number of bytes in a block."); DEFINE_int32(block_restart_interval, @@ -2111,8 +2112,9 @@ class Benchmark { for (uint64_t i = 0; i < num_; ++i) { values_[i] = i; } - std::shuffle(values_.begin(), values_.end(), - std::default_random_engine(FLAGS_seed)); + std::shuffle( + values_.begin(), values_.end(), + std::default_random_engine(static_cast(FLAGS_seed))); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index ee0f954fd..acbd213b6 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2252,7 +2252,7 @@ SuperVersion* DBImpl::InstallSuperVersion( MaybeScheduleFlushOrCompaction(); // Update max_total_in_memory_state_ - auto old_memtable_size = 0; + size_t old_memtable_size = 0; if (old) { old_memtable_size = old->mutable_cf_options.write_buffer_size * old->mutable_cf_options.max_write_buffer_number; @@ -2920,7 +2920,8 @@ Status DBImpl::DelayWrite(uint64_t expiration_time) { auto delay = write_controller_.GetDelay(); if (write_controller_.IsStopped() == false && delay > 0) { mutex_.Unlock(); - env_->SleepForMicroseconds(delay); + // hopefully we don't have to sleep more than 2 billion microseconds + env_->SleepForMicroseconds(static_cast(delay)); mutex_.Lock(); } diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index a84fd55b7..79623ce17 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -19,7 +19,7 @@ namespace rocksdb { -static uint32_t TestGetTickerCount(const Options& options, +static uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); } diff --git a/db/db_test.cc b/db/db_test.cc index 4ae34ff9c..eed7af41c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -668,7 +668,7 @@ class DBTest { void CreateColumnFamilies(const std::vector& cfs, const Options& options) { ColumnFamilyOptions cf_opts(options); - int cfi = handles_.size(); + size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); @@ -933,7 +933,7 @@ class DBTest { int num_levels = (cf == 0) ? 
db_->NumberLevels() : db_->NumberLevels(handles_[1]); std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < num_levels; level++) { int f = NumTableFilesAtLevel(level, cf); char buf[100]; @@ -947,7 +947,7 @@ class DBTest { return result; } - int CountFiles() { + size_t CountFiles() { std::vector files; env_->GetChildren(dbname_, &files); @@ -956,10 +956,10 @@ class DBTest { env_->GetChildren(last_options_.wal_dir, &logfiles); } - return static_cast(files.size() + logfiles.size()); + return files.size() + logfiles.size(); } - int CountLiveFiles() { + size_t CountLiveFiles() { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); return metadata.size(); @@ -4326,7 +4326,8 @@ TEST(DBTest, RepeatedWritesToSameKey) { options.num_levels + options.level0_stop_writes_trigger; Random rnd(301); - std::string value = RandomString(&rnd, 2 * options.write_buffer_size); + std::string value = + RandomString(&rnd, static_cast(2 * options.write_buffer_size)); for (int i = 0; i < 5 * kMaxFiles; i++) { ASSERT_OK(Put(1, "key", value)); ASSERT_LE(TotalTableFiles(1), kMaxFiles); @@ -4657,7 +4658,7 @@ TEST(DBTest, CompactionFilterDeletesAll) { // this will produce empty file (delete compaction filter) ASSERT_OK(db_->CompactRange(nullptr, nullptr)); - ASSERT_EQ(0, CountLiveFiles()); + ASSERT_EQ(0U, CountLiveFiles()); Reopen(options); @@ -5845,7 +5846,7 @@ TEST(DBTest, DropWrites) { ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); - const int num_files = CountFiles(); + const size_t num_files = CountFiles(); // Force out-of-space errors env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); @@ -6031,7 +6032,7 @@ TEST(DBTest, FilesDeletedAfterCompaction) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v2")); Compact(1, "a", "z"); - const int num_files = CountLiveFiles(); + const size_t num_files = CountLiveFiles(); for (int i = 0; i < 10; i++) { ASSERT_OK(Put(1, "foo", "v2")); Compact(1, "a", "z"); @@ -6504,7 +6505,7 @@ TEST(DBTest, FlushOneColumnFamily) { ASSERT_OK(Put(6, "alyosha", "alyosha")); ASSERT_OK(Put(7, "popovich", "popovich")); - for (size_t i = 0; i < 8; ++i) { + for (int i = 0; i < 8; ++i) { Flush(i); auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), i + 1U); @@ -6848,8 +6849,8 @@ TEST(DBTest, TransactionLogIteratorCorruptedLog) { // than 1025 entries auto iter = OpenTransactionLogIter(0); int count; - int last_sequence_read = ReadRecords(iter, count); - ASSERT_LT(last_sequence_read, 1025); + SequenceNumber last_sequence_read = ReadRecords(iter, count); + ASSERT_LT(last_sequence_read, 1025U); // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); ExpectRecords(1, iter2); @@ -8358,7 +8359,7 @@ TEST(DBTest, CompactFilesOnLevelCompaction) { ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); - int output_level = cf_meta.levels.size() - 1; + int output_level = static_cast(cf_meta.levels.size()) - 1; for (int file_picked = 5; file_picked > 0; --file_picked) { std::set overlapping_file_names; std::vector compaction_input_file_names; diff --git a/db/dbformat.cc b/db/dbformat.cc index 4c8908fd7..f0bd9d01e 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -137,7 +137,8 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { dst = new char[needed]; } start_ = dst; - dst = EncodeVarint32(dst, usize + 8); + // NOTE: We 
don't support users keys of more than 2GB :) + dst = EncodeVarint32(dst, static_cast(usize + 8)); kstart_ = dst; memcpy(dst, _user_key.data(), usize); dst += usize; diff --git a/db/dbformat.h b/db/dbformat.h index 5a6928e49..9c7c8dcf1 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -206,13 +206,19 @@ class LookupKey { ~LookupKey(); // Return a key suitable for lookup in a MemTable. - Slice memtable_key() const { return Slice(start_, end_ - start_); } + Slice memtable_key() const { + return Slice(start_, static_cast(end_ - start_)); + } // Return an internal key (suitable for passing to an internal iterator) - Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + Slice internal_key() const { + return Slice(kstart_, static_cast(end_ - kstart_)); + } // Return the user key - Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + Slice user_key() const { + return Slice(kstart_, static_cast(end_ - kstart_ - 8)); + } private: // We construct a char array of the form: @@ -319,8 +325,8 @@ class IterKey { void EncodeLengthPrefixedKey(const Slice& key) { auto size = key.size(); - EnlargeBufferIfNeeded(size + VarintLength(size)); - char* ptr = EncodeVarint32(key_, size); + EnlargeBufferIfNeeded(size + static_cast(VarintLength(size))); + char* ptr = EncodeVarint32(key_, static_cast(size)); memcpy(ptr, key.data(), size); } diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 8c0ca043e..c59036bd6 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -17,17 +17,16 @@ namespace rocksdb { FileIndexer::FileIndexer(const Comparator* ucmp) : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} -uint32_t FileIndexer::NumLevelIndex() const { - return next_level_index_.size(); -} +size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } -uint32_t FileIndexer::LevelIndexSize(uint32_t level) const { +size_t FileIndexer::LevelIndexSize(size_t level) const { return next_level_index_[level].num_index; } -void FileIndexer::GetNextLevelIndex( - const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound) const { +void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, + int32_t* right_bound) const { assert(level > 0); // Last level, no hint @@ -69,7 +68,7 @@ void FileIndexer::GetNextLevelIndex( assert(*right_bound <= level_rb_[level + 1]); } -void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, +void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels, std::vector* const files) { if (files == nullptr) { return; @@ -90,11 +89,11 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, } // L1 - Ln-1 - for (uint32_t level = 1; level < num_levels_ - 1; ++level) { + for (size_t level = 1; level < num_levels_ - 1; ++level) { const auto& upper_files = files[level]; - const int32_t upper_size = upper_files.size(); + const int32_t upper_size = static_cast(upper_files.size()); const auto& lower_files = files[level + 1]; - level_rb_[level] = upper_files.size() - 1; + level_rb_[level] = static_cast(upper_files.size()) - 1; if (upper_size == 0) { continue; } @@ -129,7 +128,8 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; }); } - level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1; + level_rb_[num_levels_ - 1] = + static_cast(files[num_levels_ 
- 1].size()) - 1; } void FileIndexer::CalculateLB( @@ -137,8 +137,8 @@ void FileIndexer::CalculateLB( const std::vector& lower_files, IndexLevel* index_level, std::function cmp_op, std::function set_index) { - const int32_t upper_size = upper_files.size(); - const int32_t lower_size = lower_files.size(); + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); int32_t upper_idx = 0; int32_t lower_idx = 0; @@ -175,8 +175,8 @@ void FileIndexer::CalculateRB( const std::vector& lower_files, IndexLevel* index_level, std::function cmp_op, std::function set_index) { - const int32_t upper_size = upper_files.size(); - const int32_t lower_size = lower_files.size(); + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); int32_t upper_idx = upper_size - 1; int32_t lower_idx = lower_size - 1; diff --git a/db/file_indexer.h b/db/file_indexer.h index 0c5dea92e..e673499ac 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -42,19 +42,19 @@ class FileIndexer { public: explicit FileIndexer(const Comparator* ucmp); - uint32_t NumLevelIndex() const; + size_t NumLevelIndex() const; - uint32_t LevelIndexSize(uint32_t level) const; + size_t LevelIndexSize(size_t level) const; // Return a file index range in the next level to search for a key based on // smallest and largest key comparision for the current file specified by // level and file_index. When *left_index < *right_index, both index should // be valid and fit in the vector size. - void GetNextLevelIndex( - const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound) const; + void GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, const int cmp_largest, + int32_t* left_bound, int32_t* right_bound) const; - void UpdateIndex(Arena* arena, const uint32_t num_levels, + void UpdateIndex(Arena* arena, const size_t num_levels, std::vector* const files); enum { @@ -62,7 +62,7 @@ class FileIndexer { }; private: - uint32_t num_levels_; + size_t num_levels_; const Comparator* ucmp_; struct IndexUnit { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 41afe8475..69aaa386f 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -22,8 +22,15 @@ class IntComparator : public Comparator { int Compare(const Slice& a, const Slice& b) const { assert(a.size() == 8); assert(b.size() == 8); - return *reinterpret_cast(a.data()) - - *reinterpret_cast(b.data()); + int64_t diff = *reinterpret_cast(a.data()) - + *reinterpret_cast(b.data()); + if (diff < 0) { + return -1; + } else if (diff == 0) { + return 0; + } else { + return 1; + } } const char* Name() const { diff --git a/db/flush_job.cc b/db/flush_job.cc index 74daf240b..10bd6f96b 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -150,9 +150,9 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, memtables.push_back(m->NewIterator(ro, &arena)); } { - ScopedArenaIterator iter(NewMergingIterator(&cfd_->internal_comparator(), - &memtables[0], - memtables.size(), &arena)); + ScopedArenaIterator iter( + NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], + static_cast(memtables.size()), &arena)); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", cfd_->GetName().c_str(), meta.fd.GetNumber()); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 
a9a98073b..7fd625a00 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -264,10 +264,11 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (search_left_bound == search_right_bound) { f_idx = search_left_bound; } else if (search_left_bound < search_right_bound) { - f_idx = FindFileInRange( - level_files, internal_key, search_left_bound, - search_right_bound == FileIndexer::kLevelMaxIndex ? - level_files.size() : search_right_bound); + f_idx = + FindFileInRange(level_files, internal_key, search_left_bound, + search_right_bound == FileIndexer::kLevelMaxIndex + ? static_cast(level_files.size()) + : search_right_bound); } else { // search_left_bound > search_right_bound // There are only 2 cases this can happen: diff --git a/db/listener_test.cc b/db/listener_test.cc index f39ac93eb..35e00c94a 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -59,7 +59,7 @@ class EventListenerTest { const ColumnFamilyOptions* options = nullptr) { ColumnFamilyOptions cf_opts; cf_opts = ColumnFamilyOptions(Options()); - int cfi = handles_.size(); + size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); @@ -188,7 +188,7 @@ TEST(EventListenerTest, OnSingleDBFlushTest) { ASSERT_OK(Put(6, "alyosha", "alyosha")); ASSERT_OK(Put(7, "popovich", "popovich")); for (size_t i = 1; i < 8; ++i) { - Flush(i); + Flush(static_cast(i)); dbfull()->TEST_WaitForFlushMemTable(); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); @@ -218,7 +218,7 @@ TEST(EventListenerTest, MultiCF) { ASSERT_OK(Put(6, "alyosha", "alyosha")); ASSERT_OK(Put(7, "popovich", "popovich")); for (size_t i = 1; i < 8; ++i) { - Flush(i); + Flush(static_cast(i)); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index eba0a2787..417a2a8d7 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -6,6 +6,11 @@ #include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include "util/testharness.h" #include "util/benchharness.h" #include "db/version_set.h" @@ -14,9 +19,9 @@ namespace rocksdb { -std::string MakeKey(unsigned int num) { +std::string MakeKey(uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%016u", num); + snprintf(buf, sizeof(buf), "%016" PRIu64, num); return std::string(buf); } diff --git a/db/log_test.cc b/db/log_test.cc index 6577a6a9c..8086e2775 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -558,9 +558,9 @@ TEST(LogTest, ErrorJoinsRecords) { ASSERT_EQ("correct", Read()); ASSERT_EQ("EOF", Read()); - const unsigned int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); + size_t dropped = DroppedBytes(); + ASSERT_LE(dropped, 2 * kBlockSize + 100); + ASSERT_GE(dropped, 2 * kBlockSize); } TEST(LogTest, ReadStart) { diff --git a/db/memtable.cc b/db/memtable.cc index 8d9d99d7e..98212a61b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -188,7 +188,7 @@ KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { // into this scratch space. 
const char* EncodeKey(std::string* scratch, const Slice& target) { scratch->clear(); - PutVarint32(scratch, target.size()); + PutVarint32(scratch, static_cast(target.size())); scratch->append(target.data(), target.size()); return scratch->data(); } @@ -288,12 +288,12 @@ void MemTable::Add(SequenceNumber s, ValueType type, // key bytes : char[internal_key.size()] // value_size : varint32 of value.size() // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; + uint32_t key_size = static_cast(key.size()); + uint32_t val_size = static_cast(value.size()); + uint32_t internal_key_size = key_size + 8; + const uint32_t encoded_len = VarintLength(internal_key_size) + + internal_key_size + VarintLength(val_size) + + val_size; char* buf = nullptr; KeyHandle handle = table_->Allocate(encoded_len, &buf); assert(buf != nullptr); @@ -502,8 +502,8 @@ void MemTable::Update(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_size = prev_value.size(); - uint32_t new_size = value.size(); + uint32_t prev_size = static_cast(prev_value.size()); + uint32_t new_size = static_cast(value.size()); // Update value, if new value size <= previous value size if (new_size <= prev_size ) { @@ -560,10 +560,10 @@ bool MemTable::UpdateCallback(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_size = prev_value.size(); + uint32_t prev_size = static_cast(prev_value.size()); char* prev_buffer = const_cast(prev_value.data()); - uint32_t new_prev_size = prev_size; + uint32_t new_prev_size = prev_size; std::string str_value; WriteLock wl(GetLock(lkey.user_key())); diff --git a/db/merge_test.cc b/db/merge_test.cc index 249e96ad7..1d7800883 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -23,15 +23,11 @@ using namespace std; using namespace rocksdb; namespace { - int numMergeOperatorCalls; - void resetNumMergeOperatorCalls() { - numMergeOperatorCalls = 0; - } +size_t num_merge_operator_calls; +void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; } - int num_partial_merge_calls; - void resetNumPartialMergeCalls() { - num_partial_merge_calls = 0; - } +size_t num_partial_merge_calls; +void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; } } class CountMergeOperator : public AssociativeMergeOperator { @@ -45,7 +41,7 @@ class CountMergeOperator : public AssociativeMergeOperator { const Slice& value, std::string* new_value, Logger* logger) const override { - ++numMergeOperatorCalls; + ++num_merge_operator_calls; if (existing_value == nullptr) { new_value->assign(value.data(), value.size()); return true; @@ -307,31 +303,31 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } -void testSuccessiveMerge( - Counters& counters, int max_num_merges, int num_merges) { +void testSuccessiveMerge(Counters& counters, size_t max_num_merges, + size_t num_merges) { counters.assert_remove("z"); uint64_t sum = 0; - for (int i = 1; i <= num_merges; ++i) { + for (size_t i = 1; i <= num_merges; ++i) { resetNumMergeOperatorCalls(); counters.assert_add("z", i); sum += i; if (i % (max_num_merges + 1) == 0) { - assert(numMergeOperatorCalls == max_num_merges + 1); + assert(num_merge_operator_calls 
== max_num_merges + 1); } else { - assert(numMergeOperatorCalls == 0); + assert(num_merge_operator_calls == 0); } resetNumMergeOperatorCalls(); assert(counters.assert_get("z") == sum); - assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + assert(num_merge_operator_calls == i % (max_num_merges + 1)); } } -void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, - int count) { +void testPartialMerge(Counters* counters, DB* db, size_t max_merge, + size_t min_merge, size_t count) { FlushOptions o; o.wait = true; @@ -339,7 +335,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, // operands exceeds the threshold. uint64_t tmp_sum = 0; resetNumPartialMergeCalls(); - for (int i = 1; i <= count; i++) { + for (size_t i = 1; i <= count; i++) { counters->assert_add("b", i); tmp_sum += i; } @@ -348,7 +344,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. - ASSERT_EQ(num_partial_merge_calls, 0); + ASSERT_EQ(num_partial_merge_calls, 0U); } else { // if count >= min_merge, then partial merge should be called once. ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1)); @@ -358,20 +354,18 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, resetNumPartialMergeCalls(); tmp_sum = 0; db->Put(rocksdb::WriteOptions(), "c", "10"); - for (int i = 1; i <= count; i++) { + for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } db->Flush(o); db->CompactRange(nullptr, nullptr); ASSERT_EQ(tmp_sum, counters->assert_get("c")); - ASSERT_EQ(num_partial_merge_calls, 0); + ASSERT_EQ(num_partial_merge_calls, 0U); } -void testSingleBatchSuccessiveMerge( - DB* db, - int max_num_merges, - int num_merges) { +void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, + size_t num_merges) { assert(num_merges > max_num_merges); Slice key("BatchSuccessiveMerge"); @@ -380,7 +374,7 @@ void testSingleBatchSuccessiveMerge( // Create the batch WriteBatch batch; - for (int i = 0; i < num_merges; ++i) { + for (size_t i = 0; i < num_merges; ++i) { batch.Merge(key, merge_value_slice); } @@ -390,8 +384,9 @@ void testSingleBatchSuccessiveMerge( Status s = db->Write(WriteOptions(), &batch); assert(s.ok()); } - assert(numMergeOperatorCalls == - num_merges - (num_merges % (max_num_merges + 1))); + ASSERT_EQ( + num_merge_operator_calls, + static_cast(num_merges - (num_merges % (max_num_merges + 1)))); // Get the value resetNumMergeOperatorCalls(); @@ -403,7 +398,8 @@ void testSingleBatchSuccessiveMerge( assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); - ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); + ASSERT_EQ(num_merge_operator_calls, + static_cast((num_merges % (max_num_merges + 1)))); } void runTest(int argc, const string& dbname, const bool use_ttl = false) { diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 81a5d9989..1720b678f 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -158,7 +158,7 @@ class PlainTableDBTest { // Return spread of files per level std::string FilesPerLevel() { std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < db_->NumberLevels(); level++) { int f = NumTableFilesAtLevel(level); char 
buf[100]; diff --git a/db/prefix_test.cc b/db/prefix_test.cc index c896ab8d8..238f84330 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -29,14 +29,14 @@ using GFLAGS::ParseCommandLineFlags; DEFINE_bool(trigger_deadlock, false, "issue delete in range scan to trigger PrefixHashMap deadlock"); -DEFINE_uint64(bucket_count, 100000, "number of buckets"); +DEFINE_int32(bucket_count, 100000, "number of buckets"); DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); DEFINE_int64(write_buffer_size, 33554432, ""); -DEFINE_int64(max_write_buffer_number, 2, ""); -DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(max_write_buffer_number, 2, ""); +DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); DEFINE_int32(skiplist_height, 4, ""); DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index fe6f68ec9..d8e113c66 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -253,11 +253,10 @@ class ConcurrentTest { // Note that generation 0 is never inserted, so it is ok if // <*,0,*> is missing. ASSERT_TRUE((gen(pos) == 0U) || - (gen(pos) > (uint64_t)initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); + (gen(pos) > static_cast(initial_state.Get( + static_cast(key(pos)))))) + << "key: " << key(pos) << "; gen: " << gen(pos) + << "; initgen: " << initial_state.Get(static_cast(key(pos))); // Advance to next key in the valid key space if (key(pos) < key(current)) { diff --git a/db/version_edit.h b/db/version_edit.h index 0a8bbf257..86e315c11 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -160,7 +160,7 @@ class VersionEdit { // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_path_id, + void AddFile(int level, uint64_t file, uint32_t file_path_id, uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { @@ -180,9 +180,7 @@ class VersionEdit { } // Number of edits - int NumEntries() { - return new_files_.size() + deleted_files_.size(); - } + size_t NumEntries() { return new_files_.size() + deleted_files_.size(); } bool IsColumnFamilyManipulation() { return is_column_family_add_ || is_column_family_drop_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index fe663c766..ec123d2c1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -26,11 +26,12 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; + static const uint32_t kBig32Bit = 1ull << 30; VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, 0, + edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i); diff --git a/db/version_set.cc b/db/version_set.cc index b2b63eb33..83b93e36b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -201,8 +201,8 @@ class FilePicker { private: unsigned int num_levels_; unsigned int curr_level_; - int search_left_bound_; - int search_right_bound_; + int32_t search_left_bound_; + int32_t search_right_bound_; #ifndef NDEBUG std::vector* files_; #endif @@ -258,11 +258,13 @@ class FilePicker { start_index = search_left_bound_; } else if (search_left_bound_ < search_right_bound_) { if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { - search_right_bound_ = curr_file_level_->num_files - 1; + search_right_bound_ = + static_cast(curr_file_level_->num_files) - 1; } - start_index = FindFileInRange(*internal_comparator_, - *curr_file_level_, ikey_, - search_left_bound_, search_right_bound_); + start_index = + FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, + static_cast(search_left_bound_), + static_cast(search_right_bound_)); } else { // search_left_bound > search_right_bound, key does not exist in // this level. Since no comparision is done in this level, it will @@ -315,7 +317,8 @@ Version::~Version() { int FindFile(const InternalKeyComparator& icmp, const LevelFilesBrief& file_level, const Slice& key) { - return FindFileInRange(icmp, file_level, key, 0, file_level.num_files); + return FindFileInRange(icmp, file_level, key, 0, + static_cast(file_level.num_files)); } void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, @@ -412,7 +415,7 @@ class LevelFileNumIterator : public Iterator { const LevelFilesBrief* flevel) : icmp_(icmp), flevel_(flevel), - index_(flevel->num_files), + index_(static_cast(flevel->num_files)), current_value_(0, 0, 0) { // Marks as invalid } virtual bool Valid() const { @@ -423,7 +426,9 @@ class LevelFileNumIterator : public Iterator { } virtual void SeekToFirst() { index_ = 0; } virtual void SeekToLast() { - index_ = (flevel_->num_files == 0) ? 0 : flevel_->num_files - 1; + index_ = (flevel_->num_files == 0) + ? 
0 + : static_cast(flevel_->num_files) - 1; } virtual void Next() { assert(Valid()); @@ -432,7 +437,7 @@ class LevelFileNumIterator : public Iterator { virtual void Prev() { assert(Valid()); if (index_ == 0) { - index_ = flevel_->num_files; // Marks as invalid + index_ = static_cast(flevel_->num_files); // Marks as invalid } else { index_--; } @@ -1213,7 +1218,7 @@ void VersionStorageInfo::GetOverlappingInputs( i = 0; } } else if (file_index) { - *file_index = i-1; + *file_index = static_cast(i) - 1; } } } @@ -1229,7 +1234,7 @@ void VersionStorageInfo::GetOverlappingInputsBinarySearch( assert(level > 0); int min = 0; int mid = 0; - int max = files_[level].size() -1; + int max = static_cast(files_[level].size()) - 1; bool foundOverlap = false; const Comparator* user_cmp = user_comparator_; @@ -2646,12 +2651,12 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? - c->input_levels(0)->num_files + c->num_input_levels() - 1: - c->num_input_levels()); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < c->num_input_levels(); which++) { + const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files + + c->num_input_levels() - 1 + : c->num_input_levels()); + Iterator** list = new Iterator* [space]; + size_t num = 0; + for (size_t which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { if (c->level(which) == 0) { const LevelFilesBrief* flevel = c->input_levels(which); @@ -2673,8 +2678,9 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } } assert(num <= space); - Iterator* result = NewMergingIterator( - &c->column_family_data()->internal_comparator(), list, num); + Iterator* result = + NewMergingIterator(&c->column_family_data()->internal_comparator(), list, + static_cast(num)); delete[] list; return result; } @@ -2691,9 +2697,9 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { c->column_family_data()->GetName().c_str()); } - for (int input = 0; input < c->num_input_levels(); ++input) { + for (size_t input = 0; input < c->num_input_levels(); ++input) { int level = c->level(input); - for (int i = 0; i < c->num_input_files(input); ++i) { + for (size_t i = 0; i < c->num_input_files(input); ++i) { uint64_t number = c->input(input, i)->fd.GetNumber(); bool found = false; for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { diff --git a/db/version_set.h b/db/version_set.h index 3c4eff353..f23fcc693 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -194,7 +194,7 @@ class VersionStorageInfo { // REQUIRES: This version has been saved (see VersionSet::SaveTo) int NumLevelFiles(int level) const { assert(finalized_); - return files_[level].size(); + return static_cast(files_[level].size()); } // Return the combined file size of all files at the specified level. diff --git a/db/write_batch.cc b/db/write_batch.cc index 6e15ec5c0..3c773d24a 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -374,7 +374,7 @@ class MemTableInserter : public WriteBatch::Handler { Status s = db_->Get(ropts, cf_handle, key, &prev_value); char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = prev_value.size(); + uint32_t prev_size = static_cast(prev_value.size()); auto status = moptions->inplace_callback(s.ok() ? 
prev_buffer : nullptr, s.ok() ? &prev_size : nullptr, value, &merged_value); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index e002fede1..36aa5a604 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "rocksdb/status.h" @@ -476,8 +477,8 @@ class WritableFile { if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; - Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks); + Allocate(static_cast(block_size * last_preallocated_block_), + static_cast(block_size * num_spanned_blocks)); last_preallocated_block_ = new_last_preallocated_block; } } @@ -580,7 +581,8 @@ enum InfoLogLevel : unsigned char { // An interface for writing log messages. class Logger { public: - enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 }; + size_t kDoNotSupportGetLogFileSize = std::numeric_limits::max(); + explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : log_level_(log_level) {} virtual ~Logger(); @@ -613,9 +615,7 @@ class Logger { Logv(new_format, ap); } } - virtual size_t GetLogFileSize() const { - return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE; - } + virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; } // Flush to the OS buffers virtual void Flush() {} virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; } diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index d17ed8722..c7667a018 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -74,10 +74,9 @@ jbyteArray Java_org_rocksdb_RocksIterator_key0( auto it = reinterpret_cast(handle); rocksdb::Slice key_slice = it->key(); - jbyteArray jkey = env->NewByteArray(key_slice.size()); - env->SetByteArrayRegion( - jkey, 0, key_slice.size(), - reinterpret_cast(key_slice.data())); + jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); + env->SetByteArrayRegion(jkey, 0, static_cast(key_slice.size()), + reinterpret_cast(key_slice.data())); return jkey; } @@ -91,10 +90,10 @@ jbyteArray Java_org_rocksdb_RocksIterator_value0( auto it = reinterpret_cast(handle); rocksdb::Slice value_slice = it->value(); - jbyteArray jkeyValue = env->NewByteArray(value_slice.size()); - env->SetByteArrayRegion( - jkeyValue, 0, value_slice.size(), - reinterpret_cast(value_slice.data())); + jbyteArray jkeyValue = + env->NewByteArray(static_cast(value_slice.size())); + env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), + reinterpret_cast(value_slice.data())); return jkeyValue; } diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index a180dec1b..4fe813d09 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -65,8 +65,8 @@ void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromBackup0(JNIEnv* env, const char* cwal_dir = env->GetStringUTFChars(jwal_dir, 0); auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = - rdb->RestoreDBFromBackup(jbackup_id, cdb_dir, cwal_dir, *opt); + rocksdb::Status s = rdb->RestoreDBFromBackup( + static_cast(jbackup_id), cdb_dir, cwal_dir, *opt); env->ReleaseStringUTFChars(jdb_dir, cdb_dir); env->ReleaseStringUTFChars(jwal_dir, cwal_dir); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 3b00cbe42..44d45a2c2 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -234,9 +234,9 @@ jobject Java_org_rocksdb_RocksDB_listColumnFamilies( for 
(std::vector::size_type i = 0; i < column_family_names.size(); i++) { jbyteArray jcf_value = - env->NewByteArray(column_family_names[i].size()); - env->SetByteArrayRegion(jcf_value, 0, - column_family_names[i].size(), + env->NewByteArray(static_cast(column_family_names[i].size())); + env->SetByteArrayRegion( + jcf_value, 0, static_cast(column_family_names[i].size()), reinterpret_cast(column_family_names[i].c_str())); env->CallBooleanMethod(jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jcf_value); @@ -516,10 +516,9 @@ jbyteArray rocksdb_get_helper( } if (s.ok()) { - jbyteArray jret_value = env->NewByteArray(value.size()); - env->SetByteArrayRegion( - jret_value, 0, value.size(), - reinterpret_cast(value.c_str())); + jbyteArray jret_value = env->NewByteArray(static_cast(value.size())); + env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), + reinterpret_cast(value.c_str())); return jret_value; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); @@ -712,9 +711,10 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, // insert in java list for (std::vector::size_type i = 0; i != s.size(); i++) { if (s[i].ok()) { - jbyteArray jentry_value = env->NewByteArray(values[i].size()); + jbyteArray jentry_value = + env->NewByteArray(static_cast(values[i].size())); env->SetByteArrayRegion( - jentry_value, 0, values[i].size(), + jentry_value, 0, static_cast(values[i].size()), reinterpret_cast(values[i].c_str())); env->CallBooleanMethod( jvalue_list, rocksdb::ListJni::getListAddMethodId(env), @@ -1135,11 +1135,12 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( rocksdb::Status s = db->NewIterators(rocksdb::ReadOptions(), cf_handles, &iterators); if (s.ok()) { - jlongArray jLongArray = env->NewLongArray(iterators.size()); - for (std::vector::size_type i = 0; - i < iterators.size(); i++) { - env->SetLongArrayRegion(jLongArray, i, 1, - reinterpret_cast(&iterators[i])); + jlongArray jLongArray = + env->NewLongArray(static_cast(iterators.size())); + for (std::vector::size_type i = 0; i < iterators.size(); + i++) { + env->SetLongArrayRegion(jLongArray, static_cast(i), 1, + reinterpret_cast(&iterators[i])); } return jLongArray; } diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 0d8b92c9c..64f89b211 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -39,7 +39,7 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( jint Java_org_rocksdb_AbstractSlice_size0( JNIEnv* env, jobject jobj, jlong handle) { const rocksdb::Slice* slice = reinterpret_cast(handle); - return slice->size(); + return static_cast(slice->size()); } /* @@ -154,7 +154,7 @@ void Java_org_rocksdb_Slice_createNewSlice1( jbyteArray Java_org_rocksdb_Slice_data0( JNIEnv* env, jobject jobj, jlong handle) { const rocksdb::Slice* slice = reinterpret_cast(handle); - const int len = slice->size(); + const int len = static_cast(slice->size()); const jbyteArray data = env->NewByteArray(len); env->SetByteArrayRegion(data, 0, len, reinterpret_cast(const_cast(slice->data()))); diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 1abd8c0de..f1d0a89d6 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -392,10 +392,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( } delete mem->Unref(); - jbyteArray jstate = env->NewByteArray(state.size()); - env->SetByteArrayRegion( - jstate, 0, state.size(), - reinterpret_cast(state.c_str())); + jbyteArray jstate = env->NewByteArray(static_cast(state.size())); + 
env->SetByteArrayRegion(jstate, 0, static_cast(state.size()), + reinterpret_cast(state.c_str())); return jstate; } diff --git a/port/port_posix.h b/port/port_posix.h index ceb6d0aa1..476542cfc 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -203,13 +203,13 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, // Compress the input, and put compressed data in output. _stream.next_in = (Bytef *)input; - _stream.avail_in = length; + _stream.avail_in = static_cast(length); // Initialize the output size. - _stream.avail_out = length; - _stream.next_out = (Bytef *)&(*output)[0]; + _stream.avail_out = static_cast(length); + _stream.next_out = (Bytef*)&(*output)[0]; - int old_sz =0, new_sz =0, new_sz_delta =0; + size_t old_sz = 0, new_sz = 0, new_sz_delta = 0; bool done = false; while (!done) { st = deflate(&_stream, Z_FINISH); @@ -221,12 +221,12 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, // No output space. Increase the output space by 20%. // (Should we fail the compression since it expands the size?) old_sz = output->size(); - new_sz_delta = (int)(output->size() * 0.2); + new_sz_delta = static_cast(output->size() * 0.2); new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); output->resize(new_sz); // Set more output. _stream.next_out = (Bytef *)&(*output)[old_sz]; - _stream.avail_out = new_sz - old_sz; + _stream.avail_out = static_cast(new_sz - old_sz); break; case Z_BUF_ERROR: default: @@ -258,18 +258,18 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, } _stream.next_in = (Bytef *)input_data; - _stream.avail_in = input_length; + _stream.avail_in = static_cast(input_length); // Assume the decompressed data size will 5x of compressed size. - int output_len = input_length * 5; + size_t output_len = input_length * 5; char* output = new char[output_len]; - int old_sz = output_len; + size_t old_sz = output_len; _stream.next_out = (Bytef *)output; - _stream.avail_out = output_len; + _stream.avail_out = static_cast(output_len); char* tmp = nullptr; - int output_len_delta; + size_t output_len_delta; bool done = false; //while(_stream.next_in != nullptr && _stream.avail_in != 0) { @@ -282,7 +282,7 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, case Z_OK: // No output space. Increase the output space by 20%. old_sz = output_len; - output_len_delta = (int)(output_len * 0.2); + output_len_delta = static_cast(output_len * 0.2); output_len += output_len_delta < 10 ? 10 : output_len_delta; tmp = new char[output_len]; memcpy(tmp, output, old_sz); @@ -291,7 +291,7 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, // Set more output. _stream.next_out = (Bytef *)(output + old_sz); - _stream.avail_out = output_len - old_sz; + _stream.avail_out = static_cast(output_len - old_sz); break; case Z_BUF_ERROR: default: @@ -301,7 +301,7 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, } } - *decompress_size = output_len - _stream.avail_out; + *decompress_size = static_cast(output_len - _stream.avail_out); inflateEnd(&_stream); return output; #endif @@ -329,14 +329,14 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, // Compress the input, and put compressed data in output. _stream.next_in = (char *)input; - _stream.avail_in = length; + _stream.avail_in = static_cast(length); // Initialize the output size. 
_stream.next_out = (char *)&(*output)[0]; - _stream.avail_out = length; + _stream.avail_out = static_cast(length); - int old_sz =0, new_sz =0; - while(_stream.next_in != nullptr && _stream.avail_in != 0) { + size_t old_sz = 0, new_sz = 0; + while (_stream.next_in != nullptr && _stream.avail_in != 0) { st = BZ2_bzCompress(&_stream, BZ_FINISH); switch (st) { case BZ_STREAM_END: @@ -345,11 +345,11 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, // No output space. Increase the output space by 20%. // (Should we fail the compression since it expands the size?) old_sz = output->size(); - new_sz = (int)(output->size() * 1.2); + new_sz = static_cast(output->size() * 1.2); output->resize(new_sz); // Set more output. _stream.next_out = (char *)&(*output)[old_sz]; - _stream.avail_out = new_sz - old_sz; + _stream.avail_out = static_cast(new_sz - old_sz); break; case BZ_SEQUENCE_ERROR: default: @@ -377,15 +377,15 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, } _stream.next_in = (char *)input_data; - _stream.avail_in = input_length; + _stream.avail_in = static_cast(input_length); // Assume the decompressed data size will be 5x of compressed size. - int output_len = input_length * 5; + size_t output_len = input_length * 5; char* output = new char[output_len]; - int old_sz = output_len; + size_t old_sz = output_len; _stream.next_out = (char *)output; - _stream.avail_out = output_len; + _stream.avail_out = static_cast(output_len); char* tmp = nullptr; @@ -397,7 +397,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, case BZ_OK: // No output space. Increase the output space by 20%. old_sz = output_len; - output_len = (int)(output_len * 1.2); + output_len = static_cast(output_len * 1.2); tmp = new char[output_len]; memcpy(tmp, output, old_sz); delete[] output; @@ -405,7 +405,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, // Set more output. 
_stream.next_out = (char *)(output + old_sz); - _stream.avail_out = output_len - old_sz; + _stream.avail_out = static_cast(output_len - old_sz); break; default: delete[] output; @@ -414,7 +414,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, } } - *decompress_size = output_len - _stream.avail_out; + *decompress_size = static_cast(output_len - _stream.avail_out); BZ2_bzDecompressEnd(&_stream); return output; #endif @@ -424,16 +424,16 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, size_t length, ::std::string* output) { #ifdef LZ4 - int compressBound = LZ4_compressBound(length); - output->resize(8 + compressBound); - char *p = const_cast(output->c_str()); + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(8 + compressBound)); + char* p = const_cast(output->c_str()); memcpy(p, &length, sizeof(length)); - size_t outlen; - outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound); + int outlen = LZ4_compress_limitedOutput( + input, p + 8, static_cast(length), compressBound); if (outlen == 0) { return false; } - output->resize(8 + outlen); + output->resize(static_cast(8 + outlen)); return true; #endif return false; @@ -449,7 +449,8 @@ inline char* LZ4_Uncompress(const char* input_data, size_t input_length, memcpy(&output_len, input_data, sizeof(output_len)); char *output = new char[output_len]; *decompress_size = LZ4_decompress_safe_partial( - input_data + 8, output, input_length - 8, output_len, output_len); + input_data + 8, output, static_cast(input_length - 8), output_len, + output_len); if (*decompress_size < 0) { delete[] output; return nullptr; @@ -462,21 +463,22 @@ inline char* LZ4_Uncompress(const char* input_data, size_t input_length, inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, size_t length, ::std::string* output) { #ifdef LZ4 - int compressBound = LZ4_compressBound(length); - output->resize(8 + compressBound); - char *p = const_cast(output->c_str()); + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(8 + compressBound)); + char* p = const_cast(output->c_str()); memcpy(p, &length, sizeof(length)); - size_t outlen; + int outlen; #ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 - outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound, - opts.level); + outlen = LZ4_compressHC2_limitedOutput(input, p + 8, static_cast(length), + compressBound, opts.level); #else - outlen = LZ4_compressHC_limitedOutput(input, p + 8, length, compressBound); + outlen = LZ4_compressHC_limitedOutput(input, p + 8, static_cast(length), + compressBound); #endif if (outlen == 0) { return false; } - output->resize(8 + outlen); + output->resize(static_cast(8 + outlen)); return true; #endif return false; diff --git a/table/block.cc b/table/block.cc index 592d175b1..6a5ede600 100644 --- a/table/block.cc +++ b/table/block.cc @@ -304,7 +304,8 @@ Block::Block(BlockContents&& contents) if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { - restart_offset_ = size_ - (1 + NumRestarts()) * sizeof(uint32_t); + restart_offset_ = + static_cast(size_) - (1 + NumRestarts()) * sizeof(uint32_t); if (restart_offset_ > size_ - sizeof(uint32_t)) { // The size is too small for NumRestarts() and therefore // restart_offset_ wrapped around. 
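In the port_posix.h hunks above, the zlib and bzip2 stream fields avail_in and avail_out are 32-bit, so the byte counts kept in size_t variables are narrowed explicitly at the library boundary. A minimal sketch of that shape, assuming the whole output fits in a single pre-sized buffer (the real code instead grows the buffer by 20% inside a loop):

    #include <zlib.h>
    #include <cstring>
    #include <string>

    // Compress `length` bytes from `input` into `*output` in one pass.
    bool DeflateOnce(const char* input, size_t length, std::string* output) {
      z_stream stream;
      memset(&stream, 0, sizeof(stream));
      if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
        return false;
      }
      output->resize(deflateBound(&stream, static_cast<uLong>(length)));
      stream.next_in = (Bytef*)input;
      stream.avail_in = static_cast<uInt>(length);           // uInt is 32-bit
      stream.next_out = (Bytef*)&(*output)[0];
      stream.avail_out = static_cast<uInt>(output->size());  // narrow explicitly
      int st = deflate(&stream, Z_FINISH);
      bool ok = (st == Z_STREAM_END);
      if (ok) {
        output->resize(output->size() - stream.avail_out);
      }
      deflateEnd(&stream);
      return ok;
    }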
diff --git a/table/block.h b/table/block.h index 68b16ea1f..0187489bb 100644 --- a/table/block.h +++ b/table/block.h @@ -159,7 +159,8 @@ class BlockIter : public Iterator { // Return the offset in data_ just past the end of the current entry. inline uint32_t NextEntryOffset() const { - return (value_.data() + value_.size()) - data_; + // NOTE: We don't support files bigger than 2GB + return static_cast((value_.data() + value_.size()) - data_); } uint32_t GetRestartPoint(uint32_t index) { diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index fea37b67f..647fc020c 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -99,7 +99,7 @@ Slice BlockBasedFilterBlockBuilder::Finish() { } // Append array of per-filter offsets - const uint32_t array_offset = result_.size(); + const uint32_t array_offset = static_cast(result_.size()); for (size_t i = 0; i < filter_offsets_.size(); i++) { PutFixed32(&result_, filter_offsets_[i]); } @@ -113,7 +113,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { const size_t num_entries = start_.size(); if (num_entries == 0) { // Fast path if there are no keys for this filter - filter_offsets_.push_back(result_.size()); + filter_offsets_.push_back(static_cast(result_.size())); return; } @@ -127,8 +127,9 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { } // Generate filter for current set of keys and append to result_. - filter_offsets_.push_back(result_.size()); - policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_); + filter_offsets_.push_back(static_cast(result_.size())); + policy_->CreateFilter(&tmp_entries_[0], static_cast(num_entries), + &result_); tmp_entries_.clear(); entries_.clear(); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index f158ca8c4..6a48a975a 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -203,7 +203,7 @@ class HashIndexBuilder : public IndexBuilder { // copy. pending_entry_prefix_ = key_prefix.ToString(); pending_block_num_ = 1; - pending_entry_index_ = current_restart_index_; + pending_entry_index_ = static_cast(current_restart_index_); } else { // entry number increments when keys share the prefix reside in // differnt data blocks. 
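
The next hunk casts the prefix length to uint32_t before handing it to PutVarint32, which only accepts 32-bit values. For each prefix, the builder appends the raw prefix bytes to one buffer and varint-encodes three fields into the metadata buffer: the prefix length, the restart index it maps to, and a count of how many data blocks share it. A small self-contained sketch of that record layout follows; it uses a local varint encoder with the same wire format rather than RocksDB's util/coding helpers, and the prefix string and counter values are made-up examples.

#include <cstdint>
#include <string>

// Little-endian base-128 varint, the same wire format as the PutVarint32
// helper used by FlushPendingPrefix; local reimplementation for the sketch.
static void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

int main() {
  std::string prefix_block, prefix_meta_block;
  const std::string pending_entry_prefix = "user_";  // hypothetical prefix
  uint32_t pending_entry_index = 3;  // restart interval the prefix points at
  uint32_t pending_block_num = 1;    // data blocks sharing this prefix

  // Raw prefix bytes go into one buffer; its length and the two counters
  // are varint-encoded into the metadata buffer, as FlushPendingPrefix does.
  prefix_block.append(pending_entry_prefix.data(), pending_entry_prefix.size());
  PutVarint32(&prefix_meta_block,
              static_cast<uint32_t>(pending_entry_prefix.size()));
  PutVarint32(&prefix_meta_block, pending_entry_index);
  PutVarint32(&prefix_meta_block, pending_block_num);
  return 0;
}

Varint32 keeps small lengths and counts to a single byte each, which is why the cast has to target uint32_t rather than leaving the value as size_t.
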
@@ -234,7 +234,8 @@ class HashIndexBuilder : public IndexBuilder { void FlushPendingPrefix() { prefix_block_.append(pending_entry_prefix_.data(), pending_entry_prefix_.size()); - PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size()); + PutVarint32(&prefix_meta_block_, + static_cast(pending_entry_prefix_.size())); PutVarint32(&prefix_meta_block_, pending_entry_index_); PutVarint32(&prefix_meta_block_, pending_block_num_); } @@ -596,7 +597,8 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, } case kxxHash: { void* xxh = XXH32_init(0); - XXH32_update(xxh, block_contents.data(), block_contents.size()); + XXH32_update(xxh, block_contents.data(), + static_cast(block_contents.size())); XXH32_update(xxh, trailer, 1); // Extend to cover block type EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); break; diff --git a/table/block_builder.cc b/table/block_builder.cc index f8627743a..1eee96d46 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -85,7 +85,7 @@ Slice BlockBuilder::Finish() { for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } - PutFixed32(&buffer_, restarts_.size()); + PutFixed32(&buffer_, static_cast(restarts_.size())); finished_ = true; return Slice(buffer_); } @@ -103,15 +103,15 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { } } else { // Restart compression - restarts_.push_back(buffer_.size()); + restarts_.push_back(static_cast(buffer_.size())); counter_ = 0; } const size_t non_shared = key.size() - shared; // Add "" to buffer_ - PutVarint32(&buffer_, shared); - PutVarint32(&buffer_, non_shared); - PutVarint32(&buffer_, value.size()); + PutVarint32(&buffer_, static_cast(shared)); + PutVarint32(&buffer_, static_cast(non_shared)); + PutVarint32(&buffer_, static_cast(value.size())); // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc index 7a6e219a0..a8c965864 100644 --- a/table/block_hash_index.cc +++ b/table/block_hash_index.cc @@ -59,7 +59,7 @@ BlockHashIndex* CreateBlockHashIndexOnTheFly( auto hash_index = new BlockHashIndex( hash_key_extractor, true /* hash_index will copy prefix when Add() is called */); - uint64_t current_restart_index = 0; + uint32_t current_restart_index = 0; std::string pending_entry_prefix; // pending_block_num == 0 also implies there is no entry inserted at all. diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc index 6f7bcb2b7..8a6d1b093 100644 --- a/table/block_hash_index_test.cc +++ b/table/block_hash_index_test.cc @@ -82,8 +82,8 @@ TEST(BlockTest, BasicTest) { auto prefix_extractor = NewFixedPrefixTransform(prefix_size); std::unique_ptr block_hash_index(CreateBlockHashIndexOnTheFly( - &index_iter, &data_iter, index_entries.size(), BytewiseComparator(), - prefix_extractor)); + &index_iter, &data_iter, static_cast(index_entries.size()), + BytewiseComparator(), prefix_extractor)); std::map expected = { {"01xx", BlockHashIndex::RestartIndex(0, 1)}, diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index c1c9d520e..147bcf56e 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -87,7 +87,7 @@ class BlockPrefixIndex::Builder { BlockPrefixIndex* Finish() { // For now, use roughly 1:1 prefix to bucket ratio. 
- uint32_t num_buckets = prefixes_.size() + 1; + uint32_t num_buckets = static_cast(prefixes_.size()) + 1; // Collect prefix records that hash to the same bucket, into a single // linklist. diff --git a/table/block_test.cc b/table/block_test.cc index 6b82c4d93..fa263bcbd 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -163,7 +163,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, auto iter1 = reader1.NewIterator(nullptr); auto iter2 = reader1.NewIterator(nullptr); reader1.SetBlockHashIndex(CreateBlockHashIndexOnTheFly( - iter1, iter2, keys.size(), BytewiseComparator(), + iter1, iter2, static_cast(keys.size()), BytewiseComparator(), prefix_extractor.get())); delete iter1; diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index a11945cf7..947c465e9 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -182,7 +182,7 @@ Slice CuckooTableBuilder::GetValue(uint64_t idx) const { Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); - uint64_t make_space_for_key_call_id = 0; + uint32_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id; bool bucket_found = false; @@ -254,7 +254,7 @@ Status CuckooTableBuilder::Finish() { } // Determine unused_user_key to fill empty buckets. std::string unused_user_key = smallest_user_key_; - int curr_pos = unused_user_key.size() - 1; + int curr_pos = static_cast(unused_user_key.size()) - 1; while (curr_pos >= 0) { --unused_user_key[curr_pos]; if (Slice(unused_user_key).compare(smallest_user_key_) < 0) { @@ -265,7 +265,7 @@ Status CuckooTableBuilder::Finish() { if (curr_pos < 0) { // Try using the largest key to identify an unused key. unused_user_key = largest_user_key_; - curr_pos = unused_user_key.size() - 1; + curr_pos = static_cast(unused_user_key.size()) - 1; while (curr_pos >= 0) { ++unused_user_key[curr_pos]; if (Slice(unused_user_key).compare(largest_user_key_) > 0) { @@ -429,9 +429,8 @@ uint64_t CuckooTableBuilder::FileSize() const { // If tree depth exceedes max depth, we return false indicating failure. bool CuckooTableBuilder::MakeSpaceForKey( const autovector& hash_vals, - const uint64_t make_space_for_key_call_id, - std::vector* buckets, - uint64_t* bucket_id) { + const uint32_t make_space_for_key_call_id, + std::vector* buckets, uint64_t* bucket_id) { struct CuckooNode { uint64_t bucket_id; uint32_t depth; @@ -495,7 +494,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( // child with the parent. Stop when first level is reached in the tree // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return // this location in first level for target key to be inserted. 
- uint32_t bucket_to_replace_pos = tree.size()-1; + uint32_t bucket_to_replace_pos = static_cast(tree.size()) - 1; while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; (*buckets)[curr_node.bucket_id] = diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 6898c1ef6..26c94e1bc 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -68,11 +68,9 @@ class CuckooTableBuilder: public TableBuilder { }; static const uint32_t kMaxVectorIdx = std::numeric_limits::max(); - bool MakeSpaceForKey( - const autovector& hash_vals, - const uint64_t call_id, - std::vector* buckets, - uint64_t* bucket_id); + bool MakeSpaceForKey(const autovector& hash_vals, + const uint32_t call_id, + std::vector* buckets, uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); inline bool IsDeletedKey(uint64_t idx) const; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index d3b3a713e..ecd23aff5 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -87,13 +87,14 @@ class CuckooBuilderTest { // Check contents of the bucket. std::vector keys_found(keys.size(), false); - uint32_t bucket_size = expected_unused_bucket.size(); + size_t bucket_size = expected_unused_bucket.size(); for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, &read_slice, nullptr)); - uint32_t key_idx = std::find(expected_locations.begin(), - expected_locations.end(), i) - expected_locations.begin(); + size_t key_idx = + std::find(expected_locations.begin(), expected_locations.end(), i) - + expected_locations.begin(); if (key_idx == keys.size()) { // i is not one of the expected locaitons. Empty bucket. 
ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); @@ -156,7 +157,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; @@ -169,7 +170,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); + size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -196,7 +197,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; @@ -209,7 +210,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); + size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -236,7 +237,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; @@ -251,7 +252,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); + size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -283,7 +284,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; @@ -296,7 +297,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); + size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -325,7 +326,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); 
unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; @@ -338,7 +339,7 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = keys[0].size() + values[0].size(); + size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -361,7 +362,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; @@ -374,7 +375,7 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); + size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -397,7 +398,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; @@ -410,7 +411,7 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); + size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); @@ -435,7 +436,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; @@ -448,7 +449,7 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } - uint32_t bucket_size = user_keys[0].size() + values[0].size(); + size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 714fdc2a0..625fd9995 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -28,7 +28,7 @@ static inline uint64_t CuckooHash( if (hash_cnt == 0 && identity_as_first_hash) { value = (*reinterpret_cast(user_key.data())); } else { - value = MurmurHash(user_key.data(), user_key.size(), + value = MurmurHash(user_key.data(), static_cast(user_key.size()), kCuckooMurmurSeedMultiplier * hash_cnt); } if (use_module_hash) { diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index c0ca38bb7..7f017ec7c 100644 --- 
a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -64,7 +64,7 @@ CuckooTableReader::CuckooTableReader( } unused_key_ = unused_key->second; - key_length_ = props->fixed_key_len; + key_length_ = static_cast(props->fixed_key_len); auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); if (user_key_len == user_props.end()) { status_ = Status::Corruption("User key length not found"); @@ -274,7 +274,7 @@ void CuckooTableIterator::SeekToFirst() { void CuckooTableIterator::SeekToLast() { InitIfNeeded(); - curr_key_idx_ = sorted_bucket_ids_.size() - 1; + curr_key_idx_ = static_cast(sorted_bucket_ids_.size()) - 1; PrepareKVAtCurrIdx(); } @@ -288,7 +288,8 @@ void CuckooTableIterator::Seek(const Slice& target) { sorted_bucket_ids_.end(), kInvalidIndex, seek_comparator); - curr_key_idx_ = std::distance(sorted_bucket_ids_.begin(), seek_it); + curr_key_idx_ = + static_cast(std::distance(sorted_bucket_ids_.begin(), seek_it)); PrepareKVAtCurrIdx(); } @@ -327,7 +328,7 @@ void CuckooTableIterator::Next() { void CuckooTableIterator::Prev() { if (curr_key_idx_ == 0) { - curr_key_idx_ = sorted_bucket_ids_.size(); + curr_key_idx_ = static_cast(sorted_bucket_ids_.size()); } if (!Valid()) { curr_value_.clear(); diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index d1c52722a..4245b749e 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -161,7 +161,7 @@ class CuckooReaderTest { ASSERT_EQ(static_cast(cnt), num_items); it->SeekToLast(); - cnt = num_items - 1; + cnt = static_cast(num_items) - 1; ASSERT_TRUE(it->Valid()); while (it->Valid()) { ASSERT_OK(it->status()); @@ -172,7 +172,7 @@ class CuckooReaderTest { } ASSERT_EQ(cnt, -1); - cnt = num_items / 2; + cnt = static_cast(num_items) / 2; it->Seek(keys[cnt]); while (it->Valid()) { ASSERT_OK(it->status()); diff --git a/table/format.cc b/table/format.cc index d64bb3eac..e2cad7bf6 100644 --- a/table/format.cc +++ b/table/format.cc @@ -240,7 +240,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, actual = crc32c::Value(data, n + 1); break; case kxxHash: - actual = XXH32(data, n + 1, 0); + actual = XXH32(data, static_cast(n) + 1, 0); break; default: s = Status::Corruption("unknown checksum type"); diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 7bf61f238..7adb5f08d 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -25,7 +25,7 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { // Generate the filter using the keys that are added virtual Slice Finish(std::unique_ptr* buf) override { - uint32_t len = hash_entries_.size() * 4; + uint32_t len = static_cast(hash_entries_.size()) * 4; char* data = new char[len]; for (size_t i = 0; i < hash_entries_.size(); i++) { EncodeFixed32(data + i * 4, hash_entries_[i]); @@ -42,7 +42,7 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { class TestFilterBitsReader : public FilterBitsReader { public: explicit TestFilterBitsReader(const Slice& contents) - : data_(contents.data()), len_(contents.size()) {} + : data_(contents.data()), len_(static_cast(contents.size())) {} virtual bool MayMatch(const Slice& entry) override { uint32_t h = Hash(entry.data(), entry.size(), 1); diff --git a/table/merger_test.cc b/table/merger_test.cc index 3a10527f4..56ea361be 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -49,9 +49,9 @@ class MergerTest { MergerTest() : rnd_(3), merging_iterator_(nullptr), 
single_iterator_(nullptr) {} ~MergerTest() = default; - std::vector GenerateStrings(int len, int string_len) { + std::vector GenerateStrings(size_t len, int string_len) { std::vector ret; - for (int i = 0; i < len; ++i) { + for (size_t i = 0; i < len; ++i) { ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); } return ret; @@ -119,7 +119,7 @@ class MergerTest { } void Generate(size_t num_iterators, size_t strings_per_iterator, - size_t letters_per_string) { + int letters_per_string) { std::vector small_iterators; for (size_t i = 0; i < num_iterators; ++i) { auto strings = GenerateStrings(strings_per_iterator, letters_per_string); @@ -127,8 +127,9 @@ class MergerTest { all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); } - merging_iterator_.reset(NewMergingIterator( - BytewiseComparator(), &small_iterators[0], small_iterators.size())); + merging_iterator_.reset( + NewMergingIterator(BytewiseComparator(), &small_iterators[0], + static_cast(small_iterators.size()))); single_iterator_.reset(new VectorIterator(all_keys_)); } diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index b5914554b..74a71cb35 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -6,8 +6,10 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_builder.h" -#include #include + +#include +#include #include #include "rocksdb/comparator.h" @@ -133,7 +135,8 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { } // Write value - auto prev_offset = offset_; + assert(offset_ <= std::numeric_limits::max()); + auto prev_offset = static_cast(offset_); // Write out the key encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, &meta_bytes_buf_size); @@ -142,7 +145,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { } // Write value length - int value_size = value.size(); + uint32_t value_size = static_cast(value.size()); char* end_ptr = EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); @@ -180,10 +183,11 @@ Status PlainTableBuilder::Finish() { MetaIndexBuilder meta_index_builer; if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits::max()); bloom_block_.SetTotalBits( - &arena_, properties_.num_entries * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, - ioptions_.info_log); + &arena_, + static_cast(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index c3af08072..8fc4f1fe4 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -81,7 +81,7 @@ class PlainTableBuilder: public TableBuilder { WritableFile* file_; uint64_t offset_ = 0; uint32_t bloom_bits_per_key_; - uint32_t huge_page_tlb_size_; + size_t huge_page_tlb_size_; Status status_; TableProperties properties_; PlainTableKeyEncoder encoder_; diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index b5e3981c1..8b2c994b8 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -3,6 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + #include "table/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" @@ -24,7 +30,8 @@ Status PlainTableIndex::InitFromRawData(Slice data) { if (!GetVarint32(&data, &num_prefixes_)) { return Status::Corruption("Couldn't read the index size!"); } - sub_index_size_ = data.size() - index_size_ * kOffsetLen; + sub_index_size_ = + static_cast(data.size()) - index_size_ * kOffsetLen; char* index_data_begin = const_cast(data.data()); index_ = reinterpret_cast(index_data_begin); @@ -55,13 +62,15 @@ void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash, num_records_in_current_group_ = 0; } auto& new_record = current_group_[num_records_in_current_group_++]; - new_record.hash = hash; + // TODO(sdong) -- check if this is OK -- murmur_t is uint64_t, while we only + // use 32 bits here + new_record.hash = static_cast(hash); new_record.offset = offset; new_record.next = nullptr; } void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, - uint64_t key_offset) { + uint32_t key_offset) { if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { ++num_prefixes_; if (!is_first_record_) { @@ -149,7 +158,7 @@ Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, - "Reserving %zu bytes for plain table's sub_index", + "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( @@ -160,7 +169,7 @@ Slice PlainTableIndexBuilder::FillIndexes( reinterpret_cast(EncodeVarint32(temp_ptr, num_prefixes_)); char* sub_index = reinterpret_cast(index + index_size_); - size_t sub_index_offset = 0; + uint32_t sub_index_offset = 0; for (uint32_t i = 0; i < index_size_; i++) { uint32_t num_keys_for_bucket = entries_per_bucket[i]; switch (num_keys_for_bucket) { diff --git a/table/plain_table_index.h b/table/plain_table_index.h index 0b26ecd0d..fa6d1a41e 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -92,7 +92,7 @@ class PlainTableIndex { private: uint32_t index_size_; - size_t sub_index_size_; + uint32_t sub_index_size_; uint32_t num_prefixes_; uint32_t* index_; @@ -109,8 +109,8 @@ class PlainTableIndex { class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, - uint32_t index_sparseness, double hash_table_ratio, - double huge_page_tlb_size) + size_t index_sparseness, double hash_table_ratio, + size_t huge_page_tlb_size) : arena_(arena), ioptions_(ioptions), record_list_(kRecordsPerGroup), @@ -124,7 +124,7 @@ class PlainTableIndexBuilder { hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} - void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset); + void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); Slice Finish(); @@ -205,13 +205,13 @@ class PlainTableIndexBuilder { uint32_t num_keys_per_prefix_; uint32_t prev_key_prefix_hash_; - uint32_t index_sparseness_; + size_t index_sparseness_; uint32_t index_size_; - size_t sub_index_size_; + uint32_t sub_index_size_; const SliceTransform* prefix_extractor_; double hash_table_ratio_; - double huge_page_tlb_size_; + size_t huge_page_tlb_size_; std::string prev_key_prefix_; diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index c553752e1..08d16f191 100644 --- 
a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -43,7 +43,7 @@ size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { // Return position after the size byte(s). nullptr means error const char* DecodeSize(const char* offset, const char* limit, - EntryType* entry_type, size_t* key_size) { + EntryType* entry_type, uint32_t* key_size) { assert(offset < limit); *entry_type = static_cast( (static_cast(offset[0]) & ~kSizeInlineLimit) >> 6); @@ -73,10 +73,10 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, Slice key_to_write = key; // Portion of internal key to write out. - size_t user_key_size = fixed_user_key_len_; + uint32_t user_key_size = fixed_user_key_len_; if (encoding_type_ == kPlain) { if (fixed_user_key_len_ == kPlainTableVariableLength) { - user_key_size = key.size() - 8; + user_key_size = static_cast(key.size() - 8); // Write key length char key_size_buf[5]; // tmp buffer for key size as varint32 char* ptr = EncodeVarint32(key_size_buf, user_key_size); @@ -93,7 +93,7 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, char size_bytes[12]; size_t size_bytes_pos = 0; - user_key_size = key.size() - 8; + user_key_size = static_cast(key.size() - 8); Slice prefix = prefix_extractor_->Transform(Slice(key.data(), user_key_size)); @@ -112,10 +112,11 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, if (key_count_for_prefix_ == 2) { // For second key within a prefix, need to encode prefix length size_bytes_pos += - EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), + EncodeSize(kPrefixFromPreviousKey, + static_cast(pre_prefix_.GetKey().size()), size_bytes + size_bytes_pos); } - size_t prefix_len = pre_prefix_.GetKey().size(); + uint32_t prefix_len = static_cast(pre_prefix_.GetKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -184,7 +185,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey( const char* start, const char* limit, ParsedInternalKey* parsed_key, Slice* internal_key, size_t* bytes_read, bool* seekable) { const char* key_ptr = start; - size_t user_key_size = 0; + uint32_t user_key_size = 0; if (fixed_user_key_len_ != kPlainTableVariableLength) { user_key_size = fixed_user_key_len_; key_ptr = start; @@ -195,7 +196,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey( return Status::Corruption( "Unexpected EOF when reading the next key's size"); } - user_key_size = static_cast(tmp_size); + user_key_size = tmp_size; *bytes_read = key_ptr - start; } // dummy initial value to avoid compiler complain @@ -227,7 +228,7 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey( bool expect_suffix = false; do { - size_t size = 0; + uint32_t size = 0; // dummy initial value to avoid compiler complain bool decoded_internal_key_valid = true; const char* pos = DecodeSize(key_ptr, limit, &entry_type, &size); diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 16120d32b..48b709e80 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -98,8 +98,8 @@ PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, : internal_comparator_(icomparator), encoding_type_(encoding_type), full_scan_mode_(false), - data_end_offset_(table_properties->data_size), - user_key_len_(table_properties->fixed_key_len), + data_end_offset_(static_cast(table_properties->data_size)), + 
user_key_len_(static_cast(table_properties->fixed_key_len)), prefix_extractor_(ioptions.prefix_extractor), enable_bloom_(false), bloom_(6, nullptr), @@ -327,7 +327,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, // Allocate bloom filter here for total order mode. if (IsTotalOrderMode()) { uint32_t num_bloom_bits = - table_properties_->num_entries * bloom_bits_per_key; + static_cast(table_properties_->num_entries) * + bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, @@ -350,7 +351,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_.SetRawData( const_cast( reinterpret_cast(bloom_block->data())), - bloom_block->size() * 8, num_blocks); + static_cast(bloom_block->size()) * 8, num_blocks); } PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, @@ -509,7 +510,7 @@ Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, return Status::Corruption( "Unexpected EOF when reading the next value's size."); } - *offset = *offset + (value_ptr - start) + value_size; + *offset = *offset + static_cast(value_ptr - start) + value_size; if (*offset > data_end_offset_) { return Status::Corruption("Unexpected EOF when reading the next value. "); } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 531ac8e8b..9d0df974e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -123,7 +123,7 @@ class PlainTableReader: public TableReader { // sst file that stores data. const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; - const size_t user_key_len_; + const uint32_t user_key_len_; const SliceTransform* prefix_extractor_; static const size_t kNumInternalBytes = 8; @@ -135,7 +135,7 @@ class PlainTableReader: public TableReader { const ImmutableCFOptions& ioptions_; unique_ptr file_; - uint32_t file_size_; + uint64_t file_size_; std::shared_ptr table_properties_; bool IsFixedLength() const { diff --git a/table/table_test.cc b/table/table_test.cc index a5685f7f6..facf0926e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -947,7 +947,7 @@ class Harness { if (keys.empty()) { return "foo"; } else { - const int index = rnd->Uniform(keys.size()); + const int index = rnd->Uniform(static_cast(keys.size())); std::string result = keys[index]; switch (rnd->Uniform(support_prev_ ? 
3 : 1)) { case 0: diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc index 7b820a178..99ca66a37 100644 --- a/tools/blob_store_bench.cc +++ b/tools/blob_store_bench.cc @@ -31,7 +31,7 @@ BlobStore* bs; namespace { std::string RandomString(Random* rnd, uint64_t len) { std::string r; - test::RandomString(rnd, len, &r); + test::RandomString(rnd, static_cast(len), &r); return r; } } // namespace diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 33c72cd48..8109c141e 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -113,7 +113,8 @@ DEFINE_bool(verbose, false, "Verbose"); DEFINE_bool(progress_reports, true, "If true, db_stress will report number of finished operations"); -DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, +DEFINE_int32(write_buffer_size, + static_cast(rocksdb::Options().write_buffer_size), "Number of bytes to buffer in memtable before compacting"); DEFINE_int32(max_write_buffer_number, @@ -154,7 +155,8 @@ DEFINE_int32(level0_stop_writes_trigger, rocksdb::Options().level0_stop_writes_trigger, "Number of files in level-0 that will trigger put stop."); -DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, +DEFINE_int32(block_size, + static_cast(rocksdb::BlockBasedTableOptions().block_size), "Number of bytes in a block."); DEFINE_int32(max_background_compactions, @@ -573,9 +575,9 @@ class SharedState { explicit SharedState(StressTest* stress_test) : cv_(&mu_), - seed_(FLAGS_seed), + seed_(static_cast(FLAGS_seed)), max_key_(FLAGS_max_key), - log2_keys_per_lock_(FLAGS_log2_keys_per_lock), + log2_keys_per_lock_(static_cast(FLAGS_log2_keys_per_lock)), num_threads_(FLAGS_threads), num_initialized_(0), num_populated_(0), @@ -1451,7 +1453,7 @@ class StressTest { assert(count <= (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); if (iter->status().ok()) { - thread->stats.AddPrefixes(1, count); + thread->stats.AddPrefixes(1, static_cast(count)); } else { thread->stats.AddErrors(1); } @@ -1489,7 +1491,8 @@ class StressTest { } else { MultiPut(thread, write_opts, column_family, key, v, sz); } - PrintKeyValue(rand_column_family, rand_key, value, sz); + PrintKeyValue(rand_column_family, static_cast(rand_key), + value, sz); } else if (writeBound <= prob_op && prob_op < delBound) { // OPERATION delete if (!FLAGS_test_batches_snapshots) { @@ -1553,16 +1556,19 @@ class StressTest { from_db = iter->value().ToString(); iter->Next(); } else if (iter->key().compare(k) < 0) { - VerificationAbort(shared, "An out of range key was found", cf, i); + VerificationAbort(shared, "An out of range key was found", + static_cast(cf), i); } } else { // The iterator found no value for the key in question, so do not // move to the next item in the iterator s = Status::NotFound(Slice()); } - VerifyValue(cf, i, options, shared, from_db, s, true); + VerifyValue(static_cast(cf), i, options, shared, from_db, s, + true); if (from_db.length()) { - PrintKeyValue(cf, i, from_db.data(), from_db.length()); + PrintKeyValue(static_cast(cf), static_cast(i), + from_db.data(), from_db.length()); } } } else { @@ -1575,9 +1581,11 @@ class StressTest { std::string keystr = Key(i); Slice k = keystr; Status s = db_->Get(options, column_families_[cf], k, &from_db); - VerifyValue(cf, i, options, shared, from_db, s, true); + VerifyValue(static_cast(cf), i, options, shared, from_db, s, + true); if (from_db.length()) { - PrintKeyValue(cf, i, from_db.data(), from_db.length()); + PrintKeyValue(static_cast(cf), static_cast(i), + from_db.data(), from_db.length()); } } } diff 
--git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index 4812d1c4d..3385986f9 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -18,8 +18,7 @@ Status AutoRollLogger::ResetLogger() { return status_; } - if (logger_->GetLogFileSize() == - (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) { + if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) { status_ = Status::NotSupported( "The underlying logger doesn't support GetLogFileSize()"); } diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 366ca084e..7a2bb6aa7 100755 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -122,7 +122,7 @@ uint64_t AutoRollLoggerTest::RollLogFileByTimeTest( } // -- Make the log file expire - sleep(time); + sleep(static_cast(time)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. diff --git a/util/benchharness.cc b/util/benchharness.cc index fef8df56e..59fbba4c5 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -206,7 +206,8 @@ static double RunBenchmarkGetNSPerIteration(const BenchmarkFun& fun, size_t actualEpochs = 0; for (; actualEpochs < epochs; ++actualEpochs) { - for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) { + for (unsigned int n = static_cast(FLAGS_bm_min_iters); + n < (1UL << 30); n *= 2) { auto const nsecs = fun(n); if (nsecs < minNanoseconds) { continue; diff --git a/util/benchharness_test.cc b/util/benchharness_test.cc index 75ff65892..f2c910edb 100644 --- a/util/benchharness_test.cc +++ b/util/benchharness_test.cc @@ -10,35 +10,35 @@ namespace rocksdb { BENCHMARK(insertFrontVector) { - std::vector v; - for (int i = 0; i < 100; i++) { + std::vector v; + for (size_t i = 0; i < 100; i++) { v.insert(v.begin(), i); } } BENCHMARK_RELATIVE(insertBackVector) { - std::vector v; + std::vector v; for (size_t i = 0; i < 100; i++) { v.insert(v.end(), i); } } BENCHMARK_N(insertFrontVector_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } } BENCHMARK_RELATIVE_N(insertBackVector_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.end(), i); } } BENCHMARK_N(insertFrontEnd_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } @@ -48,7 +48,7 @@ BENCHMARK_N(insertFrontEnd_n, n) { } BENCHMARK_RELATIVE_N(insertFrontEndSuspend_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } diff --git a/util/blob_store.cc b/util/blob_store.cc index daaf4bc02..80dfba512 100644 --- a/util/blob_store.cc +++ b/util/blob_store.cc @@ -132,7 +132,9 @@ BlobStore::~BlobStore() { Status BlobStore::Put(const Slice& value, Blob* blob) { // convert size to number of blocks - Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob); + Status s = Allocate( + static_cast((value.size() + block_size_ - 1) / block_size_), + blob); if (!s.ok()) { return s; } diff --git a/util/bloom.cc b/util/bloom.cc index 19d8edead..007d4f273 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -55,7 +55,8 @@ class FullFilterBitsBuilder : public FilterBitsBuilder { // +----------------------------------------------------------------+ virtual Slice Finish(std::unique_ptr* buf) override { uint32_t total_bits, num_lines; - char* data = ReserveSpace(hash_entries_.size(), &total_bits, &num_lines); + char* data = ReserveSpace(static_cast(hash_entries_.size()), + &total_bits, &num_lines); assert(data); if 
(total_bits != 0 && num_lines != 0) { @@ -111,7 +112,7 @@ char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, assert(bits_per_key_); char* data = nullptr; if (num_entry != 0) { - uint32_t total_bits_tmp = num_entry * bits_per_key_; + uint32_t total_bits_tmp = num_entry * static_cast(bits_per_key_); *total_bits = GetTotalBitsForLocality(total_bits_tmp); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); @@ -152,8 +153,9 @@ class FullFilterBitsReader : public FilterBitsReader { public: explicit FullFilterBitsReader(const Slice& contents) : data_(const_cast(contents.data())), - data_len_(contents.size()), - num_probes_(0), num_lines_(0) { + data_len_(static_cast(contents.size())), + num_probes_(0), + num_lines_(0) { assert(data_); GetFilterMeta(contents, &num_probes_, &num_lines_); // Sanitize broken parameter @@ -210,7 +212,7 @@ class FullFilterBitsReader : public FilterBitsReader { void FullFilterBitsReader::GetFilterMeta(const Slice& filter, size_t* num_probes, uint32_t* num_lines) { - uint32_t len = filter.size(); + uint32_t len = static_cast(filter.size()); if (len <= 5) { // filter is empty or broken *num_probes = 0; @@ -225,7 +227,7 @@ void FullFilterBitsReader::GetFilterMeta(const Slice& filter, bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, const Slice& filter, const size_t& num_probes, const uint32_t& num_lines) { - uint32_t len = filter.size(); + uint32_t len = static_cast(filter.size()); if (len <= 5) return false; // remain the same with original filter // It is ensured the params are valid before calling it diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 275592b70..3d8764b7e 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -79,7 +79,8 @@ class BloomTest { key_slices.push_back(Slice(keys_[i])); } filter_.clear(); - policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + policy_->CreateFilter(&key_slices[0], static_cast(key_slices.size()), + &filter_); keys_.clear(); if (kVerbose >= 2) DumpFilter(); } diff --git a/util/cache.cc b/util/cache.cc index 850fdb537..b1d8a19c3 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -145,7 +145,7 @@ class LRUCache { // Separate from constructor so caller can easily make an array of LRUCache void SetCapacity(size_t capacity) { capacity_ = capacity; } - void SetRemoveScanCountLimit(size_t remove_scan_count_limit) { + void SetRemoveScanCountLimit(uint32_t remove_scan_count_limit) { remove_scan_count_limit_ = remove_scan_count_limit; } diff --git a/util/cache_test.cc b/util/cache_test.cc index 74109ff0c..3aba95645 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -28,7 +28,9 @@ static int DecodeKey(const Slice& k) { return DecodeFixed32(k.data()); } static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } -static int DecodeValue(void* v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { + return static_cast(reinterpret_cast(v)); +} class CacheTest { public: diff --git a/util/coding.h b/util/coding.h index fa6652668..a72f7dbec 100644 --- a/util/coding.h +++ b/util/coding.h @@ -157,7 +157,7 @@ inline void PutFixed64(std::string* dst, uint64_t value) { inline void PutVarint32(std::string* dst, uint32_t v) { char buf[5]; char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); + dst->append(buf, static_cast(ptr - buf)); } inline char* EncodeVarint64(char* dst, uint64_t v) { @@ -174,11 +174,11 @@ inline char* EncodeVarint64(char* dst, uint64_t v) { inline void PutVarint64(std::string* dst, uint64_t v) { char buf[10]; char* ptr = 
EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); + dst->append(buf, static_cast(ptr - buf)); } inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); + PutVarint32(dst, static_cast(value.size())); dst->append(value.data(), value.size()); } @@ -219,7 +219,7 @@ inline bool GetVarint32(Slice* input, uint32_t* value) { if (q == nullptr) { return false; } else { - *input = Slice(q, limit - q); + *input = Slice(q, static_cast(limit - q)); return true; } } @@ -231,7 +231,7 @@ inline bool GetVarint64(Slice* input, uint64_t* value) { if (q == nullptr) { return false; } else { - *input = Slice(q, limit - q); + *input = Slice(q, static_cast(limit - q)); return true; } } diff --git a/util/crc32c.cc b/util/crc32c.cc index d27fb4be9..8f1a09e17 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -298,14 +298,14 @@ static inline uint64_t LE_LOAD64(const uint8_t *p) { #endif static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { - uint32_t c = *l ^ LE_LOAD32(*p); + uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ table1_[(c >> 16) & 0xff] ^ table0_[c >> 24]; // DO it twice. - c = *l ^ LE_LOAD32(*p); + c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ @@ -362,7 +362,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { } #undef STEP1 #undef ALIGN - return l ^ 0xffffffffu; + return static_cast(l ^ 0xffffffffu); } // Detect if SS42 or not. diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 6d228e81d..a3d6e0fc7 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -153,15 +153,15 @@ TEST(DynamicBloomTest, perf) { return; } - for (uint64_t m = 1; m <= 8; ++m) { + for (uint32_t m = 1; m <= 8; ++m) { Arena arena; - const uint64_t num_keys = m * 8 * 1024 * 1024; - fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8); + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes); timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { std_bloom.Add(Slice(reinterpret_cast(&i), 8)); } @@ -169,9 +169,9 @@ TEST(DynamicBloomTest, perf) { fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", elapsed / num_keys); - uint64_t count = 0; + uint32_t count = 0; timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { if (std_bloom.MayContain(Slice(reinterpret_cast(&i), 8))) { ++count; } @@ -185,7 +185,7 @@ TEST(DynamicBloomTest, perf) { DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes); timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { blocked_bloom.Add(Slice(reinterpret_cast(&i), 8)); } @@ -196,9 +196,9 @@ TEST(DynamicBloomTest, perf) { count = 0; timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { if (blocked_bloom.MayContain( - Slice(reinterpret_cast(&i), 8))) { + Slice(reinterpret_cast(&i), 8))) { ++count; } } diff --git a/util/env_posix.cc b/util/env_posix.cc index b9987088c..b6d17ce31 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1594,7 +1594,8 @@ class PosixEnv : public Env { void (*function)(void*) = queue_.front().function; void* arg = queue_.front().arg; queue_.pop_front(); - queue_len_.store(queue_.size(), std::memory_order_relaxed); + 
queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); bool decrease_io_priority = (low_io_priority != low_io_priority_); PthreadCall("unlock", pthread_mutex_unlock(&mu_)); @@ -1709,7 +1710,8 @@ class PosixEnv : public Env { queue_.push_back(BGItem()); queue_.back().function = function; queue_.back().arg = arg; - queue_len_.store(queue_.size(), std::memory_order_relaxed); + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); if (!HasExcessiveThread()) { // Wake up at least one waiting thread. diff --git a/util/hash.cc b/util/hash.cc index 37eaa4057..427f0d138 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -18,7 +18,7 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { const uint32_t m = 0xc6a4a793; const uint32_t r = 24; const char* limit = data + n; - uint32_t h = seed ^ (n * m); + uint32_t h = static_cast(seed ^ (n * m)); // Pick up four bytes at a time while (data + 4 <= limit) { diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index 2ee05faac..6a67fab44 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -213,9 +213,10 @@ class HashCuckooRep : public MemTableRep { static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = { 545609244, 1769731426, 763324157, 13099088, 592422103, 1899789565, 248369300, 1984183468, 1613664382, 1491157517}; - return MurmurHash(slice.data(), slice.size(), - kMurmurHashSeeds[hash_func_id]) % - bucket_count_; + return static_cast( + MurmurHash(slice.data(), static_cast(slice.size()), + kMurmurHashSeeds[hash_func_id]) % + bucket_count_); } // A cuckoo path is a sequence of bucket ids, where each id points to a diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 8e5f4025d..4573d8340 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -200,7 +200,8 @@ class HashLinkListRep : public MemTableRep { } size_t GetHash(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + bucket_size_; } Pointer* GetBucket(size_t i) const { diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index f410350e7..1393a917e 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -65,7 +65,8 @@ class HashSkipListRep : public MemTableRep { Arena* const arena_; inline size_t GetHash(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + bucket_size_; } inline Bucket* GetBucket(size_t i) const { return buckets_[i].load(std::memory_order_acquire); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 618c10a35..d35b9412c 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -971,8 +971,9 @@ void DBDumperCommand::DoCommand() { uint64_t s1=0,s2=0; // At this point, bucket_size=0 => time_range=0 - uint64_t num_buckets = (bucket_size >= time_range) ? 1 : - ((time_range + bucket_size - 1) / bucket_size); + int num_buckets = (bucket_size >= time_range) + ? 
1 + : ((time_range + bucket_size - 1) / bucket_size); vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", diff --git a/util/mock_env.cc b/util/mock_env.cc index bfcfeaa0c..5a4c2c325 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -19,9 +19,14 @@ namespace rocksdb { class MemFile { public: - explicit MemFile(const std::string& fn) : - fn_(fn), refs_(0), size_(0), modified_time_(Now()), - rnd_((uint32_t)MurmurHash(fn.data(), fn.size(), 0)), fsynced_bytes_(0) {} + explicit MemFile(const std::string& fn) + : fn_(fn), + refs_(0), + size_(0), + modified_time_(Now()), + rnd_(static_cast( + MurmurHash(fn.data(), static_cast(fn.size()), 0))), + fsynced_bytes_(0) {} void Ref() { MutexLock lock(&mutex_); @@ -61,7 +66,8 @@ class MemFile { return; } uint64_t buffered_bytes = size_ - fsynced_bytes_; - uint64_t start = fsynced_bytes_ + rnd_.Uniform(buffered_bytes); + uint64_t start = + fsynced_bytes_ + rnd_.Uniform(static_cast(buffered_bytes)); uint64_t end = std::min(start + 512, size_.load()); MutexLock lock(&mutex_); for (uint64_t pos = start; pos < end; ++pos) { diff --git a/util/murmurhash.h b/util/murmurhash.h index faa86556d..40ee357a7 100644 --- a/util/murmurhash.h +++ b/util/murmurhash.h @@ -36,7 +36,7 @@ typedef unsigned int murmur_t; namespace rocksdb { struct murmur_hash { size_t operator()(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0); + return MurmurHash(slice.data(), static_cast(slice.size()), 0); } }; } // rocksdb diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 1bc8a5b7d..c5f4c60b3 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -110,7 +110,7 @@ void MutableCFOptions::Dump(Logger* log) const { expanded_compaction_factor); Log(log, " source_compaction_factor: %d", source_compaction_factor); - Log(log, " target_file_size_base: %d", + Log(log, " target_file_size_base: %" PRIu64, target_file_size_base); Log(log, " target_file_size_multiplier: %d", target_file_size_multiplier); diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 831b0d786..40938655b 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -108,7 +108,7 @@ struct MutableCFOptions { int max_grandparent_overlap_factor; int expanded_compaction_factor; int source_compaction_factor; - int target_file_size_base; + uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; int max_bytes_for_level_multiplier; diff --git a/util/options_builder.cc b/util/options_builder.cc index 06ce670f0..12130db52 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -95,7 +95,7 @@ void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) { options->write_buffer_size = write_buffer_size; options->max_write_buffer_number = - total_write_buffer_limit / write_buffer_size; + static_cast(total_write_buffer_limit / write_buffer_size); options->min_write_buffer_number_to_merge = 1; } @@ -147,10 +147,10 @@ void OptimizeForLevel(int read_amplification_threshold, // This doesn't consider compaction and overheads of mem tables. But usually // it is in the same order of magnitude. - int expected_level0_compaction_size = + size_t expected_level0_compaction_size = options->level0_file_num_compaction_trigger * options->write_buffer_size; // Enlarge level1 target file size if level0 compaction size is larger. 
- int max_bytes_for_level_base = 10 * kBytesForOneMb; + uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb; if (expected_level0_compaction_size > max_bytes_for_level_base) { max_bytes_for_level_base = expected_level0_compaction_size; } @@ -160,7 +160,7 @@ void OptimizeForLevel(int read_amplification_threshold, const int kMinFileSize = 2 * kBytesForOneMb; // Allow at least 3-way parallelism for compaction between level 1 and 2. - int max_file_size = max_bytes_for_level_base / 3; + uint64_t max_file_size = max_bytes_for_level_base / 3; if (max_file_size < kMinFileSize) { options->target_file_size_base = kMinFileSize; } else { diff --git a/util/options_helper.cc b/util/options_helper.cc index 268a67a99..0c7c5d7a1 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -40,12 +40,10 @@ bool ParseBoolean(const std::string& type, const std::string& value) { throw type; } } -uint32_t ParseInt(const std::string& value) { - return std::stoi(value); -} +int ParseInt(const std::string& value) { return std::stoi(value); } uint32_t ParseUint32(const std::string& value) { - return std::stoul(value); + return static_cast(std::stoul(value)); } uint64_t ParseUint64(const std::string& value) { @@ -82,9 +80,9 @@ bool ParseMemtableOptions(const std::string& name, const std::string& value, } else if (name == "arena_block_size") { new_options->arena_block_size = ParseInt64(value); } else if (name == "memtable_prefix_bloom_bits") { - new_options->memtable_prefix_bloom_bits = stoul(value); + new_options->memtable_prefix_bloom_bits = ParseUint32(value); } else if (name == "memtable_prefix_bloom_probes") { - new_options->memtable_prefix_bloom_probes = stoul(value); + new_options->memtable_prefix_bloom_probes = ParseUint32(value); } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { new_options->memtable_prefix_bloom_huge_page_tlb_size = ParseInt64(value); diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 3da2627e7..2beefd58f 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -47,7 +47,8 @@ GenericRateLimiter::GenericRateLimiter( GenericRateLimiter::~GenericRateLimiter() { MutexLock g(&request_mutex_); stop_ = true; - requests_to_wait_ = queue_[Env::IO_LOW].size() + queue_[Env::IO_HIGH].size(); + requests_to_wait_ = static_cast(queue_[Env::IO_LOW].size() + + queue_[Env::IO_HIGH].size()); for (auto& r : queue_[Env::IO_HIGH]) { r->cv.Signal(); } diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index cdeca578d..269582ff1 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -30,12 +30,12 @@ TEST(RateLimiterTest, StartStop) { TEST(RateLimiterTest, Rate) { auto* env = Env::Default(); struct Arg { - Arg(int64_t _target_rate, int _burst) + Arg(int32_t _target_rate, int _burst) : limiter(new GenericRateLimiter(_target_rate, 100 * 1000, 10)), request_size(_target_rate / 10), burst(_burst) {} std::unique_ptr limiter; - int64_t request_size; + int32_t request_size; int burst; }; @@ -51,13 +51,12 @@ TEST(RateLimiterTest, Rate) { arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_HIGH); } - arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, - Env::IO_LOW); + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW); } }; for (int i = 1; i <= 16; i*=2) { - int64_t target = i * 1024 * 10; + int32_t target = i * 1024 * 10; Arg arg(target, i / 4 + 1); auto start = env->NowMicros(); for (int t = 0; t < i; ++t) { @@ -68,7 +67,7 @@ TEST(RateLimiterTest, Rate) { auto elapsed = env->NowMicros() - start; 
double rate = arg.limiter->GetTotalBytesThrough() * 1000000.0 / elapsed; - fprintf(stderr, "request size [1 - %" PRIi64 "], limit %" PRIi64 + fprintf(stderr, "request size [1 - %" PRIi32 "], limit %" PRIi32 " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n", arg.request_size - 1, target / 1024, rate / 1024, elapsed / 1000000.0); diff --git a/util/status.cc b/util/status.cc index fa8e18acf..f0112d3e1 100644 --- a/util/status.cc +++ b/util/status.cc @@ -23,8 +23,8 @@ const char* Status::CopyState(const char* state) { Status::Status(Code _code, const Slice& msg, const Slice& msg2) : code_(_code) { assert(code_ != kOk); - const uint32_t len1 = msg.size(); - const uint32_t len2 = msg2.size(); + const uint32_t len1 = static_cast(msg.size()); + const uint32_t len2 = static_cast(msg2.size()); const uint32_t size = len1 + (len2 ? (2 + len2) : 0); char* result = new char[size + 4]; memcpy(result, &size, sizeof(size)); diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 20ec9db85..98fd0e34c 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -54,7 +54,8 @@ class BackupRateLimiter { (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_; if (should_take_micros > interval) { - env_->SleepForMicroseconds(should_take_micros - interval); + env_->SleepForMicroseconds( + static_cast(should_take_micros - interval)); now = env_->NowMicros(); } // reset interval @@ -165,9 +166,7 @@ class BackupEngineImpl : public BackupEngine { uint64_t GetSize() const { return size_; } - uint32_t GetNumberFiles() { - return files_.size(); - } + uint32_t GetNumberFiles() { return static_cast(files_.size()); } void SetSequenceNumber(uint64_t sequence_number) { sequence_number_ = sequence_number; } diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index fc387b4ca..56cfa954e 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -397,10 +397,10 @@ std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) { // Convert a quadkey to a tile and its level of detail // void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile, - int *levelOfDetail) { + int* levelOfDetail) { tile->x = tile->y = 0; - *levelOfDetail = quadkey.size(); - const char* key = reinterpret_cast(quadkey.c_str()); + *levelOfDetail = static_cast(quadkey.size()); + const char* key = reinterpret_cast(quadkey.c_str()); for (int i = *levelOfDetail; i > 0; i--) { int mask = 1 << (i - 1); switch (key[*levelOfDetail - i]) { diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h index b776ada24..6d0b1a6af 100644 --- a/utilities/redis/redis_list_iterator.h +++ b/utilities/redis/redis_list_iterator.h @@ -67,7 +67,7 @@ class RedisListIterator { /// attempted, a RedisListException will immediately be thrown. 
RedisListIterator(const std::string& list_data) : data_(list_data.data()), - num_bytes_(list_data.size()), + num_bytes_(static_cast(list_data.size())), cur_byte_(0), cur_elem_(0), cur_elem_length_(0), @@ -135,11 +135,11 @@ class RedisListIterator { // Ensure we are in a valid state CheckErrors(); - const int kOrigSize = result_.size(); + const int kOrigSize = static_cast(result_.size()); result_.resize(kOrigSize + SizeOf(elem)); - EncodeFixed32(result_.data() + kOrigSize, elem.size()); - memcpy(result_.data() + kOrigSize + sizeof(uint32_t), - elem.data(), + EncodeFixed32(result_.data() + kOrigSize, + static_cast(elem.size())); + memcpy(result_.data() + kOrigSize + sizeof(uint32_t), elem.data(), elem.size()); ++length_; ++cur_elem_; @@ -169,7 +169,7 @@ class RedisListIterator { int Size() const { // result_ holds the currently written data // data_[cur_byte..num_bytes-1] is the remainder of the data - return result_.size() + (num_bytes_ - cur_byte_); + return static_cast(result_.size() + (num_bytes_ - cur_byte_)); } // Reached the end? @@ -209,7 +209,7 @@ class RedisListIterator { /// E.G. This can be used to compute the bytes we want to Reserve(). static uint32_t SizeOf(const Slice& elem) { // [Integer Length . Data] - return sizeof(uint32_t) + elem.size(); + return static_cast(sizeof(uint32_t) + elem.size()); } private: // Private functions diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc index b05c6c798..302f02d7c 100644 --- a/utilities/redis/redis_lists_test.cc +++ b/utilities/redis/redis_lists_test.cc @@ -745,9 +745,9 @@ TEST(RedisListsTest, PersistenceMultiKeyTest) { namespace { void MakeUpper(std::string* const s) { - int len = s->length(); - for(int i=0; i + int len = static_cast(s->length()); + for (int i = 0; i < len; ++i) { + (*s)[i] = toupper((*s)[i]); // C-version defined in } } diff --git a/utilities/spatialdb/utils.h b/utilities/spatialdb/utils.h index eaf3c9b4e..c65ccf561 100644 --- a/utilities/spatialdb/utils.h +++ b/utilities/spatialdb/utils.h @@ -27,7 +27,7 @@ inline uint64_t GetQuadKeyFromTile(uint64_t tile_x, uint64_t tile_y, uint32_t tile_bits) { uint64_t quad_key = 0; for (uint32_t i = 0; i < tile_bits; ++i) { - uint32_t mask = (1LL << i); + uint64_t mask = static_cast(1LL << i); quad_key |= (tile_x & mask) << i; quad_key |= (tile_y & mask) << (i + 1); } diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index d1c1235c3..66c8db50c 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -94,7 +94,8 @@ class TtlTest { void MakeKVMap(int64_t num_entries) { kvmap_.clear(); int digits = 1; - for (int dummy = num_entries; dummy /= 10 ; ++digits); + for (int64_t dummy = num_entries; dummy /= 10; ++digits) { + } int digits_in_i = 1; for (int64_t i = 0; i < num_entries; i++) { std::string key = "key"; @@ -110,17 +111,18 @@ class TtlTest { AppendNumberTo(&value, i); kvmap_[key] = value; } - ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done + ASSERT_EQ(static_cast(kvmap_.size()), + num_entries); // check all insertions done } // Makes a write-batch with key-vals from kvmap_ and 'Write''s it - void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) { - ASSERT_LE(num_ops, (int)kvmap_.size()); + void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) { + ASSERT_LE(num_ops, static_cast(kvmap_.size())); static WriteOptions wopts; static FlushOptions flush_opts; WriteBatch batch; kv_it_ = kvmap_.begin(); - for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); 
i++, ++kv_it_) { + for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) { switch (batch_ops[i]) { case PUT: batch.Put(kv_it_->first, kv_it_->second); @@ -137,15 +139,16 @@ class TtlTest { } // Puts num_entries starting from start_pos_map from kvmap_ into the database - void PutValues(int start_pos_map, int num_entries, bool flush = true, + void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true, ColumnFamilyHandle* cf = nullptr) { ASSERT_TRUE(db_ttl_); - ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size()); + ASSERT_LE(start_pos_map + num_entries, static_cast(kvmap_.size())); static WriteOptions wopts; static FlushOptions flush_opts; kv_it_ = kvmap_.begin(); advance(kv_it_, start_pos_map); - for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, ++kv_it_) { + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries; + i++, ++kv_it_) { ASSERT_OK(cf == nullptr ? db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); @@ -196,8 +199,8 @@ class TtlTest { // Gets should return true if check is true and false otherwise // Also checks that value that we got is the same as inserted; and =kNewValue // if test_compaction_change is true - void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true, - bool test_compaction_change = false, + void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span, + bool check = true, bool test_compaction_change = false, ColumnFamilyHandle* cf = nullptr) { ASSERT_TRUE(db_ttl_); @@ -207,7 +210,7 @@ class TtlTest { kv_it_ = kvmap_.begin(); advance(kv_it_, st_pos); std::string v; - for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { Status s = (cf == nullptr) ? db_ttl_->Get(ropts, kv_it_->first, &v) : db_ttl_->Get(ropts, cf, kv_it_->first, &v); if (s.ok() != check) { @@ -235,7 +238,8 @@ class TtlTest { } // Similar as SleepCompactCheck but uses TtlIterator to read from db - void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) { + void SleepCompactCheckIter(int slp, int st_pos, int64_t span, + bool check = true) { ASSERT_TRUE(db_ttl_); env_->Sleep(slp); ManualCompact(); @@ -250,9 +254,8 @@ class TtlTest { ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); } } else { // dbiter should have found out kvmap_[st_pos] - for (int i = st_pos; - kv_it_ != kvmap_.end() && i < st_pos + span; - i++, ++kv_it_) { + for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span; + i++, ++kv_it_) { ASSERT_TRUE(dbiter->Valid()); ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); dbiter->Next(); @@ -288,7 +291,7 @@ class TtlTest { return false; // Keep keys not matching the format "key" } - int partition = kSampleSize_ / 3; + int64_t partition = kSampleSize_ / 3; if (num_key_end < partition) { return true; } else if (num_key_end < partition * 2) { @@ -352,8 +355,8 @@ class TtlTest { // Partitions the sample-size provided into 3 sets over boundary1 and boundary2 TEST(TtlTest, NoEffect) { MakeKVMap(kSampleSize_); - int boundary1 = kSampleSize_ / 3; - int boundary2 = 2 * boundary1; + int64_t boundary1 = kSampleSize_ / 3; + int64_t boundary2 = 2 * boundary1; OpenTtl(); PutValues(0, boundary1); //T=0: Set1 never deleted @@ -510,9 +513,9 @@ TEST(TtlTest, CompactionFilter) { OpenTtlWithTestCompaction(3); PutValues(0, kSampleSize_); // T=0:Insert Set1. 
- int partition = kSampleSize_ / 3; - SleepCompactCheck(1, 0, partition, false); // Part dropped - SleepCompactCheck(0, partition, partition); // Part kept + int64_t partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed CloseTtl(); } diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 8667079d3..b573699db 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -596,7 +596,7 @@ TEST(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { break; case 2: { // Seek to random key - auto key_idx = rnd.Uniform(source_strings.size()); + auto key_idx = rnd.Uniform(static_cast(source_strings.size())); auto key = source_strings[key_idx]; iter->Seek(key); result_iter->Seek(key); From 1f621e6abc982d2b316ee692b35ec4a1e192789c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 11 Nov 2014 14:09:10 -0800 Subject: [PATCH 458/829] Fix additional -Wshorten-64-to-32 errros --- util/posix_logger.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/util/posix_logger.h b/util/posix_logger.h index 6aba769f1..e4a2c8456 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -123,14 +123,15 @@ class PosixLogger : public Logger { // space, pre-allocate more space to avoid overly large // allocations from filesystem allocsize options. const size_t log_size = log_size_; - const int last_allocation_chunk = + const size_t last_allocation_chunk = ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize); - const int desired_allocation_chunk = + const size_t desired_allocation_chunk = ((kDebugLogChunkSize - 1 + log_size + write_size) / kDebugLogChunkSize); if (last_allocation_chunk != desired_allocation_chunk) { - fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, - desired_allocation_chunk * kDebugLogChunkSize); + fallocate( + fd_, FALLOC_FL_KEEP_SIZE, 0, + static_cast(desired_allocation_chunk * kDebugLogChunkSize)); } #endif From 5811419357a55f7b95b70571195647a715e6249f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 11 Nov 2014 14:28:18 -0800 Subject: [PATCH 459/829] Fixed GetEstimatedActiveKeys Summary: Fixed a bug in GetEstimatedActiveKeys which does not normalized the sampled information correctly. Add a test in version_builder_test. 
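Note (illustrative, not part of the original commit message): the fixed estimate extrapolates the sampled counts over all files, roughly estimated_keys = (accumulated_num_non_deletions_ - accumulated_num_deletions_) * total_file_count / num_samples_, where total_file_count now sums files_[level].size() over every level instead of dereferencing files_ directly. As a hedged sanity check using the constants from the new test (40 files of 1000 entries with 100 deletions each, 20 files sampled), and assuming non-deletion entries are counted as num_entries - num_deletions, the samples accumulate 20 * 900 = 18000 non-deletions and 20 * 100 = 2000 deletions, giving (18000 - 2000) * 40 / 20 = 32000, which matches the asserted (1000 - 2 * 100) * 40.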
Test Plan: version_builder_test Reviewers: ljin, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28707 --- db/version_builder_test.cc | 92 +++++++++++++++++++++++++------------- db/version_set.cc | 9 +++- 2 files changed, 68 insertions(+), 33 deletions(-) diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index fcf32ce60..66fcdcdae 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -14,31 +14,31 @@ namespace rocksdb { class VersionBuilderTest { public: - const Comparator* ucmp; - InternalKeyComparator icmp; - Options options; - ImmutableCFOptions ioptions; - MutableCFOptions mutable_cf_options; - VersionStorageInfo vstorage; - uint32_t file_num; - CompactionOptionsFIFO fifo_options; - std::vector size_being_compacted; + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; + VersionStorageInfo vstorage_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::vector size_being_compacted_; VersionBuilderTest() - : ucmp(BytewiseComparator()), - icmp(ucmp), - ioptions(options), - mutable_cf_options(options, ioptions), - vstorage(&icmp, ucmp, options.num_levels, kCompactionStyleLevel, + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + ioptions_(options_), + mutable_cf_options_(options_, ioptions_), + vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr), - file_num(1) { - mutable_cf_options.RefreshDerivedOptions(ioptions); - size_being_compacted.resize(options.num_levels); + file_num_(1) { + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + size_being_compacted_.resize(options_.num_levels); } ~VersionBuilderTest() { - for (int i = 0; i < vstorage.num_levels(); i++) { - for (auto* f : vstorage.LevelFiles(i)) { + for (int i = 0; i < vstorage_.num_levels(); i++) { + for (auto* f : vstorage_.LevelFiles(i)) { if (--f->refs == 0) { delete f; } @@ -54,25 +54,33 @@ class VersionBuilderTest { void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, - SequenceNumber largest_seq = 100) { - assert(level < vstorage.num_levels()); + SequenceNumber largest_seq = 100, + uint64_t num_entries = 0, uint64_t num_deletions = 0, + bool sampled = false) { + assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = GetInternalKey(smallest, smallest_seq); f->largest = GetInternalKey(largest, largest_seq); f->compensated_file_size = file_size; f->refs = 0; - vstorage.MaybeAddFile(level, f); + f->num_entries = num_entries; + f->num_deletions = num_deletions; + vstorage_.MaybeAddFile(level, f); + if (sampled) { + f->init_stats_from_file = true; + vstorage_.UpdateAccumulatedStats(f); + } } void UpdateVersionStorageInfo() { - vstorage.ComputeCompactionScore(mutable_cf_options, fifo_options, - size_being_compacted); - vstorage.UpdateFilesBySize(); - vstorage.UpdateNumNonEmptyLevels(); - vstorage.GenerateFileIndexer(); - vstorage.GenerateLevelFilesBrief(); - vstorage.SetFinalized(); + vstorage_.ComputeCompactionScore(mutable_cf_options_, fifo_options_, + size_being_compacted_); + vstorage_.UpdateFilesBySize(); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.SetFinalized(); } }; @@ -99,9 +107,9 @@ 
TEST(VersionBuilderTest, ApplyAndSaveTo) { EnvOptions env_options; - VersionBuilder version_builder(env_options, nullptr, &vstorage); + VersionBuilder version_builder(env_options, nullptr, &vstorage_); - VersionStorageInfo new_vstorage(&icmp, ucmp, options.num_levels, + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr); version_builder.Apply(&version_edit); version_builder.SaveTo(&new_vstorage); @@ -118,6 +126,28 @@ TEST(VersionBuilderTest, ApplyAndSaveTo) { } } +TEST(VersionBuilderTest, EstimatedActiveKeys) { + const uint64_t kTotalSamples = 20; + const uint64_t kNumLevels = 5; + const uint64_t kFilesPerLevel = 8; + const uint64_t kNumFiles = kNumLevels * kFilesPerLevel; + const uint64_t kEntriesPerFile = 1000; + const uint64_t kDeletionsPerFile = 100; + for (uint64_t i = 0; i < kNumFiles; ++i) { + Add(i / kFilesPerLevel, i + 1, + std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), + 100U, 0, 100, 100, + kEntriesPerFile, kDeletionsPerFile, + (i < kTotalSamples)); + } + // minus 2X for the number of deletion entries because: + // 1x for deletion entry does not count as a data entry. + // 1x for each deletion entry will actually remove one data entry. + ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(), + (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles); +} + } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/version_set.cc b/db/version_set.cc index 83b93e36b..a1954bddb 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -658,11 +658,16 @@ uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { return 0; } - if (num_samples_ < files_->size()) { + uint64_t file_count = 0; + for (int level = 0; level < num_levels_; ++level) { + file_count += files_[level].size(); + } + + if (num_samples_ < file_count) { // casting to avoid overflowing return static_cast(static_cast( accumulated_num_non_deletions_ - accumulated_num_deletions_) * - files_->size() / num_samples_); + static_cast(file_count) / num_samples_); } else { return accumulated_num_non_deletions_ - accumulated_num_deletions_; } From 9759495229aa6ffe300370c5b52790854d88fbad Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 11 Nov 2014 15:22:06 -0800 Subject: [PATCH 460/829] Fixed clang compile error in version_builder_test Summary: Fixed clang compile error in version_builder_test Test Plan: ./version_builder_test Reviewers: igor, sdong Reviewed By: sdong Subscribers: sdong, dhruba Differential Revision: https://reviews.facebook.net/D28731 --- db/version_builder_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 66fcdcdae..0aa675214 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -127,14 +127,14 @@ TEST(VersionBuilderTest, ApplyAndSaveTo) { } TEST(VersionBuilderTest, EstimatedActiveKeys) { - const uint64_t kTotalSamples = 20; - const uint64_t kNumLevels = 5; - const uint64_t kFilesPerLevel = 8; - const uint64_t kNumFiles = kNumLevels * kFilesPerLevel; - const uint64_t kEntriesPerFile = 1000; - const uint64_t kDeletionsPerFile = 100; - for (uint64_t i = 0; i < kNumFiles; ++i) { - Add(i / kFilesPerLevel, i + 1, + const uint32_t kTotalSamples = 20; + const uint32_t kNumLevels = 5; + const uint32_t kFilesPerLevel = 8; + const uint32_t kNumFiles = kNumLevels * kFilesPerLevel; + const uint32_t kEntriesPerFile = 1000; + const uint32_t 
kDeletionsPerFile = 100; + for (uint32_t i = 0; i < kNumFiles; ++i) { + Add(static_cast(i / kFilesPerLevel), i + 1, std::to_string((i + 100) * 1000).c_str(), std::to_string((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100, From 0345c2156f23d5b91d62bf45e320f93e3ac556ea Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 28 Oct 2014 22:38:08 +0100 Subject: [PATCH 461/829] [RocksJava] Extend Options with ColumnFamilyOptions implementation ColumnFamilyOptions implementation with tests [RocksJava] Extended ColumnFamilyTest Summary: Options Refactoring split part 3 Test Plan: make rocksdbjava make jtest Reviewers: yhchiang, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28023 --- java/Makefile | 4 +- java/org/rocksdb/ColumnFamilyOptions.java | 647 ++++++++++++++++++ java/org/rocksdb/Options.java | 14 + .../rocksdb/test/ColumnFamilyOptionsTest.java | 229 +++++++ java/org/rocksdb/test/MixedOptionsTest.java | 51 ++ java/org/rocksdb/test/OptionsTest.java | 206 +----- java/rocksjni/options.cc | 5 +- 7 files changed, 947 insertions(+), 209 deletions(-) create mode 100644 java/org/rocksdb/ColumnFamilyOptions.java create mode 100644 java/org/rocksdb/test/ColumnFamilyOptionsTest.java create mode 100644 java/org/rocksdb/test/MixedOptionsTest.java diff --git a/java/Makefile b/java/Makefile index 9fd714ee9..ef49f3fc9 100644 --- a/java/Makefile +++ b/java/Makefile @@ -77,14 +77,16 @@ test: java java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BlockBasedTableConfigTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DBOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FlushTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest + java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MixedOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.PlainTableConfigTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.RocksIteratorTest java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.SnapshotTest diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/org/rocksdb/ColumnFamilyOptions.java new file mode 100644 index 000000000..9ce1e9a98 --- /dev/null +++ b/java/org/rocksdb/ColumnFamilyOptions.java @@ -0,0 +1,647 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * ColumnFamilyOptions to control the behavior of a database. It will be used + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). + * + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. + */ +public class ColumnFamilyOptions extends RocksObject + implements ColumnFamilyOptionsInterface { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct ColumnFamilyOptions. + * + * This constructor will create (by allocating a block of memory) + * an {@code rocksdb::DBOptions} in the c++ side. + */ + public ColumnFamilyOptions() { + super(); + newColumnFamilyOptions(); + } + + @Override + public ColumnFamilyOptions optimizeForPointLookup( + long blockCacheSizeMb) { + optimizeForPointLookup(nativeHandle_, + blockCacheSizeMb); + return this; + } + + @Override + public ColumnFamilyOptions optimizeLevelStyleCompaction() { + optimizeLevelStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public ColumnFamilyOptions optimizeLevelStyleCompaction( + long memtableMemoryBudget) { + optimizeLevelStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public ColumnFamilyOptions optimizeUniversalStyleCompaction() { + optimizeUniversalStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public ColumnFamilyOptions optimizeUniversalStyleCompaction( + long memtableMemoryBudget) { + optimizeUniversalStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public ColumnFamilyOptions setComparator(BuiltinComparator builtinComparator) { + assert(isInitialized()); + setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); + return this; + } + + @Override + public ColumnFamilyOptions setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } + + @Override + public ColumnFamilyOptions setMergeOperatorName(String name) { + setMergeOperatorName(nativeHandle_, name); + return this; + } + + @Override + public ColumnFamilyOptions setMergeOperator(MergeOperator mergeOperator) { + setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); + return this; + } + + @Override + public ColumnFamilyOptions setWriteBufferSize(long writeBufferSize) + throws RocksDBException { + assert(isInitialized()); + setWriteBufferSize(nativeHandle_, writeBufferSize); + return this; + } + + @Override + public long writeBufferSize() { + assert(isInitialized()); + return writeBufferSize(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxWriteBufferNumber( + int maxWriteBufferNumber) { + assert(isInitialized()); + setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); + return this; + } + + @Override + public int maxWriteBufferNumber() { + assert(isInitialized()); + return maxWriteBufferNumber(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMinWriteBufferNumberToMerge( + int minWriteBufferNumberToMerge) { + setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); + return this; + } + + @Override + public int minWriteBufferNumberToMerge() { + return minWriteBufferNumberToMerge(nativeHandle_); + } + + 
@Override + public ColumnFamilyOptions useFixedLengthPrefixExtractor(int n) { + assert(isInitialized()); + useFixedLengthPrefixExtractor(nativeHandle_, n); + return this; + } + + @Override + public ColumnFamilyOptions setCompressionType(CompressionType compressionType) { + setCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + @Override + public CompressionType compressionType() { + return CompressionType.values()[compressionType(nativeHandle_)]; + } + + @Override + public ColumnFamilyOptions setNumLevels(int numLevels) { + setNumLevels(nativeHandle_, numLevels); + return this; + } + + @Override + public int numLevels() { + return numLevels(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroFileNumCompactionTrigger( + int numFiles) { + setLevelZeroFileNumCompactionTrigger( + nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroFileNumCompactionTrigger() { + return levelZeroFileNumCompactionTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroSlowdownWritesTrigger( + int numFiles) { + setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroSlowdownWritesTrigger() { + return levelZeroSlowdownWritesTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroStopWritesTrigger(int numFiles) { + setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroStopWritesTrigger() { + return levelZeroStopWritesTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxMemCompactionLevel( + int maxMemCompactionLevel) { + setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); + return this; + } + + @Override + public int maxMemCompactionLevel() { + return maxMemCompactionLevel(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTargetFileSizeBase(long targetFileSizeBase) { + setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); + return this; + } + + @Override + public long targetFileSizeBase() { + return targetFileSizeBase(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTargetFileSizeMultiplier(int multiplier) { + setTargetFileSizeMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public int targetFileSizeMultiplier() { + return targetFileSizeMultiplier(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxBytesForLevelBase( + long maxBytesForLevelBase) { + setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); + return this; + } + + @Override + public long maxBytesForLevelBase() { + return maxBytesForLevelBase(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxBytesForLevelMultiplier(int multiplier) { + setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public int maxBytesForLevelMultiplier() { + return maxBytesForLevelMultiplier(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setExpandedCompactionFactor(int expandedCompactionFactor) { + setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); + return this; + } + + @Override + public int expandedCompactionFactor() { + return expandedCompactionFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setSourceCompactionFactor(int sourceCompactionFactor) { + setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); + return this; + } + + @Override + public int sourceCompactionFactor() { + return 
sourceCompactionFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor) { + setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); + return this; + } + + @Override + public int maxGrandparentOverlapFactor() { + return maxGrandparentOverlapFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setSoftRateLimit(double softRateLimit) { + setSoftRateLimit(nativeHandle_, softRateLimit); + return this; + } + + @Override + public double softRateLimit() { + return softRateLimit(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setHardRateLimit(double hardRateLimit) { + setHardRateLimit(nativeHandle_, hardRateLimit); + return this; + } + + @Override + public double hardRateLimit() { + return hardRateLimit(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds) { + setRateLimitDelayMaxMilliseconds( + nativeHandle_, rateLimitDelayMaxMilliseconds); + return this; + } + + @Override + public int rateLimitDelayMaxMilliseconds() { + return rateLimitDelayMaxMilliseconds(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setArenaBlockSize(long arenaBlockSize) + throws RocksDBException { + setArenaBlockSize(nativeHandle_, arenaBlockSize); + return this; + } + + @Override + public long arenaBlockSize() { + return arenaBlockSize(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setDisableAutoCompactions(boolean disableAutoCompactions) { + setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); + return this; + } + + @Override + public boolean disableAutoCompactions() { + return disableAutoCompactions(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush) { + setPurgeRedundantKvsWhileFlush( + nativeHandle_, purgeRedundantKvsWhileFlush); + return this; + } + + @Override + public boolean purgeRedundantKvsWhileFlush() { + return purgeRedundantKvsWhileFlush(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setCompactionStyle(CompactionStyle compactionStyle) { + setCompactionStyle(nativeHandle_, compactionStyle.getValue()); + return this; + } + + @Override + public CompactionStyle compactionStyle() { + return CompactionStyle.values()[compactionStyle(nativeHandle_)]; + } + + @Override + public ColumnFamilyOptions setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction) { + setVerifyChecksumsInCompaction( + nativeHandle_, verifyChecksumsInCompaction); + return this; + } + + @Override + public boolean verifyChecksumsInCompaction() { + return verifyChecksumsInCompaction(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setFilterDeletes(boolean filterDeletes) { + setFilterDeletes(nativeHandle_, filterDeletes); + return this; + } + + @Override + public boolean filterDeletes() { + return filterDeletes(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { + setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + return this; + } + + @Override + public long maxSequentialSkipInIterations() { + return maxSequentialSkipInIterations(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemTableConfig(MemTableConfig config) + throws RocksDBException { + memTableConfig_ = config; + setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); + return 
this; + } + + @Override + public String memTableFactoryName() { + assert(isInitialized()); + return memTableFactoryName(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTableFormatConfig(TableFormatConfig config) { + tableFormatConfig_ = config; + setTableFactory(nativeHandle_, config.newTableFactoryHandle()); + return this; + } + + @Override + public String tableFactoryName() { + assert(isInitialized()); + return tableFactoryName(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setInplaceUpdateSupport(boolean inplaceUpdateSupport) { + setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); + return this; + } + + @Override + public boolean inplaceUpdateSupport() { + return inplaceUpdateSupport(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException { + setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); + return this; + } + + @Override + public long inplaceUpdateNumLocks() { + return inplaceUpdateNumLocks(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { + setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); + return this; + } + + @Override + public int memtablePrefixBloomBits() { + return memtablePrefixBloomBits(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { + setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); + return this; + } + + @Override + public int memtablePrefixBloomProbes() { + return memtablePrefixBloomProbes(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setBloomLocality(int bloomLocality) { + setBloomLocality(nativeHandle_, bloomLocality); + return this; + } + + @Override + public int bloomLocality() { + return bloomLocality(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException { + setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); + return this; + } + + @Override + public long maxSuccessiveMerges() { + return maxSuccessiveMerges(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMinPartialMergeOperands(int minPartialMergeOperands) { + setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); + return this; + } + + @Override + public int minPartialMergeOperands() { + return minPartialMergeOperands(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. 
+ */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newColumnFamilyOptions(); + private native void disposeInternal(long handle); + + private native void optimizeForPointLookup(long handle, + long blockCacheSizeMb); + private native void optimizeLevelStyleCompaction(long handle, + long memtableMemoryBudget); + private native void optimizeUniversalStyleCompaction(long handle, + long memtableMemoryBudget); + private native void setComparatorHandle(long handle, int builtinComparator); + private native void setComparatorHandle(long optHandle, long comparatorHandle); + private native void setMergeOperatorName( + long handle, String name); + private native void setMergeOperator( + long handle, long mergeOperatorHandle); + private native void setWriteBufferSize(long handle, long writeBufferSize) + throws RocksDBException; + private native long writeBufferSize(long handle); + private native void setMaxWriteBufferNumber( + long handle, int maxWriteBufferNumber); + private native int maxWriteBufferNumber(long handle); + private native void setMinWriteBufferNumberToMerge( + long handle, int minWriteBufferNumberToMerge); + private native int minWriteBufferNumberToMerge(long handle); + private native void setCompressionType(long handle, byte compressionType); + private native byte compressionType(long handle); + private native void useFixedLengthPrefixExtractor( + long handle, int prefixLength); + private native void setNumLevels( + long handle, int numLevels); + private native int numLevels(long handle); + private native void setLevelZeroFileNumCompactionTrigger( + long handle, int numFiles); + private native int levelZeroFileNumCompactionTrigger(long handle); + private native void setLevelZeroSlowdownWritesTrigger( + long handle, int numFiles); + private native int levelZeroSlowdownWritesTrigger(long handle); + private native void setLevelZeroStopWritesTrigger( + long handle, int numFiles); + private native int levelZeroStopWritesTrigger(long handle); + private native void setMaxMemCompactionLevel( + long handle, int maxMemCompactionLevel); + private native int maxMemCompactionLevel(long handle); + private native void setTargetFileSizeBase( + long handle, long targetFileSizeBase); + private native long targetFileSizeBase(long handle); + private native void setTargetFileSizeMultiplier( + long handle, int multiplier); + private native int targetFileSizeMultiplier(long handle); + private native void setMaxBytesForLevelBase( + long handle, long maxBytesForLevelBase); + private native long maxBytesForLevelBase(long handle); + private native void setMaxBytesForLevelMultiplier( + long handle, int multiplier); + private native int maxBytesForLevelMultiplier(long handle); + private native void setExpandedCompactionFactor( + long handle, int expandedCompactionFactor); + private native int expandedCompactionFactor(long handle); + private native void setSourceCompactionFactor( + long handle, int sourceCompactionFactor); + private native int sourceCompactionFactor(long handle); + private native void setMaxGrandparentOverlapFactor( + long handle, int maxGrandparentOverlapFactor); + private native int maxGrandparentOverlapFactor(long handle); + private native void setSoftRateLimit( + long handle, double softRateLimit); + private native double softRateLimit(long handle); + private native void setHardRateLimit( + long handle, double hardRateLimit); + private native double hardRateLimit(long handle); + private native void 
setRateLimitDelayMaxMilliseconds( + long handle, int rateLimitDelayMaxMilliseconds); + private native int rateLimitDelayMaxMilliseconds(long handle); + private native void setArenaBlockSize( + long handle, long arenaBlockSize) throws RocksDBException; + private native long arenaBlockSize(long handle); + private native void setDisableAutoCompactions( + long handle, boolean disableAutoCompactions); + private native boolean disableAutoCompactions(long handle); + private native void setCompactionStyle(long handle, byte compactionStyle); + private native byte compactionStyle(long handle); + private native void setPurgeRedundantKvsWhileFlush( + long handle, boolean purgeRedundantKvsWhileFlush); + private native boolean purgeRedundantKvsWhileFlush(long handle); + private native void setVerifyChecksumsInCompaction( + long handle, boolean verifyChecksumsInCompaction); + private native boolean verifyChecksumsInCompaction(long handle); + private native void setFilterDeletes( + long handle, boolean filterDeletes); + private native boolean filterDeletes(long handle); + private native void setMaxSequentialSkipInIterations( + long handle, long maxSequentialSkipInIterations); + private native long maxSequentialSkipInIterations(long handle); + private native void setMemTableFactory(long handle, long factoryHandle); + private native String memTableFactoryName(long handle); + private native void setTableFactory(long handle, long factoryHandle); + private native String tableFactoryName(long handle); + private native void setInplaceUpdateSupport( + long handle, boolean inplaceUpdateSupport); + private native boolean inplaceUpdateSupport(long handle); + private native void setInplaceUpdateNumLocks( + long handle, long inplaceUpdateNumLocks) throws RocksDBException; + private native long inplaceUpdateNumLocks(long handle); + private native void setMemtablePrefixBloomBits( + long handle, int memtablePrefixBloomBits); + private native int memtablePrefixBloomBits(long handle); + private native void setMemtablePrefixBloomProbes( + long handle, int memtablePrefixBloomProbes); + private native int memtablePrefixBloomProbes(long handle); + private native void setBloomLocality( + long handle, int bloomLocality); + private native int bloomLocality(long handle); + private native void setMaxSuccessiveMerges( + long handle, long maxSuccessiveMerges) throws RocksDBException; + private native long maxSuccessiveMerges(long handle); + private native void setMinPartialMergeOperands( + long handle, int minPartialMergeOperands); + private native int minPartialMergeOperands(long handle); + + MemTableConfig memTableConfig_; + TableFormatConfig tableFormatConfig_; + AbstractComparator comparator_; +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 7ad1e1bf2..2d6fa08cd 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -29,6 +29,20 @@ public class Options extends RocksObject env_ = RocksEnv.getDefault(); } + /** + * Construct options for opening a RocksDB. Reusing database options + * and column family options. 
+ * + * @param dbOptions {@link org.rocksdb.DBOptions} instance + * @param columnFamilyOptions {@link org.rocksdb.ColumnFamilyOptions} + * instance + */ + public Options(DBOptions dbOptions, ColumnFamilyOptions columnFamilyOptions) { + super(); + newOptions(dbOptions.nativeHandle_, columnFamilyOptions.nativeHandle_); + env_ = RocksEnv.getDefault(); + } + @Override public Options setCreateIfMissing(boolean flag) { assert(isInitialized()); diff --git a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java new file mode 100644 index 000000000..95289a301 --- /dev/null +++ b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java @@ -0,0 +1,229 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +import java.util.Random; + +public class ColumnFamilyOptionsTest { + static { + RocksDB.loadLibrary(); + } + + public static void testCFOptions(ColumnFamilyOptionsInterface opt) { + Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + { // WriteBufferSize test + try { + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert(opt.writeBufferSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // MaxWriteBufferNumber test + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assert(opt.maxWriteBufferNumber() == intValue); + } + + { // MinWriteBufferNumberToMerge test + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assert(opt.minWriteBufferNumberToMerge() == intValue); + } + + { // NumLevels test + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assert(opt.numLevels() == intValue); + } + + { // LevelFileNumCompactionTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assert(opt.levelZeroFileNumCompactionTrigger() == intValue); + } + + { // LevelSlowdownWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assert(opt.levelZeroSlowdownWritesTrigger() == intValue); + } + + { // LevelStopWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assert(opt.levelZeroStopWritesTrigger() == intValue); + } + + { // MaxMemCompactionLevel test + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assert(opt.maxMemCompactionLevel() == intValue); + } + + { // TargetFileSizeBase test + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assert(opt.targetFileSizeBase() == longValue); + } + + { // TargetFileSizeMultiplier test + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assert(opt.targetFileSizeMultiplier() == intValue); + } + + { // MaxBytesForLevelBase test + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assert(opt.maxBytesForLevelBase() == longValue); + } + + { // MaxBytesForLevelMultiplier test + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assert(opt.maxBytesForLevelMultiplier() == intValue); + } + + { // ExpandedCompactionFactor test + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + 
assert(opt.expandedCompactionFactor() == intValue); + } + + { // SourceCompactionFactor test + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assert(opt.sourceCompactionFactor() == intValue); + } + + { // MaxGrandparentOverlapFactor test + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assert(opt.maxGrandparentOverlapFactor() == intValue); + } + + { // SoftRateLimit test + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assert(opt.softRateLimit() == doubleValue); + } + + { // HardRateLimit test + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assert(opt.hardRateLimit() == doubleValue); + } + + { // RateLimitDelayMaxMilliseconds test + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assert(opt.rateLimitDelayMaxMilliseconds() == intValue); + } + + { // ArenaBlockSize test + try { + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert(opt.arenaBlockSize() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // DisableAutoCompactions test + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assert(opt.disableAutoCompactions() == boolValue); + } + + { // PurgeRedundantKvsWhileFlush test + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assert(opt.purgeRedundantKvsWhileFlush() == boolValue); + } + + { // VerifyChecksumsInCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assert(opt.verifyChecksumsInCompaction() == boolValue); + } + + { // FilterDeletes test + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assert(opt.filterDeletes() == boolValue); + } + + { // MaxSequentialSkipInIterations test + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assert(opt.maxSequentialSkipInIterations() == longValue); + } + + { // InplaceUpdateSupport test + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assert(opt.inplaceUpdateSupport() == boolValue); + } + + { // InplaceUpdateNumLocks test + try { + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert(opt.inplaceUpdateNumLocks() == longValue); + } catch (RocksDBException e) { + assert(false); + } + } + + { // MemtablePrefixBloomBits test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assert(opt.memtablePrefixBloomBits() == intValue); + } + + { // MemtablePrefixBloomProbes test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomProbes(intValue); + assert(opt.memtablePrefixBloomProbes() == intValue); + } + + { // BloomLocality test + int intValue = rand.nextInt(); + opt.setBloomLocality(intValue); + assert(opt.bloomLocality() == intValue); + } + + { // MaxSuccessiveMerges test + try { + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert(opt.maxSuccessiveMerges() == longValue); + } catch (RocksDBException e){ + assert(false); + } + } + + { // MinPartialMergeOperands test + int intValue = rand.nextInt(); + opt.setMinPartialMergeOperands(intValue); + assert(opt.minPartialMergeOperands() == intValue); + } + } + + public static void main(String[] args) { + ColumnFamilyOptions opt = new ColumnFamilyOptions(); + testCFOptions(opt); + opt.dispose(); + System.out.println("Passed DBOptionsTest"); + } +} diff --git 
a/java/org/rocksdb/test/MixedOptionsTest.java b/java/org/rocksdb/test/MixedOptionsTest.java new file mode 100644 index 000000000..edaa2c318 --- /dev/null +++ b/java/org/rocksdb/test/MixedOptionsTest.java @@ -0,0 +1,51 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.rocksdb.*; + +public class MixedOptionsTest { + static { + RocksDB.loadLibrary(); + } + public static void main(String[] args) { + // Set a table factory and check the names + ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); + cfOptions.setTableFormatConfig(new BlockBasedTableConfig(). + setFilter(new BloomFilter())); + assert(cfOptions.tableFactoryName().equals( + "BlockBasedTable")); + cfOptions.setTableFormatConfig(new PlainTableConfig()); + assert(cfOptions.tableFactoryName().equals("PlainTable")); + // Initialize a dbOptions object from cf options and + // db options + DBOptions dbOptions = new DBOptions(); + Options options = new Options(dbOptions, cfOptions); + assert(options.tableFactoryName().equals("PlainTable")); + // Free instances + options.dispose(); + options = null; + cfOptions.dispose(); + cfOptions = null; + dbOptions.dispose(); + dbOptions = null; + System.gc(); + System.runFinalization(); + // Test Optimize for statements + cfOptions = new ColumnFamilyOptions(); + cfOptions.optimizeUniversalStyleCompaction(); + cfOptions.optimizeLevelStyleCompaction(); + cfOptions.optimizeForPointLookup(1024); + options = new Options(); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(400); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(400); + options.optimizeForPointLookup(1024); + options.prepareForBulkLoad(); + System.out.println("Mixed options test passed"); + } +} diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index ea8da6c66..defdcc304 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -6,10 +6,7 @@ package org.rocksdb.test; import java.util.Random; - -import org.rocksdb.DBOptions; import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; import org.rocksdb.Options; public class OptionsTest { @@ -23,208 +20,7 @@ public class OptionsTest { getPlatformSpecificRandomFactory(); DBOptionsTest.testDBOptions(opt); - - { // WriteBufferSize test - try { - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert(opt.writeBufferSize() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // MaxWriteBufferNumber test - int intValue = rand.nextInt(); - opt.setMaxWriteBufferNumber(intValue); - assert(opt.maxWriteBufferNumber() == intValue); - } - - { // MinWriteBufferNumberToMerge test - int intValue = rand.nextInt(); - opt.setMinWriteBufferNumberToMerge(intValue); - assert(opt.minWriteBufferNumberToMerge() == intValue); - } - - { // NumLevels test - int intValue = rand.nextInt(); - opt.setNumLevels(intValue); - assert(opt.numLevels() == intValue); - } - - { // LevelFileNumCompactionTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroFileNumCompactionTrigger(intValue); - assert(opt.levelZeroFileNumCompactionTrigger() == intValue); - } - - { // LevelSlowdownWritesTrigger test - int intValue = rand.nextInt(); 
- opt.setLevelZeroSlowdownWritesTrigger(intValue); - assert(opt.levelZeroSlowdownWritesTrigger() == intValue); - } - - { // LevelStopWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroStopWritesTrigger(intValue); - assert(opt.levelZeroStopWritesTrigger() == intValue); - } - - { // MaxMemCompactionLevel test - int intValue = rand.nextInt(); - opt.setMaxMemCompactionLevel(intValue); - assert(opt.maxMemCompactionLevel() == intValue); - } - - { // TargetFileSizeBase test - long longValue = rand.nextLong(); - opt.setTargetFileSizeBase(longValue); - assert(opt.targetFileSizeBase() == longValue); - } - - { // TargetFileSizeMultiplier test - int intValue = rand.nextInt(); - opt.setTargetFileSizeMultiplier(intValue); - assert(opt.targetFileSizeMultiplier() == intValue); - } - - { // MaxBytesForLevelBase test - long longValue = rand.nextLong(); - opt.setMaxBytesForLevelBase(longValue); - assert(opt.maxBytesForLevelBase() == longValue); - } - - { // MaxBytesForLevelMultiplier test - int intValue = rand.nextInt(); - opt.setMaxBytesForLevelMultiplier(intValue); - assert(opt.maxBytesForLevelMultiplier() == intValue); - } - - { // ExpandedCompactionFactor test - int intValue = rand.nextInt(); - opt.setExpandedCompactionFactor(intValue); - assert(opt.expandedCompactionFactor() == intValue); - } - - { // SourceCompactionFactor test - int intValue = rand.nextInt(); - opt.setSourceCompactionFactor(intValue); - assert(opt.sourceCompactionFactor() == intValue); - } - - { // MaxGrandparentOverlapFactor test - int intValue = rand.nextInt(); - opt.setMaxGrandparentOverlapFactor(intValue); - assert(opt.maxGrandparentOverlapFactor() == intValue); - } - - { // SoftRateLimit test - double doubleValue = rand.nextDouble(); - opt.setSoftRateLimit(doubleValue); - assert(opt.softRateLimit() == doubleValue); - } - - { // HardRateLimit test - double doubleValue = rand.nextDouble(); - opt.setHardRateLimit(doubleValue); - assert(opt.hardRateLimit() == doubleValue); - } - - { // RateLimitDelayMaxMilliseconds test - int intValue = rand.nextInt(); - opt.setRateLimitDelayMaxMilliseconds(intValue); - assert(opt.rateLimitDelayMaxMilliseconds() == intValue); - } - - { // ArenaBlockSize test - try { - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert(opt.arenaBlockSize() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // DisableAutoCompactions test - boolean boolValue = rand.nextBoolean(); - opt.setDisableAutoCompactions(boolValue); - assert(opt.disableAutoCompactions() == boolValue); - } - - { // PurgeRedundantKvsWhileFlush test - boolean boolValue = rand.nextBoolean(); - opt.setPurgeRedundantKvsWhileFlush(boolValue); - assert(opt.purgeRedundantKvsWhileFlush() == boolValue); - } - - { // VerifyChecksumsInCompaction test - boolean boolValue = rand.nextBoolean(); - opt.setVerifyChecksumsInCompaction(boolValue); - assert(opt.verifyChecksumsInCompaction() == boolValue); - } - - { // FilterDeletes test - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - assert(opt.filterDeletes() == boolValue); - } - - { // MaxSequentialSkipInIterations test - long longValue = rand.nextLong(); - opt.setMaxSequentialSkipInIterations(longValue); - assert(opt.maxSequentialSkipInIterations() == longValue); - } - - { // InplaceUpdateSupport test - boolean boolValue = rand.nextBoolean(); - opt.setInplaceUpdateSupport(boolValue); - assert(opt.inplaceUpdateSupport() == boolValue); - } - - { // InplaceUpdateNumLocks test - try { - long longValue = 
rand.nextLong(); - opt.setInplaceUpdateNumLocks(longValue); - assert(opt.inplaceUpdateNumLocks() == longValue); - } catch (RocksDBException e) { - assert(false); - } - } - - { // MemtablePrefixBloomBits test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assert(opt.memtablePrefixBloomBits() == intValue); - } - - { // MemtablePrefixBloomProbes test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomProbes(intValue); - assert(opt.memtablePrefixBloomProbes() == intValue); - } - - { // BloomLocality test - int intValue = rand.nextInt(); - opt.setBloomLocality(intValue); - assert(opt.bloomLocality() == intValue); - } - - { // MaxSuccessiveMerges test - try { - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert(opt.maxSuccessiveMerges() == longValue); - } catch (RocksDBException e){ - assert(false); - } - } - - { // MinPartialMergeOperands test - int intValue = rand.nextInt(); - opt.setMinPartialMergeOperands(intValue); - assert(opt.minPartialMergeOperands() == intValue); - } + ColumnFamilyOptionsTest.testCFOptions(opt); opt.dispose(); System.out.println("Passed OptionsTest"); diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index de614594f..7ce685d43 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1740,9 +1740,8 @@ void Java_org_rocksdb_Options_prepareForBulkLoad( */ void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( JNIEnv* env, jobject jobj) { - // TODO(fyrz) needs to be enabled back when ColumnFamilyOptions are available - // rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); - // rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); + rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); + rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); } /* From 75010d2084563c74fdb341efb37e61abfa158fdb Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 31 Oct 2014 23:39:14 +0100 Subject: [PATCH 462/829] [RocksJava] ColumnFamily custom Options API extension ********************* *************************** ******** ************* ******** ******** *********** ******** ******** ********* ******** ************************************* ************************************* ************************************* ****** *** *** ****** ****** *** *** *** ****** ****** *** ****** *************************** ********************* --- java/Makefile | 1 + java/org/rocksdb/ColumnFamilyDescriptor.java | 58 +++++++++++++++ java/org/rocksdb/RocksDB.java | 52 ++++++++------ java/org/rocksdb/test/ColumnFamilyTest.java | 13 ++-- java/org/rocksdb/test/KeyMayExistTest.java | 7 +- java/org/rocksdb/test/MergeTest.java | 31 +++++--- java/org/rocksdb/test/ReadOnlyTest.java | 18 +++-- java/rocksjni/options.cc | 3 +- java/rocksjni/portal.h | 28 ++++++++ java/rocksjni/rocksjni.cc | 74 ++++++++++++++------ 10 files changed, 216 insertions(+), 69 deletions(-) create mode 100644 java/org/rocksdb/ColumnFamilyDescriptor.java diff --git a/java/Makefile b/java/Makefile index ef49f3fc9..a4bee144a 100644 --- a/java/Makefile +++ b/java/Makefile @@ -5,6 +5,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.BlockBasedTableConfig\ org.rocksdb.BloomFilter\ org.rocksdb.ColumnFamilyHandle\ + org.rocksdb.ColumnFamilyOptions\ org.rocksdb.Comparator\ org.rocksdb.ComparatorOptions\ org.rocksdb.DBOptions\ diff --git a/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/org/rocksdb/ColumnFamilyDescriptor.java new file mode 100644 index 000000000..b01c0e858 --- 
/dev/null +++ b/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -0,0 +1,58 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *
<p>Describes a column family with a + * name and respective Options.</p>
            + */ +public class ColumnFamilyDescriptor { + + /** + *
<p>Creates a new Column Family using a name and default + * options,</p>
            + * + * @param columnFamilyName name of column family. + */ + public ColumnFamilyDescriptor(final String columnFamilyName){ + this(columnFamilyName, new ColumnFamilyOptions()); + } + + /** + *
<p>Creates a new Column Family using a name and custom + * options.</p>
            + * + * @param columnFamilyName name of column family. + * @param columnFamilyOptions options to be used with + * column family. + */ + public ColumnFamilyDescriptor(final String columnFamilyName, + final ColumnFamilyOptions columnFamilyOptions) { + columnFamilyName_ = columnFamilyName; + columnFamilyOptions_ = columnFamilyOptions; + } + + /** + * Retrieve name of column family. + * + * @return column family name. + */ + public String columnFamilyName() { + return columnFamilyName_; + } + + /** + * Retrieve assigned options instance. + * + * @return Options instance assigned to this instance. + */ + public ColumnFamilyOptions columnFamilyOptions() { + return columnFamilyOptions_; + } + + private final String columnFamilyName_; + private final ColumnFamilyOptions columnFamilyOptions_; +} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 8efdaea1f..4b580f81f 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -123,7 +123,7 @@ public class RocksDB extends RocksObject { *

            * * @param path the path to the rocksdb. - * @param columnFamilyNames list of column family names + * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified @@ -132,12 +132,13 @@ public class RocksDB extends RocksObject { * @throws org.rocksdb.RocksDBException * @see Options#setCreateIfMissing(boolean) */ - public static RocksDB open(String path, List columnFamilyNames, + public static RocksDB open(String path, + List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. Options options = new Options(); - return open(options, path, columnFamilyNames, columnFamilyHandles); + return open(options, path, columnFamilyDescriptors, columnFamilyHandles); } /** @@ -198,7 +199,7 @@ public class RocksDB extends RocksObject { * * @param options {@link org.rocksdb.Options} instance. * @param path the path to the rocksdb. - * @param columnFamilyNames list of column family names + * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified @@ -207,13 +208,14 @@ public class RocksDB extends RocksObject { * @throws org.rocksdb.RocksDBException * @see Options#setCreateIfMissing(boolean) */ - public static RocksDB open(Options options, String path, List columnFamilyNames, + public static RocksDB open(Options options, String path, + List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { RocksDB db = new RocksDB(); List cfReferences = db.open(options.nativeHandle_, path, - columnFamilyNames, columnFamilyNames.size()); - for (int i=0; i columnFamilyNames, + public static RocksDB openReadOnly(String path, + List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. Options options = new Options(); - return openReadOnly(options, path, columnFamilyNames, columnFamilyHandles); + return openReadOnly(options, path, columnFamilyDescriptors, + columnFamilyHandles); } /** @@ -299,7 +303,7 @@ public class RocksDB extends RocksObject { * * @param options {@link Options} instance. * @param path the path to the RocksDB. - * @param columnFamilyNames list of column family names + * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified @@ -307,15 +311,16 @@ public class RocksDB extends RocksObject { * @throws RocksDBException */ public static RocksDB openReadOnly(Options options, String path, - List columnFamilyNames, List columnFamilyHandles) + List columnFamilyDescriptors, + List columnFamilyHandles) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. 
RocksDB db = new RocksDB(); List cfReferences = db.openROnly(options.nativeHandle_, path, - columnFamilyNames, columnFamilyNames.size()); - for (int i=0; i open(long optionsHandle, String path, - List columnFamilyNames, int columnFamilyNamesLength) + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength) throws RocksDBException; protected native static List listColumnFamilies( long optionsHandle, String path) throws RocksDBException; protected native void openROnly( long optionsHandle, String path) throws RocksDBException; protected native List openROnly( - long optionsHandle, String path, List columnFamilyNames, - int columnFamilyNamesLength) throws RocksDBException; + long optionsHandle, String path, + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength) throws RocksDBException; protected native void put( long handle, byte[] key, int keyLen, byte[] value, int valueLen) throws RocksDBException; @@ -1231,8 +1239,8 @@ public class RocksDB extends RocksObject { protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); private native void disposeInternal(long handle); - private native long createColumnFamily(long handle, long opt_handle, - String name) throws RocksDBException; + private native long createColumnFamily(long handle, + ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException; private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; private native void flush(long handle, long flushOptHandle) throws RocksDBException; diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 57fd2e347..38f6df7f8 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -43,7 +43,8 @@ public class ColumnFamilyTest { // Test createColumnFamily try { - db.createColumnFamily("new_cf"); + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf", + new ColumnFamilyOptions())); } catch (RocksDBException e) { assert(false); } @@ -67,11 +68,12 @@ public class ColumnFamilyTest { } // Test open database with column family names - List cfNames = new ArrayList(); + List cfNames = + new ArrayList(); List columnFamilyHandleList = new ArrayList(); - cfNames.add("default"); - cfNames.add("new_cf"); + cfNames.add(new ColumnFamilyDescriptor("default")); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); try { db = RocksDB.open(options, db_path, cfNames, columnFamilyHandleList); @@ -100,7 +102,8 @@ public class ColumnFamilyTest { // Test create write to and drop ColumnFamily ColumnFamilyHandle tmpColumnFamilyHandle = null; try { - tmpColumnFamilyHandle = db.createColumnFamily("tmpCF"); + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF", new ColumnFamilyOptions())); db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); db.dropColumnFamily(tmpColumnFamilyHandle); tmpColumnFamilyHandle.dispose(); diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index c83a70e52..e3b4ed763 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -22,11 +22,12 @@ public class KeyMayExistTest { .setCreateMissingColumnFamilies(true); try { // open database using cf names - List cfNames = new ArrayList(); + List cfNames = + new ArrayList(); List columnFamilyHandleList = new ArrayList(); - cfNames.add("default"); - cfNames.add("new_cf"); + cfNames.add(new ColumnFamilyDescriptor("default")); + 
cfNames.add(new ColumnFamilyDescriptor("new_cf")); db = RocksDB.open(options, DB_PATH, cfNames, columnFamilyHandleList); assert(columnFamilyHandleList.size()==2); diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 9435718f8..e3c70c885 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -46,13 +46,18 @@ public class MergeTest { opt.setCreateMissingColumnFamilies(true); opt.setMergeOperatorName("stringappend"); - List cfNames = new ArrayList(); + List cfDescr = + new ArrayList(); List columnFamilyHandleList = new ArrayList(); - cfNames.add("default"); - cfNames.add("new_cf"); + cfDescr.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperatorName( + "stringappend"))); + cfDescr.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperatorName( + "stringappend"))); RocksDB db = RocksDB.open(opt, db_cf_path_string, - cfNames, columnFamilyHandleList); + cfDescr, columnFamilyHandleList); // writing aa under key db.put(columnFamilyHandleList.get(1), @@ -103,13 +108,18 @@ public class MergeTest { StringAppendOperator stringAppendOperator = new StringAppendOperator(); opt.setMergeOperator(stringAppendOperator); - List cfNames = new ArrayList(); + List cfDescr = + new ArrayList(); List columnFamilyHandleList = new ArrayList(); - cfNames.add("default"); - cfNames.add("new_cf"); + cfDescr.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); + cfDescr.add(new ColumnFamilyDescriptor("new_cf", + new ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); RocksDB db = RocksDB.open(opt, db_path_operator, - cfNames, columnFamilyHandleList); + cfDescr, columnFamilyHandleList); // writing aa under key db.put(columnFamilyHandleList.get(1), @@ -121,7 +131,10 @@ public class MergeTest { String strValue = new String(value); // Test also with createColumnFamily - ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily("new_cf2"); + ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2", + new ColumnFamilyOptions().setMergeOperator( + new StringAppendOperator()))); // writing xx under cfkey2 db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); // merge yy under cfkey2 diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java index 87e8f1e9e..21b5eb9ae 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/org/rocksdb/test/ReadOnlyTest.java @@ -34,12 +34,15 @@ public class ReadOnlyTest { db2.close(); - List cfNames = new ArrayList(); - cfNames.add("default"); + List cfNames = + new ArrayList(); + cfNames.add(new ColumnFamilyDescriptor("default")); db = RocksDB.open(DB_PATH, cfNames, columnFamilyHandleList); - columnFamilyHandleList.add(db.createColumnFamily("new_cf")); - columnFamilyHandleList.add(db.createColumnFamily("new_cf2")); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); @@ -47,9 +50,10 @@ public class ReadOnlyTest { assert(db2.get("key2".getBytes())==null); assert(db2.get(columnFamilyHandleList.get(0), "key2".getBytes())==null); - List cfNewName = new ArrayList(); - cfNewName.add("default"); - 
cfNewName.add("new_cf2"); + List cfNewName = + new ArrayList(); + cfNewName.add(new ColumnFamilyDescriptor("default")); + cfNewName.add(new ColumnFamilyDescriptor("new_cf2")); db3 = RocksDB.openReadOnly(DB_PATH, cfNewName, db3ColumnFamilyHandleList); assert(new String(db3.get(db3ColumnFamilyHandleList.get(1), "key2".getBytes())).equals("value2")); diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 7ce685d43..109930cdc 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -12,9 +12,8 @@ #include #include "include/org_rocksdb_Options.h" -//TODO(fyrz) to be commented in with options refactoring pull requests #include "include/org_rocksdb_DBOptions.h" -//#include "include/org_rocksdb_ColumnFamilyOptions.h" +#include "include/org_rocksdb_ColumnFamilyOptions.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 3a5641d46..9fdab09a4 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -159,6 +159,34 @@ class DBOptionsJni { } }; +class ColumnFamilyDescriptorJni { + public: + // Get the java class id of org.rocksdb.ColumnFamilyDescriptor + static jclass getColumnFamilyDescriptorClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyDescriptor"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get the java method id of columnFamilyName + static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getColumnFamilyDescriptorClass(env), + "columnFamilyName", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method id of columnFamilyOptions + static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getColumnFamilyDescriptorClass(env), + "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;"); + assert(mid != nullptr); + return mid; + } +}; + class ColumnFamilyOptionsJni { public: // Get the java class id of org.rocksdb.ColumnFamilyOptions. 
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 44d45a2c2..4fa1a544c 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -69,7 +69,7 @@ void Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( jobject Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Ljava_util_List_2I( JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, - jobject jcfname_list, jint jcfname_count) { + jobject jcfdesc_list, jint jcfdesc_count) { auto opt = reinterpret_cast(jopt_handle); rocksdb::DB* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); @@ -79,23 +79,34 @@ jobject std::vector column_families; std::vector handles; - // get iterator for cfnames + // get iterator for ColumnFamilyDescriptors jobject iteratorObj = env->CallObjectMethod( - jcfname_list, rocksdb::ListJni::getIteratorMethod(env)); + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); - // iterate over cfnames and convert cfnames to - // ColumnFamilyDescriptor instances + // iterate over ColumnFamilyDescriptors while (env->CallBooleanMethod( iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - jstring jstr = (jstring) env->CallObjectMethod(iteratorObj, + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env)); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + const char* cfname = env->GetStringUTFChars(jstr, 0); // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); jcfnames_for_free.push_back(jstr); column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, - *static_cast(opt))); + *cfOptions)); } rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, @@ -141,7 +152,7 @@ jobject */ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, - jobject jcfname_list, jint jcfname_count) { + jobject jcfdesc_list, jint jcfdesc_count) { auto opt = reinterpret_cast(jopt_handle); rocksdb::DB* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); @@ -151,23 +162,34 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( std::vector column_families; std::vector handles; - // get iterator for cfnames + // get iterator for ColumnFamilyDescriptors jobject iteratorObj = env->CallObjectMethod( - jcfname_list, rocksdb::ListJni::getIteratorMethod(env)); + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); - // iterate over cfnames and convert cfnames to - // ColumnFamilyDescriptor instances + // iterate over ColumnFamilyDescriptors while (env->CallBooleanMethod( iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { - jstring jstr = (jstring) env->CallObjectMethod(iteratorObj, + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env)); + // get CF Options + jobject jcf_opt_obj = 
env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + const char* cfname = env->GetStringUTFChars(jstr, 0); // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); jcfnames_for_free.push_back(jstr); column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, - *static_cast(opt))); + *cfOptions)); } rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families, @@ -1151,18 +1173,28 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( /* * Class: org_rocksdb_RocksDB * Method: createColumnFamily - * Signature: (JJLjava/lang/String;)J; + * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;)J; */ jlong Java_org_rocksdb_RocksDB_createColumnFamily( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jopt_handle, - jstring jcfname) { + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobject jcf_descriptor) { rocksdb::ColumnFamilyHandle* handle; - const char* cfname = env->GetStringUTFChars(jcfname, 0); auto db_handle = reinterpret_cast(jdb_handle); - auto opt = reinterpret_cast(jopt_handle); + + jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env)); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + const char* cfname = env->GetStringUTFChars(jstr, 0); rocksdb::Status s = db_handle->CreateColumnFamily( - *static_cast(opt), cfname, &handle); - env->ReleaseStringUTFChars(jcfname, cfname); + *cfOptions, cfname, &handle); + env->ReleaseStringUTFChars(jstr, cfname); if (s.ok()) { return reinterpret_cast(handle); From fa9cfc65f343f8530fb99712ba1e522bff067519 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 5 Nov 2014 12:50:26 +0100 Subject: [PATCH 463/829] [RocksJava] Integrated Review comments from yhchiang in D28023 --- java/org/rocksdb/RocksDB.java | 22 ++++++++++----------- java/org/rocksdb/test/ColumnFamilyTest.java | 6 +++++- java/org/rocksdb/test/KeyMayExistTest.java | 2 +- java/org/rocksdb/test/MergeTest.java | 7 ++----- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 4b580f81f..c62c2f160 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -130,14 +130,14 @@ public class RocksDB extends RocksObject { * {@link RocksDB} can not be opened. * * @throws org.rocksdb.RocksDBException - * @see Options#setCreateIfMissing(boolean) + * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(String path, List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - Options options = new Options(); + DBOptions options = new DBOptions(); return open(options, path, columnFamilyDescriptors, columnFamilyHandles); } @@ -197,7 +197,7 @@ public class RocksDB extends RocksObject { *
<p> * ColumnFamily handles are disposed when the RocksDB instance is disposed.</p>
            * - * @param options {@link org.rocksdb.Options} instance. + * @param options {@link org.rocksdb.DBOptions} instance. * @param path the path to the rocksdb. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances @@ -206,16 +206,16 @@ public class RocksDB extends RocksObject { * {@link RocksDB} can not be opened. * * @throws org.rocksdb.RocksDBException - * @see Options#setCreateIfMissing(boolean) + * @see DBOptions#setCreateIfMissing(boolean) */ - public static RocksDB open(Options options, String path, + public static RocksDB open(DBOptions options, String path, List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { RocksDB db = new RocksDB(); List cfReferences = db.open(options.nativeHandle_, path, columnFamilyDescriptors, columnFamilyDescriptors.size()); - for (int i=0; i columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - Options options = new Options(); + DBOptions options = new DBOptions(); return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles); } @@ -301,7 +301,7 @@ public class RocksDB extends RocksObject { * options instance have been closed. If user doesn't call options dispose * explicitly,then this options instance will be GC'd automatically.

            * - * @param options {@link Options} instance. + * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances @@ -310,7 +310,7 @@ public class RocksDB extends RocksObject { * {@link RocksDB} can not be opened. * @throws RocksDBException */ - public static RocksDB openReadOnly(Options options, String path, + public static RocksDB openReadOnly(DBOptions options, String path, List columnFamilyDescriptors, List columnFamilyHandles) throws RocksDBException { @@ -342,7 +342,7 @@ public class RocksDB extends RocksObject { return RocksDB.listColumnFamilies(options.nativeHandle_, path); } - private void storeOptionsInstance(Options options) { + private void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } @@ -1247,5 +1247,5 @@ public class RocksDB extends RocksObject { private native void flush(long handle, long flushOptHandle, long cfHandle) throws RocksDBException; - protected Options options_; + protected DBOptionsInterface options_; } diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 38f6df7f8..0f3ee0c25 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -22,6 +22,10 @@ public class ColumnFamilyTest { RocksDB db = null; Options options = new Options(); options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + try { db = RocksDB.open(options, db_path); } catch (RocksDBException e) { @@ -76,7 +80,7 @@ public class ColumnFamilyTest { cfNames.add(new ColumnFamilyDescriptor("new_cf")); try { - db = RocksDB.open(options, db_path, cfNames, columnFamilyHandleList); + db = RocksDB.open(dbOptions, db_path, cfNames, columnFamilyHandleList); assert(columnFamilyHandleList.size() == 2); db.put("dfkey1".getBytes(), "dfvalue".getBytes()); db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index e3b4ed763..03be46fbe 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -17,7 +17,7 @@ public class KeyMayExistTest { public static void main(String[] args){ RocksDB db; - Options options = new Options(); + DBOptions options = new DBOptions(); options.setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); try { diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index e3c70c885..d802559e1 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -7,7 +7,6 @@ package org.rocksdb.test; import java.util.List; import java.util.ArrayList; -import java.util.Collections; import org.rocksdb.*; public class MergeTest { @@ -41,10 +40,9 @@ public class MergeTest { public static void testCFStringOption() throws InterruptedException, RocksDBException { - Options opt = new Options(); + DBOptions opt = new DBOptions(); opt.setCreateIfMissing(true); opt.setCreateMissingColumnFamilies(true); - opt.setMergeOperatorName("stringappend"); List cfDescr = new ArrayList(); @@ -102,11 +100,10 @@ public class MergeTest { public static void testCFOperatorOption() throws InterruptedException, RocksDBException { - Options opt = new Options(); + DBOptions opt = new DBOptions(); opt.setCreateIfMissing(true); 
opt.setCreateMissingColumnFamilies(true); StringAppendOperator stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); List cfDescr = new ArrayList(); From 9d2ba2136168e09a8fc5ec3a54714bcbc96e3256 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 8 Nov 2014 18:58:35 +0100 Subject: [PATCH 464/829] [RocksJava] Incorporated review comments --- java/Makefile | 6 + java/RocksDBColumnFamilySample.java | 146 ++++++++++++++++++++ java/org/rocksdb/test/ColumnFamilyTest.java | 6 +- 3 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 java/RocksDBColumnFamilySample.java diff --git a/java/Makefile b/java/Makefile index a4bee144a..21066b991 100644 --- a/java/Makefile +++ b/java/Makefile @@ -70,6 +70,12 @@ sample: java @rm -rf /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni_not_found +column_family_sample: java + javac -cp $(ROCKSDB_JAR) RocksDBColumnFamilySample.java + @rm -rf /tmp/rocksdbjni + java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBColumnFamilySample /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni + test: java @rm -rf /tmp/rocksdbjni_* javac org/rocksdb/test/*.java diff --git a/java/RocksDBColumnFamilySample.java b/java/RocksDBColumnFamilySample.java new file mode 100644 index 000000000..23ff07c85 --- /dev/null +++ b/java/RocksDBColumnFamilySample.java @@ -0,0 +1,146 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +import org.rocksdb.*; + +import java.util.ArrayList; +import java.util.List; + +public class RocksDBColumnFamilySample { + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws RocksDBException { + if (args.length < 1) { + System.out.println( + "usage: RocksDBColumnFamilySample db_path"); + return; + } + String db_path = args[0]; + + System.out.println("RocksDBColumnFamilySample"); + RocksDB db = null; + DBOptions dbOptions = null; + List iterators = new ArrayList<>(); + RocksIterator iterator = null; + ColumnFamilyHandle cfHandle = null; + WriteBatch wb = null; + try { + // Setup DBOptions + dbOptions = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + // Setup ColumnFamily descriptors + List cfNames = + new ArrayList<>(); + // Default column family + cfNames.add(new ColumnFamilyDescriptor("default")); + // New column families + cfNames.add(new ColumnFamilyDescriptor("cf_green", + new ColumnFamilyOptions().setComparator( + BuiltinComparator.BYTEWISE_COMPARATOR))); + cfNames.add(new ColumnFamilyDescriptor("cf_blue", + new ColumnFamilyOptions().setComparator( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR))); + cfNames.add(new ColumnFamilyDescriptor("cf_red", + new ColumnFamilyOptions(). 
+ setMergeOperator(new StringAppendOperator()))); + + List cfHandles = + new ArrayList<>(); + db = RocksDB.open(dbOptions, + db_path, cfNames, cfHandles); + // List column families in database + System.out.println("List existent column families:"); + List cfListing = RocksDB.listColumnFamilies( + new Options(), db_path); + for (byte[] cf : cfListing) { + System.out.format(" - %s\n", new String(cf)); + } + // Bootstrapping values + System.out.println("Writing values to database."); + for (int i=0; i < cfNames.size(); i++) { + for (int j=0; j < 10; j++) { + db.put(cfHandles.get(i), + String.valueOf(j).getBytes(), + String.valueOf(j).getBytes()); + } + } + // Retrieve values using get + System.out.println("Retrieve values with get."); + for (int i=0; i < cfNames.size(); i++) { + for (int j=0; j < 10; j++) { + System.out.format(" %s", new String( + db.get(cfHandles.get(i), + String.valueOf(j).getBytes()))); + } + System.out.println(""); + } + // Add a new column family to existing database + System.out.println("Add new column family"); + cfHandle = db.createColumnFamily(new ColumnFamilyDescriptor( + "cf_temp", new ColumnFamilyOptions(). + setMergeOperator(new StringAppendOperator()))); + System.out.println("Write key/value into new column family."); + db.put(cfHandle, "key".getBytes(), "value".getBytes()); + System.out.format("Lookup 'key' retrieved value: %s\n", new String( + db.get(cfHandle, "key".getBytes()))); + // Delete key + System.out.println("Delete key/value in new column family."); + db.remove(cfHandle, "key".getBytes()); + // WriteBatch with column family + wb = new WriteBatch(); + wb.put(cfHandle, "key".getBytes(), "value".getBytes()); + wb.put(cfHandle, "key2".getBytes(), "value2".getBytes()); + wb.remove(cfHandle, "key2".getBytes()); + wb.merge(cfHandle, "key".getBytes(), "morevalues".getBytes()); + db.write(new WriteOptions(), wb); + // Retrieve a single iterator with a cf handle + System.out.println("Retrieve values using a iterator on" + + " a column family."); + iterator = db.newIterator(cfHandle); + iterator.seekToFirst(); + while(iterator.isValid()) { + System.out.format(" %s", new String( + iterator.value())); + iterator.next(); + } + System.out.println(""); + // Delete column family + System.out.println("Delete column family."); + db.dropColumnFamily(cfHandle); + // Retrieve values from cf using iterator + System.out.println("Retrieve values with iterators"); + iterators = db.newIterators(cfHandles); + assert(iterators.size() == 4); + for (RocksIterator iter : iterators) { + iter.seekToFirst(); + while(iter.isValid()) { + System.out.format(" %s", new String( + iter.value())); + iter.next(); + } + System.out.println(""); + } + } finally { + if (db != null) { + db.close(); + } + if (dbOptions != null) { + dbOptions.dispose(); + } + if (iterator != null) { + iterator.dispose(); + } + for (RocksIterator iter : iterators) { + iter.dispose(); + } + if (wb != null) { + wb.dispose(); + } + } + } +} diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 0f3ee0c25..350c4446c 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -73,9 +73,9 @@ public class ColumnFamilyTest { // Test open database with column family names List cfNames = - new ArrayList(); + new ArrayList<>(); List columnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor("default")); cfNames.add(new ColumnFamilyDescriptor("new_cf")); @@ -222,8 +222,6 @@ public 
class ColumnFamilyTest { .equals("value")); } catch (RocksDBException e) { assert(false); - } catch (IllegalArgumentException e) { - assert(false); } // Test multiget without correct number of column From 9a255b95f061c820d9e3e7f50f0542a5db6db834 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 12 Nov 2014 19:49:13 +0100 Subject: [PATCH 465/829] [RocksJava] Sample and Default value - RocksDB ColumnFamilySample adjusted to C++ sample. - DefaultColumnFamily is available now as constant in RocksDB. --- java/RocksDBColumnFamilySample.java | 153 +++++++++------------------- java/org/rocksdb/RocksDB.java | 1 + 2 files changed, 50 insertions(+), 104 deletions(-) diff --git a/java/RocksDBColumnFamilySample.java b/java/RocksDBColumnFamilySample.java index 23ff07c85..200e53a1d 100644 --- a/java/RocksDBColumnFamilySample.java +++ b/java/RocksDBColumnFamilySample.java @@ -23,121 +23,66 @@ public class RocksDBColumnFamilySample { System.out.println("RocksDBColumnFamilySample"); RocksDB db = null; - DBOptions dbOptions = null; - List iterators = new ArrayList<>(); - RocksIterator iterator = null; - ColumnFamilyHandle cfHandle = null; + Options options = null; + ColumnFamilyHandle columnFamilyHandle = null; WriteBatch wb = null; try { - // Setup DBOptions - dbOptions = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - // Setup ColumnFamily descriptors - List cfNames = - new ArrayList<>(); - // Default column family - cfNames.add(new ColumnFamilyDescriptor("default")); - // New column families - cfNames.add(new ColumnFamilyDescriptor("cf_green", - new ColumnFamilyOptions().setComparator( - BuiltinComparator.BYTEWISE_COMPARATOR))); - cfNames.add(new ColumnFamilyDescriptor("cf_blue", - new ColumnFamilyOptions().setComparator( - BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR))); - cfNames.add(new ColumnFamilyDescriptor("cf_red", - new ColumnFamilyOptions(). - setMergeOperator(new StringAppendOperator()))); + options = new Options().setCreateIfMissing(true); + db = RocksDB.open(options, db_path); + assert(db != null); - List cfHandles = - new ArrayList<>(); - db = RocksDB.open(dbOptions, - db_path, cfNames, cfHandles); - // List column families in database - System.out.println("List existent column families:"); - List cfListing = RocksDB.listColumnFamilies( - new Options(), db_path); - for (byte[] cf : cfListing) { - System.out.format(" - %s\n", new String(cf)); - } - // Bootstrapping values - System.out.println("Writing values to database."); - for (int i=0; i < cfNames.size(); i++) { - for (int j=0; j < 10; j++) { - db.put(cfHandles.get(i), - String.valueOf(j).getBytes(), - String.valueOf(j).getBytes()); - } + // create column family + columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions())); + assert(columnFamilyHandle != null); + + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); } - // Retrieve values using get - System.out.println("Retrieve values with get."); - for (int i=0; i < cfNames.size(); i++) { - for (int j=0; j < 10; j++) { - System.out.format(" %s", new String( - db.get(cfHandles.get(i), - String.valueOf(j).getBytes()))); - } - System.out.println(""); + if (db != null) { + db.close(); + db = null; } - // Add a new column family to existing database - System.out.println("Add new column family"); - cfHandle = db.createColumnFamily(new ColumnFamilyDescriptor( - "cf_temp", new ColumnFamilyOptions(). 
- setMergeOperator(new StringAppendOperator()))); - System.out.println("Write key/value into new column family."); - db.put(cfHandle, "key".getBytes(), "value".getBytes()); - System.out.format("Lookup 'key' retrieved value: %s\n", new String( - db.get(cfHandle, "key".getBytes()))); - // Delete key - System.out.println("Delete key/value in new column family."); - db.remove(cfHandle, "key".getBytes()); - // WriteBatch with column family + } + + // open DB with two column families + List columnFamilyDescriptors = new ArrayList<>(); + // have to open default column family + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions())); + // open the new one, too + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf", new ColumnFamilyOptions())); + List columnFamilyHandles = new ArrayList<>(); + try { + db = RocksDB.open(new DBOptions(), db_path, + columnFamilyDescriptors, columnFamilyHandles); + assert(db != null); + + // put and get from non-default column family + db.put(columnFamilyHandles.get(0), new WriteOptions(), + "key".getBytes(), "value".getBytes()); + String value = new String(db.get(columnFamilyHandles.get(0), + "key".getBytes())); + + // atomic write wb = new WriteBatch(); - wb.put(cfHandle, "key".getBytes(), "value".getBytes()); - wb.put(cfHandle, "key2".getBytes(), "value2".getBytes()); - wb.remove(cfHandle, "key2".getBytes()); - wb.merge(cfHandle, "key".getBytes(), "morevalues".getBytes()); + wb.put(columnFamilyHandles.get(0), "key2".getBytes(), "value2".getBytes()); + wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); + wb.remove(columnFamilyHandles.get(0), "key".getBytes()); db.write(new WriteOptions(), wb); - // Retrieve a single iterator with a cf handle - System.out.println("Retrieve values using a iterator on" + - " a column family."); - iterator = db.newIterator(cfHandle); - iterator.seekToFirst(); - while(iterator.isValid()) { - System.out.format(" %s", new String( - iterator.value())); - iterator.next(); - } - System.out.println(""); - // Delete column family - System.out.println("Delete column family."); - db.dropColumnFamily(cfHandle); - // Retrieve values from cf using iterator - System.out.println("Retrieve values with iterators"); - iterators = db.newIterators(cfHandles); - assert(iterators.size() == 4); - for (RocksIterator iter : iterators) { - iter.seekToFirst(); - while(iter.isValid()) { - System.out.format(" %s", new String( - iter.value())); - iter.next(); - } - System.out.println(""); - } + + // drop column family + db.dropColumnFamily(columnFamilyHandles.get(1)); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles){ + handle.dispose(); + } if (db != null) { db.close(); } - if (dbOptions != null) { - dbOptions.dispose(); - } - if (iterator != null) { - iterator.dispose(); - } - for (RocksIterator iter : iterators) { - iter.dispose(); - } if (wb != null) { wb.dispose(); } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index c62c2f160..690d84ec8 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -16,6 +16,7 @@ import org.rocksdb.util.Environment; * indicates sth wrong at the RocksDB library side and the call failed. 
*/ public class RocksDB extends RocksObject { + public static final String DEFAULT_COLUMN_FAMILY = "default"; public static final int NOT_FOUND = -1; private static final String[] compressionLibs_ = { "snappy", "z", "bzip2", "lz4", "lz4hc"}; From 079d942ea8be48f9d0670ee2f39514d54e606443 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 8 Nov 2014 19:19:29 +0100 Subject: [PATCH 466/829] [RocksJava] Code-cleanup + Java7 warnings removed --- java/org/rocksdb/BackupableDB.java | 2 +- java/org/rocksdb/MergeOperator.java | 2 -- java/org/rocksdb/NativeLibraryLoader.java | 6 ++---- java/org/rocksdb/PlainTableConfig.java | 2 +- java/org/rocksdb/RocksDB.java | 10 +++++----- java/org/rocksdb/RocksDBException.java | 2 -- java/org/rocksdb/RocksObject.java | 3 ++- java/org/rocksdb/Statistics.java | 3 +-- java/org/rocksdb/StatisticsCollector.java | 2 -- java/org/rocksdb/util/Environment.java | 10 +++++----- 10 files changed, 17 insertions(+), 25 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 7fa37abab..75683f587 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -104,7 +104,7 @@ public class BackupableDB extends RocksDB { super(); } - @Override protected void finalize() { + @Override protected void finalize() throws Throwable { close(); super.finalize(); } diff --git a/java/org/rocksdb/MergeOperator.java b/java/org/rocksdb/MergeOperator.java index aaf44d07c..2655e466f 100644 --- a/java/org/rocksdb/MergeOperator.java +++ b/java/org/rocksdb/MergeOperator.java @@ -5,8 +5,6 @@ package org.rocksdb; -import java.util.*; - /** * MergeOperator holds an operator to be applied when compacting * two merge operands held under the same key in order to obtain a single diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 26a26bbca..bf0196e77 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -9,12 +9,12 @@ import org.rocksdb.util.Environment; */ public class NativeLibraryLoader { private static String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); - private static String tempFilePrefix = "librocksdbjni"; private static String tempFileSuffix = "." + Environment.getJniLibraryExtension(); public static void loadLibraryFromJar(String tmpDir) throws IOException { File temp; + String tempFilePrefix = "librocksdbjni"; if(tmpDir == null || tmpDir.equals("")) temp = File.createTempFile(tempFilePrefix, tempFileSuffix); else @@ -43,9 +43,7 @@ public class NativeLibraryLoader { } finally { if(os != null) os.close(); - - if(is != null) - is.close(); + is.close(); } System.load(temp.getAbsolutePath()); diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java index 7f0d672ef..3a41bea84 100644 --- a/java/org/rocksdb/PlainTableConfig.java +++ b/java/org/rocksdb/PlainTableConfig.java @@ -131,7 +131,7 @@ public class PlainTableConfig extends TableFormatConfig { * *
<p>See linux doc Documentation/vm/hugetlbpage.txt</p>
            * - * @param hugePageTlbSize + * @param hugePageTlbSize huge page tlb size * @return the reference to the current config. */ public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) { diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 690d84ec8..c3b8072a2 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -734,7 +734,7 @@ public class RocksDB extends RocksObject { List values = multiGet( nativeHandle_, keys, keys.size()); - Map keyValueMap = new HashMap(); + Map keyValueMap = new HashMap<>(); for(int i = 0; i < values.size(); i++) { if(values.get(i) == null) { continue; @@ -774,7 +774,7 @@ public class RocksDB extends RocksObject { List values = multiGet(nativeHandle_, keys, keys.size(), columnFamilyHandleList); - Map keyValueMap = new HashMap(); + Map keyValueMap = new HashMap<>(); for(int i = 0; i < values.size(); i++) { if (values.get(i) == null) { continue; @@ -801,7 +801,7 @@ public class RocksDB extends RocksObject { List values = multiGet( nativeHandle_, opt.nativeHandle_, keys, keys.size()); - Map keyValueMap = new HashMap(); + Map keyValueMap = new HashMap<>(); for(int i = 0; i < values.size(); i++) { if(values.get(i) == null) { continue; @@ -844,7 +844,7 @@ public class RocksDB extends RocksObject { List values = multiGet(nativeHandle_, opt.nativeHandle_, keys, keys.size(), columnFamilyHandleList); - Map keyValueMap = new HashMap(); + Map keyValueMap = new HashMap<>(); for(int i = 0; i < values.size(); i++) { if(values.get(i) == null) { continue; @@ -1051,7 +1051,7 @@ public class RocksDB extends RocksObject { public List newIterators( List columnFamilyHandleList) throws RocksDBException { List iterators = - new ArrayList(columnFamilyHandleList.size()); + new ArrayList<>(columnFamilyHandleList.size()); long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList); for (int i=0; iHelper class to collect DB statistics periodically at a period specified in diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java index 7bb42ace7..c121adb17 100644 --- a/java/org/rocksdb/util/Environment.java +++ b/java/org/rocksdb/util/Environment.java @@ -5,17 +5,17 @@ public class Environment { private static String ARCH = System.getProperty("os.arch").toLowerCase(); public static boolean isWindows() { - return (OS.indexOf("win") >= 0); + return (OS.contains("win")); } public static boolean isMac() { - return (OS.indexOf("mac") >= 0); + return (OS.contains("mac")); } public static boolean isUnix() { - return (OS.indexOf("nix") >= 0 || - OS.indexOf("nux") >= 0 || - OS.indexOf("aix") >= 0); + return (OS.contains("nix") || + OS.contains("nux") || + OS.contains("aix")); } public static boolean is64Bit() { From d50c68e3a5cdb981cbe71c3cd8b8f998198bdc2d Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 8 Nov 2014 20:35:35 +0100 Subject: [PATCH 467/829] [RocksJava] JavaDoc cleanup warnings with Java8 Java8 is more restrictive than Java7 with generating JavaDocs. This commit resolves current existing Java8 warnings. 
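For context, Java 8's doclint is considerably stricter than the Java 7 javadoc tool: it flags Javadoc blocks with missing paragraph markup, undocumented @param/@return/@throws tags, and unescaped '<' or '>' characters in comments. Below is a minimal sketch of the Javadoc shape this series moves toward; the package and class names (org.rocksdb.example, DoclintExample) are illustrative only and are not part of RocksJava or of this patch.

package org.rocksdb.example;

/**
 * <p>Illustrative only: a Javadoc style that passes the Java 8
 * doclint checks this patch series is cleaning up for.</p>
 */
public final class DoclintExample {

  private final byte[] value;

  /**
   * <p>Creates an example holder for the given value.</p>
   *
   * @param value value bytes; must not be null.
   * @throws IllegalArgumentException if {@code value} is null.
   */
  public DoclintExample(final byte[] value) {
    if (value == null) {
      throw new IllegalArgumentException("value must not be null");
    }
    this.value = value;
  }

  /**
   * <p>Returns a defensive copy of the stored value, so callers cannot
   * mutate internal state. Note how a comparison such as length &gt;= 0
   * must be escaped inside Javadoc under doclint.</p>
   *
   * @return a copy of the value bytes passed at construction time.
   */
  public byte[] value() {
    return value.clone();
  }
}

The hunks below apply the same pattern, explicit <p>...</p> paragraphs plus documented @param/@return/@throws tags, to the listed RocksJava classes.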
--- java/org/rocksdb/AbstractComparator.java | 29 ++-- java/org/rocksdb/BackupableDB.java | 15 +- .../rocksdb/ColumnFamilyOptionsInterface.java | 18 ++- java/org/rocksdb/DBOptionsInterface.java | 19 ++- .../rocksdb/HashSkipListMemTableConfig.java | 2 + java/org/rocksdb/MemTableConfig.java | 5 + java/org/rocksdb/Options.java | 8 + java/org/rocksdb/RateLimiterConfig.java | 2 + java/org/rocksdb/RestoreBackupableDB.java | 18 ++- java/org/rocksdb/RocksDB.java | 142 +++++++++++++----- java/org/rocksdb/RocksEnv.java | 9 ++ java/org/rocksdb/RocksIterator.java | 6 +- java/org/rocksdb/RocksObject.java | 2 +- java/org/rocksdb/Slice.java | 42 +++--- java/org/rocksdb/StatisticsCollector.java | 1 + java/org/rocksdb/TableFormatConfig.java | 6 +- java/org/rocksdb/WriteBatch.java | 55 +++++-- java/org/rocksdb/WriteOptions.java | 7 + .../rocksdb/test/AbstractComparatorTest.java | 2 + .../rocksdb/test/PlatformRandomHelper.java | 4 + 20 files changed, 288 insertions(+), 104 deletions(-) diff --git a/java/org/rocksdb/AbstractComparator.java b/java/org/rocksdb/AbstractComparator.java index 5302f43b3..1abdb4774 100644 --- a/java/org/rocksdb/AbstractComparator.java +++ b/java/org/rocksdb/AbstractComparator.java @@ -46,15 +46,18 @@ public abstract class AbstractComparator public abstract int compare(final T a, final T b); /** - * Used to reduce the space requirements - * for internal data structures like index blocks. + *
<p>Used to reduce the space requirements + * for internal data structures like index blocks.</p>
            * - * If start < limit, you may return a new start which is a - * shorter string in [start, limit). + *
<p>If start < limit, you may return a new start which is a + * shorter string in [start, limit).</p>
            * - * Simple comparator implementations may return null if they + *
<p>Simple comparator implementations may return null if they * wish to use start unchanged. i.e., an implementation of - * this method that does nothing is correct. + * this method that does nothing is correct.</p>
            + * + * @param start String + * @param limit of type T * * @return a shorter start, or null */ @@ -63,15 +66,17 @@ public abstract class AbstractComparator } /** - * Used to reduce the space requirements - * for internal data structures like index blocks. + *
<p>Used to reduce the space requirements + * for internal data structures like index blocks.</p>
            * - * You may return a new short key (key1) where - * key1 ≥ key. + *
<p>You may return a new short key (key1) where + * key1 ≥ key.</p>
            * - * Simple comparator implementations may return null if they + *
<p>Simple comparator implementations may return null if they * wish to leave the key unchanged. i.e., an implementation of - * this method that does nothing is correct. + * this method that does nothing is correct.</p>
            + * + * @param key String * * @return a shorter key, or null */ diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 75683f587..2644fec8f 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -23,6 +23,9 @@ public class BackupableDB extends RocksDB { * @param db_path Path to store data to. The path for storing the backup should be * specified in the {@link org.rocksdb.BackupableDBOptions}. * @return BackupableDB reference to the opened database. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public static BackupableDB open( Options opt, BackupableDBOptions bopt, String db_path) @@ -45,7 +48,9 @@ public class BackupableDB extends RocksDB { * * @param flushBeforeBackup if true, then all data will be flushed * before creating backup. - * @throws org.rocksdb.RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void createNewBackup(boolean flushBeforeBackup) throws RocksDBException { @@ -56,7 +61,9 @@ public class BackupableDB extends RocksDB { * Deletes old backups, keeping latest numBackupsToKeep alive. * * @param numBackupsToKeep Number of latest backups to keep. - * @throws org.rocksdb.RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { @@ -67,7 +74,9 @@ public class BackupableDB extends RocksDB { * Deletes a specific backup. * * @param backupId of backup to delete. - * @throws org.rocksdb.RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void deleteBackup(int backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); diff --git a/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/org/rocksdb/ColumnFamilyOptionsInterface.java index fb04c249a..c1be7f294 100644 --- a/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -152,7 +152,8 @@ public interface ColumnFamilyOptionsInterface { * Default: 4MB * @param writeBufferSize the size of write buffer. * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setWriteBufferSize(long writeBufferSize) throws RocksDBException; @@ -223,6 +224,7 @@ public interface ColumnFamilyOptionsInterface { * extract the prefix given a key. * * @param n use the first n bytes of a key as its prefix. + * @return the reference to the current option. */ Object useFixedLengthPrefixExtractor(int n); @@ -415,6 +417,8 @@ public interface ColumnFamilyOptionsInterface { * and total file size for level-3 will be 2GB. * by default 'maxBytesForLevelBase' is 10MB. * + * @param maxBytesForLevelBase maximum bytes for level base. + * * @return the reference to the current option. * @see #setMaxBytesForLevelMultiplier(int) */ @@ -614,7 +618,8 @@ public interface ColumnFamilyOptionsInterface { * * @param arenaBlockSize the size of an arena block * @return the reference to the current option. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. 
*/ Object setArenaBlockSize(long arenaBlockSize) throws RocksDBException; @@ -762,7 +767,8 @@ public interface ColumnFamilyOptionsInterface { * * @param config the mem-table config. * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setMemTableConfig(MemTableConfig config) throws RocksDBException; @@ -826,7 +832,8 @@ public interface ColumnFamilyOptionsInterface { * @param inplaceUpdateNumLocks the number of locks used for * inplace updates. * @return the reference to the current option. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) throws RocksDBException; @@ -920,7 +927,8 @@ public interface ColumnFamilyOptionsInterface { * * @param maxSuccessiveMerges the maximum number of successive merges. * @return the reference to the current option. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setMaxSuccessiveMerges(long maxSuccessiveMerges) throws RocksDBException; diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java index ca65a6146..d3df483cb 100644 --- a/java/org/rocksdb/DBOptionsInterface.java +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -123,7 +123,7 @@ public interface DBOptionsInterface { * Default: 5000 * * @param maxOpenFiles the maximum number of open files. - * @return the reference to the current DBOptions. + * @return the instance of the current Object. */ Object setMaxOpenFiles(int maxOpenFiles); @@ -147,6 +147,9 @@ public interface DBOptionsInterface { *

If set to 0 (default), we will dynamically choose the WAL size limit to * be [sum of all write_buffer_size * max_write_buffer_number] * 2 * Default: 0
            + * + * @param maxTotalWalSize max total wal size. + * @return the instance of the current Object. */ Object setMaxTotalWalSize(long maxTotalWalSize); @@ -197,7 +200,7 @@ public interface DBOptionsInterface { * * @param disableDataSync a boolean flag to specify whether to * disable data sync. - * @return the reference to the current DBOptions. + * @return the instance of the current Object. */ Object setDisableDataSync(boolean disableDataSync); @@ -370,7 +373,8 @@ public interface DBOptionsInterface { * * @param maxLogFileSize the maximum size of a info log file. * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setMaxLogFileSize(long maxLogFileSize) throws RocksDBException; @@ -392,7 +396,8 @@ public interface DBOptionsInterface { * * @param logFileTimeToRoll the time interval in seconds. * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setLogFileTimeToRoll(long logFileTimeToRoll) throws RocksDBException; @@ -413,7 +418,8 @@ public interface DBOptionsInterface { * * @param keepLogFileNum the maximum number of info log files to be kept. * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setKeepLogFileNum(long keepLogFileNum) throws RocksDBException; @@ -584,7 +590,8 @@ public interface DBOptionsInterface { * * @param size the size in byte * @return the instance of the current Object. - * @throws org.rocksdb.RocksDBException + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. */ Object setManifestPreallocationSize(long size) throws RocksDBException; diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/org/rocksdb/HashSkipListMemTableConfig.java index ad2120f18..7dc598fc4 100644 --- a/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -50,6 +50,8 @@ public class HashSkipListMemTableConfig extends MemTableConfig { /** * Set the height of the skip list. Default = 4. * + * @param height height to set. + * * @return the reference to the current HashSkipListMemTableConfig. */ public HashSkipListMemTableConfig setHeight(int height) { diff --git a/java/org/rocksdb/MemTableConfig.java b/java/org/rocksdb/MemTableConfig.java index deb74f185..853d29776 100644 --- a/java/org/rocksdb/MemTableConfig.java +++ b/java/org/rocksdb/MemTableConfig.java @@ -22,6 +22,11 @@ public abstract class MemTableConfig { * that associated with the Java MemTableConfig. * * @see Options#setMemTableConfig(MemTableConfig) + * + * @return native handle address to native memory table instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
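As a small illustration of the memtable options documented above, here is a hedged sketch of wiring a HashSkipListMemTableConfig into Options; the prefix length and skip-list height are arbitrary example values, not values from the patch:

    Options options = new Options();
    options.useFixedLengthPrefixExtractor(8);   // hash-based memtables need a prefix extractor
    try {
      options.setMemTableConfig(new HashSkipListMemTableConfig().setHeight(6));
    } catch (RocksDBException e) {
      // surfaced from the native layer, e.g. on 32-Bit platforms when a size value overflows
    }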
*/ abstract protected long newMemTableFactoryHandle() throws RocksDBException; diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 2d6fa08cd..7307608af 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -61,6 +61,9 @@ public class Options extends RocksObject * Use the specified object to interact with the environment, * e.g. to read/write files, schedule background work, etc. * Default: {@link RocksEnv#getDefault()} + * + * @param env {@link RocksEnv} instance. + * @return the instance of the current Options. */ public Options setEnv(RocksEnv env) { assert(isInitialized()); @@ -69,6 +72,11 @@ public class Options extends RocksObject return this; } + /** + * Returns the set RocksEnv instance. + * + * @return {@link RocksEnv} instance set in the Options. + */ public RocksEnv getEnv() { return env_; } diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/org/rocksdb/RateLimiterConfig.java index 06f3990d0..09d1c7a04 100644 --- a/java/org/rocksdb/RateLimiterConfig.java +++ b/java/org/rocksdb/RateLimiterConfig.java @@ -16,6 +16,8 @@ public abstract class RateLimiterConfig { * with a Java RateLimiterConfig. * * @see org.rocksdb.DBOptions#setRateLimiter(long, long) + * + * @return native handle address to rate limiter instance. */ abstract protected long newRateLimiterHandle(); } diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index 207383e43..ffbc2e011 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -40,8 +40,10 @@ public class RestoreBackupableDB extends RocksObject { * @param backupId id pointing to backup * @param dbDir database directory to restore to * @param walDir directory where wal files are located - * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance - * @throws RocksDBException + * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void restoreDBFromBackup(long backupId, String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { @@ -55,7 +57,9 @@ public class RestoreBackupableDB extends RocksObject { * @param dbDir database directory to restore to * @param walDir directory where wal files are located * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance - * @throws RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void restoreDBFromLatestBackup(String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { @@ -67,7 +71,9 @@ public class RestoreBackupableDB extends RocksObject { * Deletes old backups, keeping latest numBackupsToKeep alive. * * @param numBackupsToKeep of latest backups to keep - * @throws org.rocksdb.RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { purgeOldBackups0(nativeHandle_, numBackupsToKeep); @@ -77,7 +83,9 @@ public class RestoreBackupableDB extends RocksObject { * Deletes a specific backup. * * @param backupId of backup to delete. - * @throws org.rocksdb.RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
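To make the restore path documented above concrete, a minimal sketch follows; the directory names are placeholders, and the RestoreBackupableDB(BackupableDBOptions) and RestoreOptions(boolean keepLogFiles) constructors are assumptions for illustration:

    RestoreBackupableDB restore =
        new RestoreBackupableDB(new BackupableDBOptions("/tmp/rocksdb_backup")); // placeholder path
    try {
      restore.restoreDBFromLatestBackup("/tmp/rocksdb_data", "/tmp/rocksdb_wal",
          new RestoreOptions(false));  // assumed boolean keepLogFiles constructor
    } catch (RocksDBException e) {
      // thrown if the underlying native library reports an error
    } finally {
      restore.dispose();
    }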
*/ public void deleteBackup(int backupId) throws RocksDBException { deleteBackup0(nativeHandle_, backupId); diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index c3b8072a2..5ebbc609e 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -95,7 +95,8 @@ public class RocksDB extends RocksObject { * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * - * @throws org.rocksdb.RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(String path) throws RocksDBException { @@ -130,7 +131,8 @@ public class RocksDB extends RocksObject { * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * - * @throws org.rocksdb.RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(String path, @@ -161,7 +163,9 @@ public class RocksDB extends RocksObject { * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * - * @throws org.rocksdb.RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. + * * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(Options options, String path) @@ -206,7 +210,9 @@ public class RocksDB extends RocksObject { * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * - * @throws org.rocksdb.RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. + * * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(DBOptions options, String path, @@ -231,7 +237,9 @@ public class RocksDB extends RocksObject { * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. - * @throws RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public static RocksDB openReadOnly(String path) throws RocksDBException { @@ -252,7 +260,9 @@ public class RocksDB extends RocksObject { * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. - * @throws RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public static RocksDB openReadOnly(String path, List columnFamilyDescriptors, @@ -277,7 +287,9 @@ public class RocksDB extends RocksObject { * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. - * @throws RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public static RocksDB openReadOnly(Options options, String path) throws RocksDBException { @@ -309,7 +321,9 @@ public class RocksDB extends RocksObject { * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. - * @throws RocksDBException + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
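A minimal sketch of the read-only open paths documented above; the database path is a placeholder, java.util imports are omitted, and the surrounding method is assumed to declare throws RocksDBException:

    List<byte[]> families =
        RocksDB.listColumnFamilies(new Options(), "/tmp/rocksdb_data");  // placeholder path
    for (byte[] name : families) {
      System.out.println("column family: " + new String(name));
    }
    RocksDB db = RocksDB.openReadOnly("/tmp/rocksdb_data");
    byte[] value = db.get("key".getBytes());   // null if the key is not present
    db.dispose();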
*/ public static RocksDB openReadOnly(DBOptions options, String path, List columnFamilyDescriptors, @@ -336,7 +350,8 @@ public class RocksDB extends RocksObject { * @param path Absolute path to rocksdb database * @return List<byte[]> List containing the column family names * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public static List listColumnFamilies(Options options, String path) throws RocksDBException { @@ -366,7 +381,8 @@ public class RocksDB extends RocksObject { * @param key the specified key to be inserted. * @param value the value associated with the specified key. * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void put(byte[] key, byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); @@ -383,7 +399,8 @@ public class RocksDB extends RocksObject { * * throws IllegalArgumentException if column family is not present * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) throws RocksDBException { @@ -394,10 +411,12 @@ public class RocksDB extends RocksObject { /** * Set the database entry for "key" to "value". * + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void put(WriteOptions writeOpts, byte[] key, byte[] value) throws RocksDBException { @@ -411,12 +430,14 @@ public class RocksDB extends RocksObject { * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * throws IllegalArgumentException if column family is not present * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. * @see IllegalArgumentException */ public void put(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpts, @@ -506,7 +527,8 @@ public class RocksDB extends RocksObject { * @param writeOpts WriteOptions instance * @param updates WriteBatch instance * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void write(WriteOptions writeOpts, WriteBatch updates) throws RocksDBException { @@ -517,8 +539,11 @@ public class RocksDB extends RocksObject { * Add merge operand for key/value pair. * * @param key the specified key to be merged. - * @param value the value to be nerged with the current value for + * @param value the value to be merged with the current value for * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void merge(byte[] key, byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length); @@ -529,8 +554,11 @@ public class RocksDB extends RocksObject { * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key the specified key to be merged. 
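The write(WriteOptions, WriteBatch) call documented above applies a group of updates together; a short sketch follows, with illustrative key/value bytes and an assumed surrounding method that declares throws RocksDBException:

    WriteBatch batch = new WriteBatch();
    batch.put("k1".getBytes(), "v1".getBytes());
    batch.merge("counter".getBytes(), "1".getBytes());
    batch.remove("obsolete".getBytes());
    WriteOptions writeOpts = new WriteOptions();
    db.write(writeOpts, batch);   // all three updates are applied in one write
    batch.dispose();
    writeOpts.dispose();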
- * @param value the value to be nerged with the current value for + * @param value the value to be merged with the current value for * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) throws RocksDBException { @@ -545,6 +573,9 @@ public class RocksDB extends RocksObject { * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void merge(WriteOptions writeOpts, byte[] key, byte[] value) throws RocksDBException { @@ -560,6 +591,9 @@ public class RocksDB extends RocksObject { * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void merge(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpts, byte[] key, byte[] value) @@ -580,7 +614,8 @@ public class RocksDB extends RocksObject { * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * - * @see RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public int get(byte[] key, byte[] value) throws RocksDBException { return get(nativeHandle_, key, key.length, value, value.length); @@ -600,7 +635,8 @@ public class RocksDB extends RocksObject { * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public int get(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) throws RocksDBException, IllegalArgumentException { @@ -611,6 +647,7 @@ public class RocksDB extends RocksObject { /** * Get the value associated with the specified key. * + * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified @@ -620,7 +657,8 @@ public class RocksDB extends RocksObject { * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public int get(ReadOptions opt, byte[] key, byte[] value) throws RocksDBException { @@ -632,6 +670,7 @@ public class RocksDB extends RocksObject { * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance + * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified @@ -641,7 +680,8 @@ public class RocksDB extends RocksObject { * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public int get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, byte[] key, byte[] value) throws RocksDBException { @@ -658,7 +698,8 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. 
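The int-returning get() documented above fills a caller-supplied buffer; a brief sketch of checking its result (the buffer size is an arbitrary example):

    byte[] buffer = new byte[32];
    int valueLength = db.get("key".getBytes(), buffer);
    if (valueLength == RocksDB.NOT_FOUND) {
      // the key is not present
    } else if (valueLength > buffer.length) {
      // buffer too small: only a partial value was copied, valueLength is the full value size
    }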
* - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public byte[] get(byte[] key) throws RocksDBException { return get(nativeHandle_, key, key.length); @@ -675,7 +716,8 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public byte[] get(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException { @@ -692,7 +734,8 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, key.length); @@ -710,7 +753,8 @@ public class RocksDB extends RocksObject { * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public byte[] get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, byte[] key) throws RocksDBException { @@ -725,7 +769,8 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public Map multiGet(List keys) throws RocksDBException { @@ -759,8 +804,10 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @throws RocksDBException - * @throws IllegalArgumentException + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. */ public Map multiGet(List columnFamilyHandleList, List keys) throws RocksDBException, IllegalArgumentException { @@ -792,7 +839,8 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public Map multiGet(ReadOptions opt, List keys) throws RocksDBException { @@ -827,8 +875,10 @@ public class RocksDB extends RocksObject { * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * - * @throws RocksDBException - * @throws java.lang.IllegalArgumentException + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. 
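For the multiGet() overloads documented above, the column-family list and the key list must line up one-to-one, otherwise IllegalArgumentException is thrown; a rough sketch, where the handles are assumed to have been obtained when the database was opened:

    List<ColumnFamilyHandle> handles = Arrays.asList(defaultHandle, usersHandle); // assumed handles
    List<byte[]> keys = Arrays.asList("k1".getBytes(), "k2".getBytes());
    Map<byte[], byte[]> values = db.multiGet(handles, keys);  // lists must be the same size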
*/ public Map multiGet(ReadOptions opt, List columnFamilyHandleList, List keys) @@ -862,7 +912,8 @@ public class RocksDB extends RocksObject { * * @param key Key to delete within database * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void remove(byte[] key) throws RocksDBException { remove(nativeHandle_, key, key.length); @@ -877,7 +928,8 @@ public class RocksDB extends RocksObject { * instance * @param key Key to delete within database * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) throws RocksDBException { @@ -892,7 +944,8 @@ public class RocksDB extends RocksObject { * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void remove(WriteOptions writeOpt, byte[] key) throws RocksDBException { @@ -909,7 +962,8 @@ public class RocksDB extends RocksObject { * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void remove(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpt, byte[] key) throws RocksDBException { @@ -940,7 +994,8 @@ public class RocksDB extends RocksObject { * @param property to be fetched. See above for examples * @return property value * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public String getProperty(ColumnFamilyHandle columnFamilyHandle, String property) throws RocksDBException { @@ -967,7 +1022,8 @@ public class RocksDB extends RocksObject { * @param property to be fetched. See above for examples * @return property value * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public String getProperty(String property) throws RocksDBException { return getProperty0(nativeHandle_, property, property.length()); @@ -997,7 +1053,7 @@ public class RocksDB extends RocksObject { *

            nullptr will be returned if the DB fails to take a snapshot or does * not support snapshot.

            * - * @return Snapshot + * @return Snapshot {@link Snapshot} instance */ public Snapshot getSnapshot() { long snapshotHandle = getSnapshot(nativeHandle_); @@ -1046,7 +1102,8 @@ public class RocksDB extends RocksObject { * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} * instances * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public List newIterators( List columnFamilyHandleList) throws RocksDBException { @@ -1066,8 +1123,10 @@ public class RocksDB extends RocksObject { * The ColumnFamilyHandle is automatically disposed with DB disposal. * * @param columnFamilyDescriptor column family to be created. - * @return {@link org.rocksdb.ColumnFamilyHandle} instance - * @see RocksDBException + * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public ColumnFamilyHandle createColumnFamily( ColumnFamilyDescriptor columnFamilyDescriptor) @@ -1084,7 +1143,8 @@ public class RocksDB extends RocksObject { * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * - * @throws RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void dropColumnFamily(ColumnFamilyHandle columnFamilyHandle) throws RocksDBException, IllegalArgumentException { diff --git a/java/org/rocksdb/RocksEnv.java b/java/org/rocksdb/RocksEnv.java index a9b01ab72..5bbf4fb3d 100644 --- a/java/org/rocksdb/RocksEnv.java +++ b/java/org/rocksdb/RocksEnv.java @@ -29,6 +29,8 @@ public class RocksEnv extends RocksObject { * belongs to rocksdb c++. As a result, the returned RocksEnv will not * have the ownership of its c++ resource, and calling its dispose() * will be no-op.
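Tying the column-family and iterator calls documented above together, a hedged sketch follows; the family name and the String-based ColumnFamilyDescriptor constructor are assumptions for illustration only:

    ColumnFamilyHandle usersHandle =
        db.createColumnFamily(new ColumnFamilyDescriptor("users"));  // assumed constructor
    List<RocksIterator> iterators = db.newIterators(Arrays.asList(usersHandle));
    RocksIterator it = iterators.get(0);
    for (it.seekToFirst(); it.isValid(); it.next()) {
      // it.key() / it.value() expose the current entry
    }
    it.dispose();
    db.dropColumnFamily(usersHandle);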

            + * + * @return the default {@link org.rocksdb.RocksEnv} instance. */ public static RocksEnv getDefault() { return default_env_; @@ -38,6 +40,10 @@ public class RocksEnv extends RocksObject { *

Sets the number of background worker threads of the flush pool * for this environment. * Default number: 1
            + * + * @param num the number of threads + * + * @return current {@link org.rocksdb.RocksEnv} instance. */ public RocksEnv setBackgroundThreads(int num) { return setBackgroundThreads(num, FLUSH_POOL); @@ -52,6 +58,7 @@ public class RocksEnv extends RocksObject { * FLUSH_POOL or COMPACTION_POOL. * *

            Default number: 1

            + * @return current {@link org.rocksdb.RocksEnv} instance. */ public RocksEnv setBackgroundThreads(int num, int poolID) { setBackgroundThreads(nativeHandle_, num, poolID); @@ -66,6 +73,8 @@ public class RocksEnv extends RocksObject { * * @param poolID the id to specified a thread pool. Should be either * FLUSH_POOL or COMPACTION_POOL. + * + * @return the thread pool queue length. */ public int getThreadPoolQueueLen(int poolID) { return getThreadPoolQueueLen(nativeHandle_, poolID); diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index acfdd3b8c..fee3f459d 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -112,6 +112,9 @@ public class RocksIterator extends RocksObject { *

            Position at the first key in the source that at or past target * The iterator is valid after this call iff the source contains * an entry that comes at or past target.

            + * + * @param target byte array describing a key or a + * key prefix to seek for. */ public void seek(byte[] target) { assert(isInitialized()); @@ -123,7 +126,8 @@ public class RocksIterator extends RocksObject { * If non-blocking IO is requested and this operation cannot be * satisfied without doing some IO, then this returns Status::Incomplete(). * - * @throws org.rocksdb.RocksDBException + * @throws RocksDBException thrown if error happens in underlying + * native library. */ public void status() throws RocksDBException { assert(isInitialized()); diff --git a/java/org/rocksdb/RocksObject.java b/java/org/rocksdb/RocksObject.java index 51b7fb890..6e24a1385 100644 --- a/java/org/rocksdb/RocksObject.java +++ b/java/org/rocksdb/RocksObject.java @@ -37,7 +37,7 @@ public abstract class RocksObject { * small in that they seems to only hold a long variable. As a result, * they might have low priority in the GC process. To prevent this, * it is suggested to call {@code dispose()} manually. - *

            + *

            *

            * Note that once an instance of {@code RocksObject} has been disposed, * calling its function will lead undefined behavior. diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java index fe5d8d49d..0dfa12ee7 100644 --- a/java/org/rocksdb/Slice.java +++ b/java/org/rocksdb/Slice.java @@ -6,26 +6,26 @@ package org.rocksdb; /** - * Base class for slices which will receive - * byte[] based access to the underlying data. + *

            Base class for slices which will receive + * byte[] based access to the underlying data.

            * - * byte[] backed slices typically perform better with + *

            byte[] backed slices typically perform better with * small keys and values. When using larger keys and - * values consider using @see org.rocksdb.DirectSlice + * values consider using {@link org.rocksdb.DirectSlice}

            */ public class Slice extends AbstractSlice { /** - * Called from JNI to construct a new Java Slice + *

            Called from JNI to construct a new Java Slice * without an underlying C++ object set - * at creation time. + * at creation time.

            * - * Note: You should be aware that + *

            Note: You should be aware that * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally * called from the default Slice constructor, and that it is marked as * private. This is so that developers cannot construct their own default * Slice objects (at present). As developers cannot construct their own * Slice objects through this, they are not creating underlying C++ Slice - * objects, and so there is nothing to free (dispose) from Java. + * objects, and so there is nothing to free (dispose) from Java.

            */ private Slice() { super(); @@ -33,9 +33,10 @@ public class Slice extends AbstractSlice { } /** - * Constructs a slice - * where the data is taken from - * a String. + *

            Constructs a slice where the data is taken from + * a String.

            + * + * @param str String value. */ public Slice(final String str) { super(); @@ -43,9 +44,11 @@ public class Slice extends AbstractSlice { } /** - * Constructs a slice - * where the data is a copy of - * the byte array from a specific offset. + *

            Constructs a slice where the data is a copy of + * the byte array from a specific offset.

            + * + * @param data byte array. + * @param offset offset within the byte array. */ public Slice(final byte[] data, final int offset) { super(); @@ -53,9 +56,10 @@ public class Slice extends AbstractSlice { } /** - * Constructs a slice - * where the data is a copy of - * the byte array. + *

            Constructs a slice where the data is a copy of + * the byte array.

            + * + * @param data byte array. */ public Slice(final byte[] data) { super(); @@ -63,8 +67,8 @@ public class Slice extends AbstractSlice { } /** - * Deletes underlying C++ slice pointer - * and any buffered data. + *

            Deletes underlying C++ slice pointer + * and any buffered data.

            * *

            * Note that this function should be called only after all diff --git a/java/org/rocksdb/StatisticsCollector.java b/java/org/rocksdb/StatisticsCollector.java index dc83e7c88..be8f26a14 100644 --- a/java/org/rocksdb/StatisticsCollector.java +++ b/java/org/rocksdb/StatisticsCollector.java @@ -49,6 +49,7 @@ public class StatisticsCollector { * * @param shutdownTimeout Time in milli-seconds to wait for shutdown before * killing the collection process. + * @throws java.lang.InterruptedException thrown if Threads are interrupted. */ public void shutDown(int shutdownTimeout) throws InterruptedException { _isRunning = false; diff --git a/java/org/rocksdb/TableFormatConfig.java b/java/org/rocksdb/TableFormatConfig.java index e5c63411f..58a533b22 100644 --- a/java/org/rocksdb/TableFormatConfig.java +++ b/java/org/rocksdb/TableFormatConfig.java @@ -12,9 +12,11 @@ package org.rocksdb; */ public abstract class TableFormatConfig { /** - * This function should only be called by Options.setTableFormatConfig(), + *

            This function should only be called by Options.setTableFormatConfig(), * which will create a c++ shared-pointer to the c++ TableFactory - * that associated with the Java TableFormatConfig. + * that associated with the Java TableFormatConfig.

            + * + * @return native handle address to native table instance. */ abstract protected long newTableFactoryHandle(); } diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 19984b16c..5bd1119da 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -23,11 +23,19 @@ package org.rocksdb; * external synchronization. */ public class WriteBatch extends RocksObject { + /** + * Constructs a WriteBatch instance. + */ public WriteBatch() { super(); newWriteBatch(0); } + /** + * Constructs a WriteBatch instance with a given size. + * + * @param reserved_bytes reserved size for WriteBatch + */ public WriteBatch(int reserved_bytes) { nativeHandle_ = 0; newWriteBatch(reserved_bytes); @@ -35,19 +43,29 @@ public class WriteBatch extends RocksObject { /** * Returns the number of updates in the batch. + * + * @return number of items in WriteBatch */ public native int count(); /** - * Store the mapping "key->value" in the database. + *

            Store the mapping "key->value" in the database.

            + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. */ public void put(byte[] key, byte[] value) { put(key, key.length, value, value.length); } /** - * Store the mapping "key->value" within given column - * family. + *

            Store the mapping "key->value" within given column + * family.

            + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. */ public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { @@ -56,16 +74,25 @@ public class WriteBatch extends RocksObject { } /** - * Merge "value" with the existing value of "key" in the database. - * "key->merge(existing, value)" + *

            Merge "value" with the existing value of "key" in the database. + * "key->merge(existing, value)"

            + * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. */ public void merge(byte[] key, byte[] value) { merge(key, key.length, value, value.length); } /** - * Merge "value" with the existing value of "key" in given column family. - * "key->merge(existing, value)" + *

            Merge "value" with the existing value of "key" in given column family. + * "key->merge(existing, value)"

            + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. */ public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { @@ -74,14 +101,19 @@ public class WriteBatch extends RocksObject { } /** - * If the database contains a mapping for "key", erase it. Else do nothing. + *

            If the database contains a mapping for "key", erase it. Else do nothing.

            + * + * @param key Key to delete within database */ public void remove(byte[] key) { remove(key, key.length); } /** - * If column family contains a mapping for "key", erase it. Else do nothing. + *

            If column family contains a mapping for "key", erase it. Else do nothing.

            + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database */ public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { remove(key, key.length, columnFamilyHandle.nativeHandle_); @@ -98,6 +130,8 @@ public class WriteBatch extends RocksObject { * * Example application: add timestamps to the transaction log for use in * replication. + * + * @param blob binary object to be inserted */ public void putLogData(byte[] blob) { putLogData(blob, blob.length); @@ -166,6 +200,9 @@ public class WriteBatch extends RocksObject { * iteration is halted. Otherwise, it continues * iterating. The default implementation always * returns true. + * + * @return boolean value indicating if the + * iteration is halted. */ public boolean shouldContinue() { return true; diff --git a/java/org/rocksdb/WriteOptions.java b/java/org/rocksdb/WriteOptions.java index d26dbb918..6d7ea4c30 100644 --- a/java/org/rocksdb/WriteOptions.java +++ b/java/org/rocksdb/WriteOptions.java @@ -12,6 +12,9 @@ package org.rocksdb; * c++ side memory before a WriteOptions instance runs out of scope. */ public class WriteOptions extends RocksObject { + /** + * Construct WriteOptions instance. + */ public WriteOptions() { super(); newWriteOptions(); @@ -64,6 +67,8 @@ public class WriteOptions extends RocksObject { * crash semantics as the "write()" system call. A DB write * with sync==true has similar crash semantics to a "write()" * system call followed by "fdatasync()". + * + * @return boolean value indicating if sync is active. */ public boolean sync() { return sync(nativeHandle_); @@ -85,6 +90,8 @@ public class WriteOptions extends RocksObject { /** * If true, writes will not first go to the write ahead log, * and the write may got lost after a crash. + * + * @return boolean value indicating if WAL is disabled. */ public boolean disableWAL() { return disableWAL(nativeHandle_); diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index 7f4c47fb3..339615b45 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -36,6 +36,8 @@ public abstract class AbstractComparatorTest { * * @param db_path A path where we can store database * files temporarily + * + * @throws java.io.IOException if IO error happens. */ public void testRoundtrip(final Path db_path) throws IOException { diff --git a/java/org/rocksdb/test/PlatformRandomHelper.java b/java/org/rocksdb/test/PlatformRandomHelper.java index c729c3dc1..7112fc4f1 100644 --- a/java/org/rocksdb/test/PlatformRandomHelper.java +++ b/java/org/rocksdb/test/PlatformRandomHelper.java @@ -14,6 +14,8 @@ import java.util.Random; public class PlatformRandomHelper { /** * Determine if OS is 32-Bit/64-Bit + * + * @return boolean value indicating if operating system is 64 Bit. */ public static boolean isOs64Bit(){ boolean is64Bit = false; @@ -27,6 +29,8 @@ public class PlatformRandomHelper { /** * Factory to get a platform specific Random instance + * + * @return {@link java.util.Random} instance. */ public static Random getPlatformSpecificRandomFactory(){ if (isOs64Bit()) { From 31b02dc21d15393906e9437bc2bc95ef451a41dd Mon Sep 17 00:00:00 2001 From: Hasnain Lakhani Date: Thu, 13 Nov 2014 09:51:41 -0800 Subject: [PATCH 468/829] Improve Backup Engine. Summary: Improve the backup engine by not deleting the corrupted backup when it is detected; instead leaving it to the client to delete the corrupted backup. 
Also add a BackupEngine::Open() call. Test Plan: Add check to CorruptionTest inside backupable_db_test to check that the corrupt backups are not deleted. The previous version of the code failed this test as backups were deleted, but after the changes in this commit, this test passes. Run make check to ensure that no other tests fail. Reviewers: sdong, benj, sanketh, sumeet, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28521 --- HISTORY.md | 3 + include/rocksdb/utilities/backupable_db.h | 27 ++- utilities/backupable/backupable_db.cc | 251 ++++++++++++--------- utilities/backupable/backupable_db_test.cc | 28 +++ 4 files changed, 200 insertions(+), 109 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 4182e1dd5..00a3dcbc9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,9 @@ ## Unreleased +* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. +* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups. +* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups. ## 3.7.0 (11/6/2014) ### Public API changes diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 57a8accdf..4b4ba6079 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -177,6 +177,8 @@ class BackupEngineReadOnly { // You can GetBackupInfo safely, even with other BackupEngine performing // backups on the same directory virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) = 0; // Restoring DB from backup is NOT safe when there is another BackupEngine // running that might call DeleteBackup() or PurgeOldBackups(). 
It is caller's @@ -196,7 +198,12 @@ class BackupEngine { virtual ~BackupEngine() {} static BackupEngine* NewBackupEngine(Env* db_env, - const BackupableDBOptions& options); + const BackupableDBOptions& options) + __attribute__((deprecated("Please use Open() instead"))); + + static Status Open(Env* db_env, + const BackupableDBOptions& options, + BackupEngine** backup_engine_ptr); virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0; virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; @@ -204,12 +211,16 @@ class BackupEngine { virtual void StopBackup() = 0; virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) = 0; virtual Status RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; virtual Status RestoreDBFromLatestBackup( const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; + + virtual Status GarbageCollect() = 0; }; // Stack your DB with BackupableDB to be able to backup the DB @@ -228,6 +239,8 @@ class BackupableDB : public StackableDB { Status CreateNewBackup(bool flush_before_backup = false); // Returns info about backups in backup_info void GetBackupInfo(std::vector* backup_info); + // Returns info about corrupt backups in corrupt_backups + void GetCorruptedBackups(std::vector* corrupt_backup_ids); // deletes old backups, keeping latest num_backups_to_keep alive Status PurgeOldBackups(uint32_t num_backups_to_keep); // deletes a specific backup @@ -241,6 +254,11 @@ class BackupableDB : public StackableDB { // next time you create BackupableDB or RestoreBackupableDB. void StopBackup(); + // Will delete all the files we don't need anymore + // It will do the full scan of the files/ directory and delete all the + // files that are not referenced. + Status GarbageCollect(); + private: BackupEngine* backup_engine_; }; @@ -253,6 +271,8 @@ class RestoreBackupableDB { // Returns info about backups in backup_info void GetBackupInfo(std::vector* backup_info); + // Returns info about corrupt backups in corrupt_backups + void GetCorruptedBackups(std::vector* corrupt_backup_ids); // restore from backup with backup_id // IMPORTANT -- if options_.share_table_files == true and you restore DB @@ -279,6 +299,11 @@ class RestoreBackupableDB { // deletes a specific backup Status DeleteBackup(BackupID backup_id); + // Will delete all the files we don't need anymore + // It will do the full scan of the files/ directory and delete all the + // files that are not referenced. 
+ Status GarbageCollect(); + private: BackupEngine* backup_engine_; }; diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 98fd0e34c..269e9e9f1 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -121,8 +121,10 @@ class BackupEngineImpl : public BackupEngine { void StopBackup() { stop_backup_.store(true, std::memory_order_release); } + Status GarbageCollect(); void GetBackupInfo(std::vector* backup_info); + void GetCorruptedBackups(std::vector* corrupt_backup_ids); Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = @@ -285,16 +287,11 @@ class BackupEngineImpl : public BackupEngine { uint64_t size_limit, uint32_t* checksum_value); - // Will delete all the files we don't need anymore - // If full_scan == true, it will do the full scan of files/ directory - // and delete all the files that are not referenced from backuped_file_infos__ - void GarbageCollection(bool full_scan); - // backup state data BackupID latest_backup_id_; std::map backups_; + std::map > corrupt_backups_; std::unordered_map backuped_file_infos_; - std::vector obsolete_backups_; std::atomic stop_backup_; // options data @@ -319,6 +316,13 @@ BackupEngine* BackupEngine::NewBackupEngine( return new BackupEngineImpl(db_env, options); } +Status BackupEngine::Open(Env* env, + const BackupableDBOptions& options, + BackupEngine** backup_engine_ptr) { + *backup_engine_ptr = new BackupEngineImpl(env, options); + return Status::OK(); +} + BackupEngineImpl::BackupEngineImpl(Env* db_env, const BackupableDBOptions& options, bool read_only) @@ -377,14 +381,10 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, if (options_.destroy_old_data) { // Destory old data assert(!read_only_); - for (auto& backup : backups_) { - backup.second.Delete(); - obsolete_backups_.push_back(backup.first); - } - backups_.clear(); + PurgeOldBackups(0); + (void) GarbageCollect(); // start from beginning latest_backup_id_ = 0; - // GarbageCollection() will do the actual deletion } else { // Load data from storage // load the backups if any for (auto& backup : backups_) { @@ -392,16 +392,13 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, if (!s.ok()) { Log(options_.info_log, "Backup %u corrupted -- %s", backup.first, s.ToString().c_str()); - if (!read_only_) { - Log(options_.info_log, "-> Deleting backup %u", backup.first); - } - backup.second.Delete(!read_only_); - obsolete_backups_.push_back(backup.first); + corrupt_backups_.insert(std::make_pair( + backup.first, std::make_pair(s, backup.second))); } } - // delete obsolete backups from the structure - for (auto ob : obsolete_backups_) { - backups_.erase(ob); + + for (auto corrupt : corrupt_backups_) { + backups_.erase(backups_.find(corrupt.first)); } Status s = GetLatestBackupFileContents(&latest_backup_id_); @@ -417,16 +414,17 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, } // delete any backups that claim to be later than latest - for (auto itr = backups_.upper_bound(latest_backup_id_); - itr != backups_.end();) { - itr->second.Delete(); - obsolete_backups_.push_back(itr->first); - itr = backups_.erase(itr); + std::vector later_ids; + for (auto itr = backups_.lower_bound(latest_backup_id_ + 1); + itr != backups_.end(); itr++) { + later_ids.push_back(itr->first); + } + for (auto id : later_ids) { + DeleteBackup(id); } if (!read_only_) { PutLatestBackupFileContents(latest_backup_id_); // Ignore errors - 
GarbageCollection(true); } Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.", latest_backup_id_); @@ -575,7 +573,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { Log(options_.info_log, "Backup Statistics %s\n", backup_statistics_.ToString().c_str()); backups_.erase(new_backup_id); - GarbageCollection(true); + (void) GarbageCollect(); return s; } @@ -601,13 +599,15 @@ Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { assert(!read_only_); Log(options_.info_log, "Purging old backups, keeping %u", num_backups_to_keep); - while (num_backups_to_keep < backups_.size()) { - Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); - backups_.begin()->second.Delete(); - obsolete_backups_.push_back(backups_.begin()->first); - backups_.erase(backups_.begin()); + std::vector to_delete; + auto itr = backups_.begin(); + while ((backups_.size() - to_delete.size()) > num_backups_to_keep) { + to_delete.push_back(itr->first); + itr++; + } + for (auto backup_id : to_delete) { + DeleteBackup(backup_id); } - GarbageCollection(false); return Status::OK(); } @@ -615,13 +615,37 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { assert(!read_only_); Log(options_.info_log, "Deleting backup %u", backup_id); auto backup = backups_.find(backup_id); - if (backup == backups_.end()) { - return Status::NotFound("Backup not found"); + if (backup != backups_.end()) { + backup->second.Delete(); + backups_.erase(backup); + } else { + auto corrupt = corrupt_backups_.find(backup_id); + if (corrupt == corrupt_backups_.end()) { + return Status::NotFound("Backup not found"); + } + corrupt->second.second.Delete(); + corrupt_backups_.erase(corrupt); + } + + std::vector to_delete; + for (auto& itr : backuped_file_infos_) { + if (itr.second.refs == 0) { + Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_infos_.erase(td); } - backup->second.Delete(); - obsolete_backups_.push_back(backup_id); - backups_.erase(backup); - GarbageCollection(false); + + // take care of private dirs -- GarbageCollect() will take care of them + // if they are not empty + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); return Status::OK(); } @@ -636,9 +660,22 @@ void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { } } +void +BackupEngineImpl::GetCorruptedBackups( + std::vector* corrupt_backup_ids) { + corrupt_backup_ids->reserve(corrupt_backups_.size()); + for (auto& backup : corrupt_backups_) { + corrupt_backup_ids->push_back(backup.first); + } +} + Status BackupEngineImpl::RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options) { + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } auto backup_itr = backups_.find(backup_id); if (backup_itr == backups_.end()) { return Status::NotFound("Backup not found"); @@ -1005,83 +1042,59 @@ void BackupEngineImpl::DeleteChildren(const std::string& dir, } } -void BackupEngineImpl::GarbageCollection(bool full_scan) { +Status BackupEngineImpl::GarbageCollect() { 
assert(!read_only_); Log(options_.info_log, "Starting garbage collection"); - std::vector to_delete; - for (auto& itr : backuped_file_infos_) { - if (itr.second.refs == 0) { - Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); - Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), - s.ToString().c_str()); - to_delete.push_back(itr.first); - } - } - for (auto& td : to_delete) { - backuped_file_infos_.erase(td); - } - if (!full_scan) { - // take care of private dirs -- if full_scan == true, then full_scan will - // take care of them - for (auto backup_id : obsolete_backups_) { - std::string private_dir = GetPrivateFileRel(backup_id); - Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); - Log(options_.info_log, "Deleting private dir %s -- %s", - private_dir.c_str(), s.ToString().c_str()); - } - } - obsolete_backups_.clear(); - - if (full_scan) { - Log(options_.info_log, "Starting full scan garbage collection"); - // delete obsolete shared files - std::vector shared_children; - backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), - &shared_children); - for (auto& child : shared_children) { - std::string rel_fname = GetSharedFileRel(child); - // if it's not refcounted, delete it - if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) { - // this might be a directory, but DeleteFile will just fail in that - // case, so we're good - Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); - if (s.ok()) { - Log(options_.info_log, "Deleted %s", rel_fname.c_str()); - } + + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); } } + } - // delete obsolete private files - std::vector private_children; - backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), - &private_children); - for (auto& child : private_children) { - BackupID backup_id = 0; - bool tmp_dir = child.find(".tmp") != std::string::npos; - sscanf(child.c_str(), "%u", &backup_id); - if (!tmp_dir && // if it's tmp_dir, delete it - (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { - // it's either not a number or it's still alive. 
continue - continue; - } - // here we have to delete the dir and all its children - std::string full_private_path = - GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); - std::vector subchildren; - backup_env_->GetChildren(full_private_path, &subchildren); - for (auto& subchild : subchildren) { - Status s = backup_env_->DeleteFile(full_private_path + subchild); - if (s.ok()) { - Log(options_.info_log, "Deleted %s", - (full_private_path + subchild).c_str()); - } + // delete obsolete private files + std::vector private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; + sscanf(child.c_str(), "%u", &backup_id); + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { + // it's either not a number or it's still alive. continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); } - // finally delete the private dir - Status s = backup_env_->DeleteDir(full_private_path); - Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), - s.ToString().c_str()); } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); } + + return Status::OK(); } // ------- BackupMeta class -------- @@ -1257,6 +1270,10 @@ class BackupEngineReadOnlyImpl : public BackupEngineReadOnly { backup_engine_->GetBackupInfo(backup_info); } + virtual void GetCorruptedBackups(std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); + } + virtual Status RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) { @@ -1302,6 +1319,11 @@ void BackupableDB::GetBackupInfo(std::vector* backup_info) { backup_engine_->GetBackupInfo(backup_info); } +void +BackupableDB::GetCorruptedBackups(std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); +} + Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { return backup_engine_->PurgeOldBackups(num_backups_to_keep); } @@ -1314,6 +1336,10 @@ void BackupableDB::StopBackup() { backup_engine_->StopBackup(); } +Status BackupableDB::GarbageCollect() { + return backup_engine_->GarbageCollect(); +} + // --- RestoreBackupableDB methods ------ RestoreBackupableDB::RestoreBackupableDB(Env* db_env, @@ -1329,6 +1355,11 @@ RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { backup_engine_->GetBackupInfo(backup_info); } +void RestoreBackupableDB::GetCorruptedBackups( + std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); +} + Status RestoreBackupableDB::RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options) { @@ -1351,6 +1382,10 @@ Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { return 
backup_engine_->DeleteBackup(backup_id); } +Status RestoreBackupableDB::GarbageCollect() { + return backup_engine_->GarbageCollect(); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 281837773..7ca5acad8 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -646,6 +646,32 @@ TEST(BackupableDBTest, CorruptionsTest) { ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); CloseBackupableDB(); AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5); + + // make sure that no corrupt backups have actually been deleted! + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/1")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/1")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5")); + + // delete the corrupt backups and then make sure they're actually deleted + OpenBackupableDB(); + ASSERT_OK(db_->DeleteBackup(5)); + ASSERT_OK(db_->DeleteBackup(4)); + ASSERT_OK(db_->DeleteBackup(3)); + (void) db_->GarbageCollect(); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3") == false); + CloseBackupableDB(); } // open DB, write, close DB, backup, restore, repeat @@ -867,6 +893,8 @@ TEST(BackupableDBTest, DeleteTmpFiles) { file_manager_->WriteToFile(private_tmp_file, "tmp"); ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); OpenBackupableDB(); + // Need to call this explicitly to delete tmp files + (void) db_->GarbageCollect(); CloseBackupableDB(); ASSERT_EQ(false, file_manager_->FileExists(shared_tmp)); ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); From fa50abb72615789647103f3d9220878c2125aaaa Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 12 Nov 2014 14:19:33 -0800 Subject: [PATCH 469/829] Fix bug of reading from empty DB. Summary: I found that db_stress sometimes segfault on my machine. Fix the bug. Test Plan: make all check. Run db_stress Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28803 --- db/version_set.cc | 11 ++++++++--- db/version_set.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index a1954bddb..32dc900b3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -678,6 +678,11 @@ void Version::AddIterators(const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder) { assert(storage_info_.finalized_); + if (storage_info_.num_non_empty_levels() == 0) { + // No file in the Version. 
+ return; + } + // Merge all level zero files together since they may overlap for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; @@ -689,8 +694,8 @@ void Version::AddIterators(const ReadOptions& read_options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < storage_info_.num_levels(); level++) { - if (storage_info_.level_files_brief_[level].num_files != 0) { + for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) { + if (storage_info_.LevelFilesBrief(level).num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, @@ -711,7 +716,7 @@ VersionStorageInfo::VersionStorageInfo( user_comparator_(user_comparator), // cfd is nullptr if Version is dummy num_levels_(levels), - num_non_empty_levels_(num_levels_), + num_non_empty_levels_(0), file_indexer_(user_comparator), compaction_style_(compaction_style), files_(new std::vector[num_levels_]), diff --git a/db/version_set.h b/db/version_set.h index f23fcc693..d5a0cb28b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -206,6 +206,7 @@ class VersionStorageInfo { } const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { + assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; } From 25f273027b4c9d98236a1103c58549e035bb313f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 13 Nov 2014 14:39:30 -0500 Subject: [PATCH 470/829] Fix iOS compile with -Wshorten-64-to-32 Summary: So iOS size_t is 32-bit, so we need to static_cast any uint64_t :( Test Plan: TARGET_OS=IOS make static_lib Reviewers: dhruba, ljin, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28743 --- db/column_family.cc | 3 ++ db/compaction_job.cc | 4 +- db/db_impl.cc | 3 +- db/db_impl.h | 4 -- db/log_reader.cc | 2 +- db/version_set.cc | 2 +- include/rocksdb/immutable_options.h | 2 + include/rocksdb/options.h | 7 +++- include/{ => rocksdb}/utilities/convenience.h | 2 + java/rocksjni/writebatchhandlerjnicallback.cc | 4 +- port/port_posix.cc | 4 +- table/block_based_table_builder.cc | 7 +++- table/block_based_table_reader.h | 1 + table/format.cc | 12 ++++-- table/merger.cc | 37 +++++++++---------- table/meta_blocks.cc | 1 + table/meta_blocks.h | 4 -- table/plain_table_index.cc | 6 +-- table/plain_table_index.h | 2 +- table/table_properties.cc | 1 + table/table_properties_internal.h | 18 +++++++++ util/benchharness.cc | 10 ++++- util/benchharness.h | 2 - util/db_info_dumper.cc | 6 +-- util/db_info_dumper.h | 13 +++++++ util/dynamic_bloom.h | 4 +- util/env_posix.cc | 8 ++-- util/histogram.cc | 2 +- util/histogram.h | 2 +- util/ldb_cmd.h | 5 +++ util/options.cc | 34 +++++++++++------ util/options_helper.cc | 26 +++++++------ util/sst_dump_tool.cc | 4 ++ 33 files changed, 158 insertions(+), 84 deletions(-) rename include/{ => rocksdb}/utilities/convenience.h (96%) create mode 100644 table/table_properties_internal.h create mode 100644 util/db_info_dumper.h diff --git a/db/column_family.cc b/db/column_family.cc index 08ff09866..7d203fdbe 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -524,6 +524,8 @@ void ColumnFamilyData::NotifyOnFlushCompleted( DB* db, const std::string& file_path, bool triggered_flush_slowdown, bool 
triggered_flush_stop) { + +#ifndef ROCKSDB_LITE auto listeners = ioptions()->listeners; for (auto listener : listeners) { listener->OnFlushCompleted( @@ -531,6 +533,7 @@ void ColumnFamilyData::NotifyOnFlushCompleted( // Use path 0 as fulled memtables are first flushed into path 0. triggered_flush_slowdown, triggered_flush_stop); } +#endif // ROCKSDB_LITE } SuperVersion* ColumnFamilyData::InstallSuperVersion( diff --git a/db/compaction_job.cc b/db/compaction_job.cc index d816b68dd..db751775a 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1044,8 +1044,8 @@ Status CompactionJob::OpenCompactionOutputFile() { compact_->outputs.push_back(out); compact_->outfile->SetIOPriority(Env::IO_LOW); - compact_->outfile->SetPreallocationBlockSize( - compact_->compaction->OutputFilePreallocationSize(mutable_cf_options_)); + compact_->outfile->SetPreallocationBlockSize(static_cast( + compact_->compaction->OutputFilePreallocationSize(mutable_cf_options_))); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); compact_->builder.reset(NewTableBuilder( diff --git a/db/db_impl.cc b/db/db_impl.cc index acbd213b6..80857402b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -64,6 +64,7 @@ #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" +#include "util/db_info_dumper.h" #include "util/hash_skiplist_rep.h" #include "util/hash_linklist_rep.h" #include "util/logging.h" @@ -3362,7 +3363,7 @@ Status DBImpl::GetDbIdentity(std::string& identity) { } char buffer[file_size]; Slice id; - s = idfile->Read(file_size, &id, buffer); + s = idfile->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { return s; } diff --git a/db/db_impl.h b/db/db_impl.h index eda00ab9b..f7a655cb6 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -585,8 +585,4 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) { if (static_cast(*ptr) < minvalue) *ptr = minvalue; } -// Dump db file summary, implemented in util/ -extern void DumpDBFileSummary(const DBOptions& options, - const std::string& dbname); - } // namespace rocksdb diff --git a/db/log_reader.cc b/db/log_reader.cc index 21d876de9..9ab97ca3e 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -54,7 +54,7 @@ bool Reader::SkipToInitialBlock() { if (block_start_location > 0) { Status skip_status = file_->Skip(block_start_location); if (!skip_status.ok()) { - ReportDrop(block_start_location, skip_status); + ReportDrop(static_cast(block_start_location), skip_status); return false; } } diff --git a/db/version_set.cc b/db/version_set.cc index 32dc900b3..0efaf0c7d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2635,7 +2635,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } // just one time extension to the right size - live_list->reserve(live_list->size() + total_files); + live_list->reserve(live_list->size() + static_cast(total_files)); for (auto cfd : *column_family_set_) { Version* dummy_versions = cfd->dummy_versions(); diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index 02bd006f3..adf46d647 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -91,9 +91,11 @@ struct ImmutableCFOptions { int num_levels; +#ifndef ROCKSDB_LITE // A vector of EventListeners which call-back functions will be called // when specific RocksDB event happens. 
std::vector> listeners; +#endif // ROCKSDB_LITE }; } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index e22ee03eb..102143301 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -63,7 +63,6 @@ enum CompactionStyle : char { // jobs are submitted via CompactFiles() }; - struct CompactionOptionsFIFO { // once the total sum of table files reaches this, we will delete the oldest // table file @@ -102,6 +101,7 @@ struct Options; struct ColumnFamilyOptions { // Some functions that make it easier to optimize RocksDB +#ifndef ROCKSDB_LITE // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls ColumnFamilyOptions* OptimizeForPointLookup( @@ -125,6 +125,7 @@ struct ColumnFamilyOptions { uint64_t memtable_memory_budget = 512 * 1024 * 1024); ColumnFamilyOptions* OptimizeUniversalStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); +#endif // ROCKSDB_LITE // ------------------- // Parameters that affect behavior @@ -591,9 +592,11 @@ struct ColumnFamilyOptions { // Default: 2 uint32_t min_partial_merge_operands; +#ifndef ROCKSDB_LITE // A vector of EventListeners which call-back functions will be called // when specific RocksDB event happens. std::vector> listeners; +#endif // ROCKSDB_LITE // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); @@ -606,12 +609,14 @@ struct ColumnFamilyOptions { struct DBOptions { // Some functions that make it easier to optimize RocksDB +#ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and // compaction. Calling this function will set it up such that total of // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. DBOptions* IncreaseParallelism(int total_threads = 16); +#endif // ROCKSDB_LITE // If true, the database will be created if it is missing. 
// Default: false diff --git a/include/utilities/convenience.h b/include/rocksdb/utilities/convenience.h similarity index 96% rename from include/utilities/convenience.h rename to include/rocksdb/utilities/convenience.h index 5d7b6d116..77913c254 100644 --- a/include/utilities/convenience.h +++ b/include/rocksdb/utilities/convenience.h @@ -11,6 +11,7 @@ namespace rocksdb { +#ifndef ROCKSDB_LITE // Take a map of option name and option value, apply them into the // base_options, and return the new options as a result bool GetColumnFamilyOptionsFromMap( @@ -36,5 +37,6 @@ bool GetDBOptionsFromString( const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options); +#endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc index 22f5117b3..b12e35544 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -91,9 +91,9 @@ bool WriteBatchHandlerJniCallback::Continue() { * on the result after you have finished with it */ jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { - jbyteArray ja = m_env->NewByteArray(s.size()); + jbyteArray ja = m_env->NewByteArray(static_cast(s.size())); m_env->SetByteArrayRegion( - ja, 0, s.size(), + ja, 0, static_cast(s.size()), reinterpret_cast(s.data())); return ja; } diff --git a/port/port_posix.cc b/port/port_posix.cc index c5ea439eb..a8cffcc7e 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -88,8 +88,8 @@ void CondVar::Wait() { bool CondVar::TimedWait(uint64_t abs_time_us) { struct timespec ts; - ts.tv_sec = abs_time_us / 1000000; - ts.tv_nsec = (abs_time_us % 1000000) * 1000; + ts.tv_sec = static_cast(abs_time_us / 1000000); + ts.tv_nsec = static_cast((abs_time_us % 1000000) * 1000); #ifndef NDEBUG mu_->locked_ = false; diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 6a48a975a..0a93e309d 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -258,6 +258,9 @@ class HashIndexBuilder : public IndexBuilder { uint64_t current_restart_index_ = 0; }; +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + // Create a index builder based on its type. IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, const SliceTransform* prefix_extractor) { @@ -352,6 +355,8 @@ Slice CompressBlock(const Slice& raw, return raw; } +} // namespace + // kBlockBasedTableMagicNumber was picked by running // echo rocksdb.table.block_based | sha1sum // and taking the leading 64 bits. @@ -660,7 +665,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, block_cache_compressed->Release(cache_handle); // Invalidate OS cache. 
- r->file->InvalidateCache(r->offset, size); + r->file->InvalidateCache(static_cast(r->offset), size); } return Status::OK(); } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index a000c6a9a..0b89edd3f 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -19,6 +19,7 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "table/table_properties_internal.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/format.cc b/table/format.cc index e2cad7bf6..90d7ac8dc 100644 --- a/table/format.cc +++ b/table/format.cc @@ -188,9 +188,10 @@ Status ReadFooterFromFile(RandomAccessFile* file, char footer_space[Footer::kMaxEncodedLength]; Slice footer_input; - size_t read_offset = (file_size > Footer::kMaxEncodedLength) - ? (file_size - Footer::kMaxEncodedLength) - : 0; + size_t read_offset = + (file_size > Footer::kMaxEncodedLength) + ? static_cast(file_size - Footer::kMaxEncodedLength) + : 0; Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, footer_space); if (!s.ok()) return s; @@ -204,6 +205,9 @@ Status ReadFooterFromFile(RandomAccessFile* file, return footer->DecodeFrom(&footer_input); } +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + // Read a block and check its CRC // contents is the result of reading. // According to the implementation of file->Read, contents may not point to buf @@ -255,6 +259,8 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, return s; } +} // namespace + Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, BlockContents* contents, Env* env, diff --git a/table/merger.cc b/table/merger.cc index 496f847fa..26a90097c 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -23,27 +23,24 @@ #include "util/autovector.h" namespace rocksdb { -namespace merger { -typedef std::priority_queue< - IteratorWrapper*, - std::vector, - MaxIteratorComparator> MaxIterHeap; +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { +typedef std::priority_queue, + MaxIteratorComparator> MergerMaxIterHeap; -typedef std::priority_queue< - IteratorWrapper*, - std::vector, - MinIteratorComparator> MinIterHeap; +typedef std::priority_queue, + MinIteratorComparator> MergerMinIterHeap; // Return's a new MaxHeap of IteratorWrapper's using the provided Comparator. -MaxIterHeap NewMaxIterHeap(const Comparator* comparator) { - return MaxIterHeap(MaxIteratorComparator(comparator)); +MergerMaxIterHeap NewMergerMaxIterHeap(const Comparator* comparator) { + return MergerMaxIterHeap(MaxIteratorComparator(comparator)); } // Return's a new MinHeap of IteratorWrapper's using the provided Comparator. 
-MinIterHeap NewMinIterHeap(const Comparator* comparator) { - return MinIterHeap(MinIteratorComparator(comparator)); +MergerMinIterHeap NewMergerMinIterHeap(const Comparator* comparator) { + return MergerMinIterHeap(MinIteratorComparator(comparator)); } -} // namespace merger +} // namespace const size_t kNumIterReserve = 4; @@ -56,8 +53,8 @@ class MergingIterator : public Iterator { current_(nullptr), use_heap_(true), direction_(kForward), - maxHeap_(merger::NewMaxIterHeap(comparator_)), - minHeap_(merger::NewMinIterHeap(comparator_)) { + maxHeap_(NewMergerMaxIterHeap(comparator_)), + minHeap_(NewMergerMinIterHeap(comparator_)) { children_.resize(n); for (int i = 0; i < n; i++) { children_[i].Set(children[i]); @@ -271,8 +268,8 @@ class MergingIterator : public Iterator { kReverse }; Direction direction_; - merger::MaxIterHeap maxHeap_; - merger::MinIterHeap minHeap_; + MergerMaxIterHeap maxHeap_; + MergerMinIterHeap minHeap_; }; void MergingIterator::FindSmallest() { @@ -299,8 +296,8 @@ void MergingIterator::FindLargest() { void MergingIterator::ClearHeaps() { use_heap_ = true; - maxHeap_ = merger::NewMaxIterHeap(comparator_); - minHeap_ = merger::NewMinIterHeap(comparator_); + maxHeap_ = NewMergerMaxIterHeap(comparator_); + minHeap_ = NewMergerMinIterHeap(comparator_); } Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n, diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 04b68eb95..25a785787 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -11,6 +11,7 @@ #include "rocksdb/table_properties.h" #include "table/block.h" #include "table/format.h" +#include "table/table_properties_internal.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 798a18af0..283f7a0be 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -119,10 +119,6 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties); -// Seek to the properties block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. -extern Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found); // Find the meta block from the meta index block. 
Status FindMetaBlock(Iterator* meta_index_iter, diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index 8b2c994b8..ea8ac6b94 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -55,16 +55,14 @@ PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( } } -void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash, +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, uint32_t offset) { if (num_records_in_current_group_ == kNumRecordsPerGroup) { current_group_ = AllocateNewGroup(); num_records_in_current_group_ = 0; } auto& new_record = current_group_[num_records_in_current_group_++]; - // TODO(sdong) -- check if this is OK -- murmur_t is uint64_t, while we only - // use 32 bits here - new_record.hash = static_cast(hash); + new_record.hash = hash; new_record.offset = offset; new_record.next = nullptr; } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index fa6d1a41e..870e3fb00 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -156,7 +156,7 @@ class PlainTableIndexBuilder { } } - void AddRecord(murmur_t hash, uint32_t offset); + void AddRecord(uint32_t hash, uint32_t offset); size_t GetNumRecords() const { return (groups_.size() - 1) * kNumRecordsPerGroup + diff --git a/table/table_properties.cc b/table/table_properties.cc index c7e141943..98d519971 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include "table/table_properties_internal.h" #include "rocksdb/table_properties.h" #include "rocksdb/iterator.h" #include "rocksdb/env.h" diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h new file mode 100644 index 000000000..9ef8ad432 --- /dev/null +++ b/table/table_properties_internal.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/status.h" +#include "rocksdb/iterator.h" + +namespace rocksdb { + +// Seek to the properties block. +// If it successfully seeks to the properties block, "is_found" will be +// set to true. 
+Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found); + +} // namespace rocksdb diff --git a/util/benchharness.cc b/util/benchharness.cc index 59fbba4c5..1c282aee4 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -17,6 +17,8 @@ #include #include +#include + using std::function; using std::get; using std::make_pair; @@ -28,6 +30,12 @@ using std::string; using std::tuple; using std::vector; +#ifndef GFLAGS +bool FLAGS_benchmark = false; +uint64_t FLAGS_bm_min_usec = 100; +int64_t FLAGS_bm_min_iter = 1; +int32_t FLAGS_bm_max_secs = 1; +#else DEFINE_bool(benchmark, false, "Run benchmarks."); DEFINE_uint64(bm_min_usec, 100, @@ -38,7 +46,7 @@ DEFINE_int64(bm_min_iters, 1, DEFINE_int32(bm_max_secs, 1, "Maximum # of seconds we'll spend on each benchmark."); - +#endif // GFLAGS namespace rocksdb { namespace benchmark { diff --git a/util/benchharness.h b/util/benchharness.h index 4fdef520c..948fdf2ff 100644 --- a/util/benchharness.h +++ b/util/benchharness.h @@ -9,8 +9,6 @@ #pragma once -#include - #include #include #include diff --git a/util/db_info_dumper.cc b/util/db_info_dumper.cc index 7049e6853..9c709282c 100644 --- a/util/db_info_dumper.cc +++ b/util/db_info_dumper.cc @@ -2,9 +2,6 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Must not be included from any .h files to avoid polluting the namespace -// with macros. #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS @@ -16,9 +13,10 @@ #include #include +#include "db/filename.h" #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "db/filename.h" +#include "util/db_info_dumper.h" namespace rocksdb { diff --git a/util/db_info_dumper.h b/util/db_info_dumper.h new file mode 100644 index 000000000..ed0a63ded --- /dev/null +++ b/util/db_info_dumper.h @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#pragma once + +#include + +#include "rocksdb/options.h" + +namespace rocksdb { +void DumpDBFileSummary(const DBOptions& options, const std::string& dbname); +} // namespace rocksdb diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 927710d24..b3b402c4f 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -9,8 +9,8 @@ #include "rocksdb/slice.h" -#include -#include +#include "util/arena.h" +#include "port/port_posix.h" #include #include diff --git a/util/env_posix.cc b/util/env_posix.cc index b6d17ce31..86343be30 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -201,7 +201,7 @@ class PosixSequentialFile: public SequentialFile { } virtual Status Skip(uint64_t n) { - if (fseek(file_, n, SEEK_CUR)) { + if (fseek(file_, static_cast(n), SEEK_CUR)) { return IOError(filename_, errno); } return Status::OK(); @@ -486,7 +486,7 @@ class PosixMmapFile : public WritableFile { const char* src = data.data(); size_t left = data.size(); TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); - PrepareWrite(GetFileSize(), left); + PrepareWrite(static_cast(GetFileSize()), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -683,7 +683,7 @@ class PosixWritableFile : public WritableFile { TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - PrepareWrite(GetFileSize(), left); + PrepareWrite(static_cast(GetFileSize()), left); // if there is no space in the cache, then flush if (cursize_ + left > capacity_) { s = Flush(); @@ -1380,7 +1380,7 @@ class PosixEnv : public Env { } virtual Status GetHostName(char* name, uint64_t len) { - int ret = gethostname(name, len); + int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) return Status::InvalidArgument(strerror(errno)); diff --git a/util/histogram.cc b/util/histogram.cc index 0dbfba7d6..67621a5fc 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -60,7 +60,7 @@ size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { std::map::const_iterator lowerBound = valueIndexMap_.lower_bound(value); if (lowerBound != valueIndexMap_.end()) { - return lowerBound->second; + return static_cast(lowerBound->second); } else { return 0; } diff --git a/util/histogram.h b/util/histogram.h index af3a019d8..77ed9bed7 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -38,7 +38,7 @@ class HistogramBucketMapper { return minBucketValue_; } - uint64_t BucketLimit(const uint64_t bucketNumber) const { + uint64_t BucketLimit(const size_t bucketNumber) const { assert(bucketNumber < BucketCount()); return bucketValues_[bucketNumber]; } diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 9ffe0eabc..7436cc368 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -4,6 +4,9 @@ // of patent rights can be found in the PATENTS file in the same directory. 
// #pragma once + +#ifndef ROCKSDB_LITE + #include #include #include @@ -730,3 +733,5 @@ public: }; } // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/options.cc b/util/options.cc index bdcdcdf2b..b97f622d2 100644 --- a/util/options.cc +++ b/util/options.cc @@ -64,8 +64,12 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options) compression_per_level(options.compression_per_level), compression_opts(options.compression_opts), access_hint_on_compaction_start(options.access_hint_on_compaction_start), - num_levels(options.num_levels), - listeners(options.listeners) {} + num_levels(options.num_levels) +#ifndef ROCKSDB_LITE + , listeners(options.listeners) {} +#else // ROCKSDB_LITE + {} +#endif // ROCKSDB_LITE ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), @@ -113,8 +117,12 @@ ColumnFamilyOptions::ColumnFamilyOptions() memtable_prefix_bloom_huge_page_tlb_size(0), bloom_locality(0), max_successive_merges(0), - min_partial_merge_operands(2), - listeners() { + min_partial_merge_operands(2) +#ifndef ROCKSDB_LITE + , listeners() { +#else // ROCKSDB_LITE + { +#endif // ROCKSDB_LITE assert(memtable_factory.get() != nullptr); } @@ -174,8 +182,12 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) options.memtable_prefix_bloom_huge_page_tlb_size), bloom_locality(options.bloom_locality), max_successive_merges(options.max_successive_merges), - min_partial_merge_operands(options.min_partial_merge_operands), - listeners(options.listeners) { + min_partial_merge_operands(options.min_partial_merge_operands) +#ifndef ROCKSDB_LITE + , listeners(options.listeners) { +#else // ROCKSDB_LITE + { +#endif // ROCKSDB_LITE assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -496,6 +508,7 @@ Options::PrepareForBulkLoad() return this; } +#ifndef ROCKSDB_LITE // Optimization functions ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( uint64_t block_cache_size_mb) { @@ -504,17 +517,15 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( block_based_options.index_type = BlockBasedTableOptions::kHashSearch; block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); block_based_options.block_cache = - NewLRUCache(block_cache_size_mb * 1024 * 1024); + NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); table_factory.reset(new BlockBasedTableFactory(block_based_options)); -#ifndef ROCKSDB_LITE memtable_factory.reset(NewHashLinkListRepFactory()); -#endif return this; } ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget) { - write_buffer_size = memtable_memory_budget / 4; + write_buffer_size = static_cast(memtable_memory_budget / 4); // merge two memtables when flushing to L0 min_write_buffer_number_to_merge = 2; // this means we'll use 50% extra memory in the worst case, but will reduce @@ -546,7 +557,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction( uint64_t memtable_memory_budget) { - write_buffer_size = memtable_memory_budget / 4; + write_buffer_size = static_cast(memtable_memory_budget / 4); // merge two memtables when flushing to L0 min_write_buffer_number_to_merge = 2; // this means we'll use 50% extra memory in the worst case, but will reduce @@ -565,5 +576,6 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { env->SetBackgroundThreads(1, Env::HIGH); return this; 
} +#endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/util/options_helper.cc b/util/options_helper.cc index 0c7c5d7a1..bffcc1f5c 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -7,10 +7,13 @@ #include #include #include "rocksdb/options.h" +#include "rocksdb/utilities/convenience.h" #include "util/options_helper.h" namespace rocksdb { +#ifndef ROCKSDB_LITE + namespace { CompressionType ParseCompressionType(const std::string& type) { if (type == "kNoCompression") { @@ -50,8 +53,8 @@ uint64_t ParseUint64(const std::string& value) { return std::stoull(value); } -int64_t ParseInt64(const std::string& value) { - return std::stol(value); +size_t ParseSizeT(const std::string& value) { + return static_cast(ParseUint64(value)); } double ParseDouble(const std::string& value) { @@ -76,24 +79,24 @@ template bool ParseMemtableOptions(const std::string& name, const std::string& value, OptionsType* new_options) { if (name == "write_buffer_size") { - new_options->write_buffer_size = ParseInt64(value); + new_options->write_buffer_size = ParseSizeT(value); } else if (name == "arena_block_size") { - new_options->arena_block_size = ParseInt64(value); + new_options->arena_block_size = ParseSizeT(value); } else if (name == "memtable_prefix_bloom_bits") { new_options->memtable_prefix_bloom_bits = ParseUint32(value); } else if (name == "memtable_prefix_bloom_probes") { new_options->memtable_prefix_bloom_probes = ParseUint32(value); } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { new_options->memtable_prefix_bloom_huge_page_tlb_size = - ParseInt64(value); + ParseSizeT(value); } else if (name == "max_successive_merges") { - new_options->max_successive_merges = ParseInt64(value); + new_options->max_successive_merges = ParseSizeT(value); } else if (name == "filter_deletes") { new_options->filter_deletes = ParseBoolean(name, value); } else if (name == "max_write_buffer_number") { new_options->max_write_buffer_number = ParseInt(value); } else if (name == "inplace_update_num_locks") { - new_options->inplace_update_num_locks = ParseInt64(value); + new_options->inplace_update_num_locks = ParseSizeT(value); } else { return false; } @@ -367,11 +370,11 @@ bool GetDBOptionsFromMap( } else if (o.first == "max_background_flushes") { new_options->max_background_flushes = ParseInt(o.second); } else if (o.first == "max_log_file_size") { - new_options->max_log_file_size = ParseInt64(o.second); + new_options->max_log_file_size = ParseSizeT(o.second); } else if (o.first == "log_file_time_to_roll") { - new_options->log_file_time_to_roll = ParseInt64(o.second); + new_options->log_file_time_to_roll = ParseSizeT(o.second); } else if (o.first == "keep_log_file_num") { - new_options->keep_log_file_num = ParseInt64(o.second); + new_options->keep_log_file_num = ParseSizeT(o.second); } else if (o.first == "max_manifest_file_size") { new_options->max_manifest_file_size = ParseUint64(o.second); } else if (o.first == "table_cache_numshardbits") { @@ -383,7 +386,7 @@ bool GetDBOptionsFromMap( } else if (o.first == "WAL_size_limit_MB") { new_options->WAL_size_limit_MB = ParseUint64(o.second); } else if (o.first == "manifest_preallocation_size") { - new_options->manifest_preallocation_size = ParseInt64(o.second); + new_options->manifest_preallocation_size = ParseSizeT(o.second); } else if (o.first == "allow_os_buffer") { new_options->allow_os_buffer = ParseBoolean(o.first, o.second); } else if (o.first == "allow_mmap_reads") { @@ -424,4 +427,5 @@ bool GetDBOptionsFromString( return 
GetDBOptionsFromMap(base_options, opts_map, new_options); } +#endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index 46fc10d79..be4e54da7 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -6,6 +6,8 @@ #include "rocksdb/sst_dump_tool.h" +#ifndef ROCKSDB_LITE + #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -428,3 +430,5 @@ void SSTDumpTool::Run(int argc, char** argv) { } } } // namespace rocksdb + +#endif // ROCKSDB_LITE From 3c92e52338abc024e073dbf209f53172ef38ec67 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 13 Nov 2014 14:47:54 -0500 Subject: [PATCH 471/829] Fix include --- util/options_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options_test.cc b/util/options_test.cc index 341240130..6bf2f0b0f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -17,7 +17,7 @@ #include "rocksdb/options.h" #include "util/testharness.h" -#include "utilities/convenience.h" +#include "rocksdb/utilities/convenience.h" using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); From cd0980150bd0614a326302f534c23e585750b370 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 13 Nov 2014 16:34:29 -0500 Subject: [PATCH 472/829] Add concurrency to compacting SpatialDB Summary: This will speed up our import times Test Plan: Added simple unit test just to get code coverage Reviewers: sdong, ljin, yhchiang, rven, mohaps Reviewed By: mohaps Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28869 --- include/rocksdb/utilities/spatial_db.h | 4 +- utilities/spatialdb/spatial_db.cc | 61 ++++++++++++++++++-------- utilities/spatialdb/spatial_db_test.cc | 5 ++- 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/include/rocksdb/utilities/spatial_db.h b/include/rocksdb/utilities/spatial_db.h index cba93cd5f..1beb5c7f1 100644 --- a/include/rocksdb/utilities/spatial_db.h +++ b/include/rocksdb/utilities/spatial_db.h @@ -222,7 +222,9 @@ class SpatialDB : public StackableDB { // Calling Compact() after inserting a bunch of elements should speed up // reading. This is especially useful if you use SpatialDBOptions::bulk_load - virtual Status Compact() = 0; + // Num threads determines how many threads we'll use for compactions. Setting + // this to bigger number will use more IO and CPU, but finish faster + virtual Status Compact(int num_threads = 1) = 0; // Query the specified spatial_index. Query will return all elements that // intersect bbox, but it may also return some extra elements. diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 6fbb780bc..2a4f7b14e 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -11,10 +11,13 @@ #define __STDC_FORMAT_MACROS #endif +#include +#include #include #include #include -#include +#include +#include #include #include @@ -561,27 +564,49 @@ class SpatialDBImpl : public SpatialDB { return Write(write_options, &batch); } - virtual Status Compact() override { - // TODO(icanadi) maybe do this in parallel? 
- Status s, t; + virtual Status Compact(int num_threads) override { + std::vector column_families; + column_families.push_back(data_column_family_); + for (auto& iter : name_to_index_) { - t = Flush(FlushOptions(), iter.second.column_family); - if (!t.ok()) { - s = t; - } - t = CompactRange(iter.second.column_family, nullptr, nullptr); - if (!t.ok()) { - s = t; - } + column_families.push_back(iter.second.column_family); } - t = Flush(FlushOptions(), data_column_family_); - if (!t.ok()) { - s = t; + + std::mutex state_mutex; + std::condition_variable cv; + Status s; + int threads_running = 0; + + std::vector threads; + + for (auto cfh : column_families) { + threads.emplace_back([&, cfh] { + { + std::unique_lock lk(state_mutex); + cv.wait(lk, [&] { return threads_running < num_threads; }); + threads_running++; + } + + Status t = Flush(FlushOptions(), cfh); + if (t.ok()) { + t = CompactRange(cfh, nullptr, nullptr); + } + + { + std::unique_lock lk(state_mutex); + threads_running--; + if (s.ok() && !t.ok()) { + s = t; + } + cv.notify_one(); + } + }); } - t = CompactRange(data_column_family_, nullptr, nullptr); - if (!t.ok()) { - s = t; + + for (auto& t : threads) { + t.join(); } + return s; } diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 166920b57..0484f8c02 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -245,7 +245,10 @@ TEST(SpatialDBTest, RandomizedTest) { elements.push_back(make_pair(blob, bbox)); } - db_->Compact(); + // parallel + db_->Compact(2); + // serial + db_->Compact(1); for (int i = 0; i < 1000; ++i) { BoundingBox int_bbox = RandomBoundingBox(128, &rnd, 10); From 1d1a64f58a192f9c1db53f94de502a6785ae05c0 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 13 Nov 2014 13:41:43 -0800 Subject: [PATCH 473/829] Move NeedsCompaction() from VersionStorageInfo to CompactionPicker Summary: Move NeedsCompaction() from VersionStorageInfo to CompactionPicker to allow different compaction strategy to have their own way to determine whether doing compaction is necessary. When compaction style is set to kCompactionStyleNone, then NeedsCompaction() will always return false. 
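A minimal, self-contained sketch of the dispatch pattern this summary describes -- each compaction style owning its own NeedsCompaction() decision -- using invented stand-in types (StorageSnapshot and the *PickerSketch classes below are illustrative only, not the actual RocksDB classes or thresholds):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

// Simplified stand-in for the per-version file statistics that the real
// VersionStorageInfo tracks; only the fields the pickers below consult.
struct StorageSnapshot {
  std::vector<double> compaction_scores;  // one score per level
  std::size_t l0_file_count;
  std::uint64_t l0_total_bytes;
};

// Each compaction style owns its own trigger logic instead of sharing a
// single VersionStorageInfo::NeedsCompaction().
class CompactionPickerSketch {
 public:
  virtual ~CompactionPickerSketch() = default;
  virtual bool NeedsCompaction(const StorageSnapshot& s) const = 0;
};

class LevelPickerSketch : public CompactionPickerSketch {
 public:
  bool NeedsCompaction(const StorageSnapshot& s) const override {
    for (double score : s.compaction_scores) {
      if (score >= 1.0) return true;  // any level over its target triggers
    }
    return false;
  }
};

class FifoPickerSketch : public CompactionPickerSketch {
 public:
  explicit FifoPickerSketch(std::uint64_t max_bytes) : max_bytes_(max_bytes) {}
  bool NeedsCompaction(const StorageSnapshot& s) const override {
    return s.l0_total_bytes > max_bytes_;  // past the size cap: drop oldest
  }

 private:
  std::uint64_t max_bytes_;
};

// A "none" style maps to a picker that never schedules work.
class NullPickerSketch : public CompactionPickerSketch {
 public:
  bool NeedsCompaction(const StorageSnapshot&) const override { return false; }
};

int main() {
  StorageSnapshot snap{{0.4, 1.3}, 5, 10u << 20};
  std::unique_ptr<CompactionPickerSketch> pickers[] = {
      std::make_unique<LevelPickerSketch>(),
      std::make_unique<FifoPickerSketch>(4u << 20),
      std::make_unique<NullPickerSketch>()};
  for (const auto& p : pickers) {
    std::cout << p->NeedsCompaction(snap) << "\n";  // prints 1, 1, 0
  }
  return 0;
}

A picker that returns false unconditionally is how kCompactionStyleNone opts out of background compaction entirely, which is what the NullCompactionPicker::NeedsCompaction() override in the diffs below does.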
Test Plan: export ROCKSDB_TESTS=Compact ./db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28719 --- db/compaction_picker.cc | 47 ++++++++- db/compaction_picker.h | 24 ++++- db/compaction_picker_test.cc | 190 ++++++++++++++++++++++++++--------- db/db_impl.cc | 4 +- db/internal_stats.cc | 3 +- db/version_builder.cc | 2 +- db/version_builder_test.cc | 2 +- db/version_set.cc | 18 +--- db/version_set.h | 5 +- 9 files changed, 219 insertions(+), 76 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 20d0e2c74..2e9144e29 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -675,6 +675,17 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::OK(); } +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const { + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { @@ -829,6 +840,19 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( return c; } +bool UniversalCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const { + const int kLevel0 = 0; + + if (vstorage->LevelFiles(kLevel0).size() < + static_cast( + mutable_cf_options.level0_file_num_compaction_trigger)) { + return false; + } + return true; +} + // Universal style of compaction. Pick files that are contiguous in // time-range to compact. // @@ -1228,6 +1252,27 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( return c; } +bool FIFOCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const { + const int kLevel0 = 0; + const std::vector& level_files = vstorage->LevelFiles(kLevel0); + + if (level_files.size() == 0) { + return false; + } + + uint64_t total_size = 0; + for (const auto& file : level_files) { + total_size += file->fd.file_size; + } + if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { + return false; + } + + return true; +} + Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { @@ -1236,7 +1281,7 @@ Compaction* FIFOCompactionPicker::PickCompaction( const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = 0; for (const auto& file : level_files) { - total_size += file->compensated_file_size; + total_size += file->fd.file_size; } if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || diff --git a/db/compaction_picker.h b/db/compaction_picker.h index d8daed115..94c661293 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -73,6 +73,10 @@ class CompactionPicker { return NumberLevels() - 1; } + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& cf_options) const = 0; + // Sanitize the input set of compaction input files. 
// When the input parameters do not describe a valid compaction, the // function will try to fix the input_files by adding necessary @@ -109,7 +113,6 @@ class CompactionPicker { const VersionStorageInfo* vstorage, const CompactionOptions& compact_options) const; - protected: int NumberLevels() const { return ioptions_.num_levels; } @@ -184,6 +187,10 @@ class UniversalCompactionPicker : public CompactionPicker { return 0; } + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& cf_options) const override; + private: // Pick Universal compaction to limit read amplification Compaction* PickCompactionUniversalReadAmp( @@ -218,6 +225,10 @@ class LevelCompactionPicker : public CompactionPicker { return current_num_levels - 2; } + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& cf_options) const override; + private: // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. @@ -254,6 +265,10 @@ class FIFOCompactionPicker : public CompactionPicker { virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& cf_options) const override; }; class NullCompactionPicker : public CompactionPicker { @@ -285,6 +300,13 @@ class NullCompactionPicker : public CompactionPicker { virtual int MaxInputLevel(int current_num_levels) const { return current_num_levels - 2; } + + // Always returns false. + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage, + const MutableCFOptions& cf_options) const override { + return false; + } }; // Utility function diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 2396d7f85..2297b21c9 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -19,95 +19,106 @@ class CountingLogger : public Logger { class CompactionPickerTest { public: - const Comparator* ucmp; - InternalKeyComparator icmp; - Options options; - ImmutableCFOptions ioptions; - MutableCFOptions mutable_cf_options; + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; LevelCompactionPicker level_compaction_picker; - std::string cf_name; - CountingLogger logger; - LogBuffer log_buffer; - VersionStorageInfo vstorage; - uint32_t file_num; - CompactionOptionsFIFO fifo_options; - std::vector size_being_compacted; + std::string cf_name_; + CountingLogger logger_; + LogBuffer log_buffer_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::vector size_being_compacted_; + std::unique_ptr vstorage_; + std::vector> files_; CompactionPickerTest() - : ucmp(BytewiseComparator()), - icmp(ucmp), - ioptions(options), - mutable_cf_options(options, ioptions), - level_compaction_picker(ioptions, &icmp), - cf_name("dummy"), - log_buffer(InfoLogLevel::INFO_LEVEL, &logger), - vstorage(&icmp, ucmp, options.num_levels, kCompactionStyleLevel, - nullptr), - file_num(1) { - fifo_options.max_table_files_size = 1; - mutable_cf_options.RefreshDerivedOptions(ioptions); - size_being_compacted.resize(options.num_levels); + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + ioptions_(options_), + mutable_cf_options_(options_, ioptions_), + level_compaction_picker(ioptions_, &icmp_), + cf_name_("dummy"), + log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), + file_num_(1), + vstorage_(nullptr) { + fifo_options_.max_table_files_size = 1; + 
mutable_cf_options_.RefreshDerivedOptions(ioptions_); + size_being_compacted_.resize(options_.num_levels); } ~CompactionPickerTest() { - for (int i = 0; i < vstorage.num_levels(); i++) { - for (auto* f : vstorage.LevelFiles(i)) { - delete f; - } - } + } + + void NewVersionStorage(int num_levels, CompactionStyle style) { + DeleteVersionStorage(); + options_.num_levels = num_levels; + vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, style, nullptr)); + } + + void DeleteVersionStorage() { + vstorage_.reset(); + files_.clear(); } void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { - assert(level < vstorage.num_levels()); + assert(level < vstorage_->num_levels()); FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); f->compensated_file_size = file_size; f->refs = 0; - vstorage.MaybeAddFile(level, f); + vstorage_->AddFile(level, f); + files_.emplace_back(f); } void UpdateVersionStorageInfo() { - vstorage.ComputeCompactionScore(mutable_cf_options, fifo_options, - size_being_compacted); - vstorage.UpdateFilesBySize(); - vstorage.UpdateNumNonEmptyLevels(); - vstorage.GenerateFileIndexer(); - vstorage.GenerateLevelFilesBrief(); - vstorage.SetFinalized(); + vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_, + size_being_compacted_); + vstorage_->UpdateFilesBySize(); + vstorage_->UpdateNumNonEmptyLevels(); + vstorage_->GenerateFileIndexer(); + vstorage_->GenerateLevelFilesBrief(); + vstorage_->SetFinalized(); } }; TEST(CompactionPickerTest, Empty) { + NewVersionStorage(6, kCompactionStyleLevel); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } TEST(CompactionPickerTest, Single) { - mutable_cf_options.level0_file_num_compaction_trigger = 2; + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; Add(0, 1U, "p", "q"); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); } TEST(CompactionPickerTest, Level0Trigger) { - mutable_cf_options.level0_file_num_compaction_trigger = 2; + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; Add(0, 1U, "150", "200"); Add(0, 2U, "200", "250"); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); @@ -115,17 +126,19 @@ TEST(CompactionPickerTest, Level0Trigger) { } TEST(CompactionPickerTest, Level1Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); Add(1, 66U, "150", "200", 1000000000U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - 
cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); } TEST(CompactionPickerTest, Level1Trigger2) { + NewVersionStorage(6, kCompactionStyleLevel); Add(1, 66U, "150", "200", 1000000001U); Add(1, 88U, "201", "300", 1000000000U); Add(2, 6U, "150", "179", 1000000000U); @@ -134,7 +147,7 @@ TEST(CompactionPickerTest, Level1Trigger2) { UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->num_input_files(1)); @@ -144,8 +157,9 @@ TEST(CompactionPickerTest, Level1Trigger2) { } TEST(CompactionPickerTest, LevelMaxScore) { - mutable_cf_options.target_file_size_base = 10000000; - mutable_cf_options.target_file_size_multiplier = 10; + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; Add(0, 1U, "150", "200", 1000000000U); // Level 1 score 1.2 Add(1, 66U, "150", "200", 6000000U); @@ -162,12 +176,90 @@ TEST(CompactionPickerTest, LevelMaxScore) { UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name, mutable_cf_options, &vstorage, &log_buffer)); + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); } +TEST(CompactionPickerTest, NeedsCompactionLevel) { + const int kLevels = 6; + const int kFileCount = 20; + for (int level = 0; level < kLevels - 1; ++level) { + uint64_t file_size = + mutable_cf_options_.MaxBytesForLevel(level) * 2 / kFileCount; + for (int file_count = 1; file_count <= kFileCount; ++file_count) { + // start a brand new version in each test. + NewVersionStorage(kLevels, kCompactionStyleLevel); + for (int i = 0; i < file_count; ++i) { + Add(level, i, std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), + file_size, 0, i * 100, i * 100 + 99); + } + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); + ASSERT_EQ(level_compaction_picker.NeedsCompaction( + vstorage_.get(), mutable_cf_options_), + vstorage_->CompactionScore(0) >= 1); + // release the version storage + DeleteVersionStorage(); + } + } +} + +TEST(CompactionPickerTest, NeedsCompactionUniversal) { + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker( + ioptions_, &icmp_); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction( + vstorage_.get(), mutable_cf_options_), false); + + // verify the trigger given different number of L0 files. 
+ for (int i = 1; + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; + ++i) { + Add(0, i, std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), + 1000000, 0, i * 100, i * 100 + 99); + ASSERT_EQ( + universal_compaction_picker.NeedsCompaction( + vstorage_.get(), mutable_cf_options_), + i >= mutable_cf_options_.level0_file_num_compaction_trigger); + } +} + +TEST(CompactionPickerTest, NeedsCompactionFIFO) { + NewVersionStorage(1, kCompactionStyleFIFO); + const int kFileCount = + mutable_cf_options_.level0_file_num_compaction_trigger * 3; + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * kFileCount / 2; + + fifo_options_.max_table_files_size = kMaxSize; + ioptions_.compaction_options_fifo = fifo_options_; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + // must return false when there's no files. + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction( + vstorage_.get(), mutable_cf_options_), false); + + // verify whether compaction is needed based on the current + // size of L0 files. + uint64_t current_size = 0; + for (int i = 1; i <= kFileCount; ++i) { + Add(0, i, std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), + kFileSize, 0, i * 100, i * 100 + 99); + current_size += kFileSize; + ASSERT_EQ( + fifo_compaction_picker.NeedsCompaction( + vstorage_.get(), mutable_cf_options_), + current_size > fifo_options_.max_table_files_size); + } +} + + } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index 80857402b..16438302c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1669,7 +1669,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { bool is_compaction_needed = false; // no need to refcount since we're under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->current()->storage_info()->NeedsCompaction()) { + if (cfd->compaction_picker()->NeedsCompaction( + cfd->current()->storage_info(), + *cfd->GetCurrentMutableCFOptions())) { is_compaction_needed = true; break; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 617626cb1..c5acc3402 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -235,7 +235,8 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. // 0 otherwise, - *value = (vstorage->NeedsCompaction() ? 1 : 0); + *value = (cfd_->compaction_picker()->NeedsCompaction( + vstorage, *cfd_->GetCurrentMutableCFOptions()) ? 1 : 0); return true; case kBackgroundErrors: // Accumulated number of errors in background flushes or compactions. 
diff --git a/db/version_builder.cc b/db/version_builder.cc index cf2d21ea8..a360ab02a 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -284,7 +284,7 @@ class VersionBuilder::Rep { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { // File is deleted: do nothing } else { - vstorage->MaybeAddFile(level, f); + vstorage->AddFile(level, f); } } }; diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 0aa675214..978251998 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -66,7 +66,7 @@ class VersionBuilderTest { f->refs = 0; f->num_entries = num_entries; f->num_deletions = num_deletions; - vstorage_.MaybeAddFile(level, f); + vstorage_.AddFile(level, f); if (sampled) { f->init_stats_from_file = true; vstorage_.UpdateAccumulatedStats(f); diff --git a/db/version_set.cc b/db/version_set.cc index 0efaf0c7d..97215ce0c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1051,7 +1051,7 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } // anonymous namespace -void VersionStorageInfo::MaybeAddFile(int level, FileMetaData* f) { +void VersionStorageInfo::AddFile(int level, FileMetaData* f) { assert(level < num_levels()); auto* level_files = &files_[level]; // Must not overlap @@ -1125,22 +1125,6 @@ bool Version::Unref() { return false; } -bool VersionStorageInfo::NeedsCompaction() const { - // In universal compaction case, this check doesn't really - // check the compaction condition, but checks num of files threshold - // only. We are not going to miss any compaction opportunity - // but it's likely that more compactions are scheduled but - // ending up with nothing to do. We can improve it later. - // TODO(sdong): improve this function to be accurate for universal - // compactions. - for (int i = 0; i <= MaxInputLevel(); i++) { - if (compaction_score_[i] >= 1) { - return true; - } - } - return false; -} - bool VersionStorageInfo::OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key) { diff --git a/db/version_set.h b/db/version_set.h index d5a0cb28b..04ad37773 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -94,7 +94,7 @@ class VersionStorageInfo { void Reserve(int level, size_t size) { files_[level].reserve(size); } - void MaybeAddFile(int level, FileMetaData* f); + void AddFile(int level, FileMetaData* f); void SetFinalized() { finalized_ = true; } @@ -128,9 +128,6 @@ class VersionStorageInfo { int MaxInputLevel() const; - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const; - // Returns the maxmimum compaction score for levels 1 to max double max_compaction_score() const { return max_compaction_score_; } From 772bc97f1357aab07bcf1434e7b505c29498fe48 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 13 Nov 2014 16:45:33 -0500 Subject: [PATCH 474/829] No CompactFiles in ROCKSDB_LITE Summary: It adds lots of code. Test Plan: compile for iOS, compile for mac. works. 
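The diffs below gate the feature behind ROCKSDB_LITE and leave a stub that reports NotSupported. A tiny self-contained sketch of that compile-time gating pattern, assuming a made-up MYDB_LITE macro and a reduced status stand-in (the names here are illustrative, not the RocksDB API):

#include <iostream>
#include <string>
#include <utility>

// Reduced stand-in for a status/result type; just enough for the example.
struct StatusSketch {
  bool ok;
  std::string msg;
  static StatusSketch OK() { return {true, ""}; }
  static StatusSketch NotSupported(std::string m) {
    return {false, std::move(m)};
  }
};

// Building with -DMYDB_LITE swaps in the small stub while the public
// signature stays the same, so callers compile either way; the heavyweight
// implementation (and its helpers) simply drop out of the lite binary.
StatusSketch CompactFilesSketch() {
#ifdef MYDB_LITE
  return StatusSketch::NotSupported("Not supported in LITE build");
#else
  // The full implementation would live here, with its helper code itself
  // wrapped in #ifndef MYDB_LITE in the source files it touches.
  return StatusSketch::OK();
#endif
}

int main() {
  StatusSketch s = CompactFilesSketch();
  std::cout << (s.ok ? std::string("ok") : s.msg) << "\n";
  return 0;
}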
Reviewers: rven, sdong, ljin, yhchiang Reviewed By: yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28857 --- db/column_family.cc | 2 ++ db/column_family.h | 2 ++ db/compaction_picker.cc | 2 ++ db/compaction_picker.h | 4 ++++ db/db_impl.cc | 11 +++++++++++ db/db_impl.h | 2 ++ 6 files changed, 23 insertions(+) diff --git a/db/column_family.cc b/db/column_family.cc index 7d203fdbe..5261acc8c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -580,6 +580,7 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +#ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; @@ -591,6 +592,7 @@ Status ColumnFamilyData::SetOptions( } return s; } +#endif // ROCKSDB_LITE ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, diff --git a/db/column_family.h b/db/column_family.h index eef7e93b5..b421e44c6 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -187,9 +187,11 @@ class ColumnFamilyData { const MutableCFOptions* GetLatestMutableCFOptions() const { return &mutable_cf_options_; } +#ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( const std::unordered_map& options_map); +#endif // ROCKSDB_LITE InternalStats* internal_stats() { return internal_stats_.get(); } diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 2e9144e29..5b7e50bfe 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -446,6 +446,7 @@ Compaction* CompactionPicker::CompactRange( return c; } +#ifndef ROCKSDB_LITE namespace { // Test whether two files have overlapping key-ranges. bool HaveOverlappingKeyRanges( @@ -674,6 +675,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::OK(); } +#endif // ROCKSDB_LITE bool LevelCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage, diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 94c661293..aba70f08d 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -83,10 +83,12 @@ class CompactionPicker { // files. If it's not possible to conver an invalid input_files // into a valid one by adding more files, the function will return a // non-ok status with specific reason. +#ifndef ROCKSDB_LITE Status SanitizeCompactionInputFiles( std::unordered_set* input_files, const ColumnFamilyMetaData& cf_meta, const int output_level) const; +#endif // ROCKSDB_LITE // Free up the files that participated in a compaction void ReleaseCompactionFiles(Compaction* c, Status status); @@ -156,10 +158,12 @@ class CompactionPicker { // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. 
+#ifndef ROCKSDB_LITE virtual Status SanitizeCompactionInputFilesForAllLevels( std::unordered_set* input_files, const ColumnFamilyMetaData& cf_meta, const int output_level) const; +#endif // ROCKSDB_LITE // record all the ongoing compactions for all levels std::vector> compactions_in_progress_; diff --git a/db/db_impl.cc b/db/db_impl.cc index 16438302c..122a92aae 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1191,6 +1191,10 @@ Status DBImpl::CompactFiles( ColumnFamilyHandle* column_family, const std::vector& input_file_names, const int output_level, const int output_path_id) { +#ifdef ROCKSDB_LITE + // not supported in lite version + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else MutexLock l(&mutex_); if (column_family == nullptr) { return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); @@ -1210,8 +1214,10 @@ Status DBImpl::CompactFiles( // TODO(yhchiang): cfd should be deleted after its last reference. cfd->Unref(); return s; +#endif // ROCKSDB_LITE } +#ifndef ROCKSDB_LITE Status DBImpl::CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, @@ -1344,9 +1350,13 @@ Status DBImpl::CompactFilesImpl( return status; } +#endif // ROCKSDB_LITE Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else auto* cfd = reinterpret_cast(column_family)->cfd(); if (options_map.empty()) { Log(InfoLogLevel::WARN_LEVEL, @@ -1382,6 +1392,7 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, "[%s] SetOptions failed", cfd->GetName().c_str()); } return s; +#endif // ROCKSDB_LITE } // return the same level if it cannot be moved diff --git a/db/db_impl.h b/db/db_impl.h index f7a655cb6..400f207b8 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -337,10 +337,12 @@ class DBImpl : public DB { void RecordFlushIOStats(); void RecordCompactionIOStats(); +#ifndef ROCKSDB_LITE Status CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, const int output_level, int output_path_id); +#endif // ROCKSDB_LITE ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); From 373c665edf5dead0778c3ee5b2d304778183a317 Mon Sep 17 00:00:00 2001 From: Hasnain Lakhani Date: Thu, 13 Nov 2014 14:46:30 -0800 Subject: [PATCH 475/829] Fix broken test in 31b02d. Summary: CorruptionTest for backupable_db_test did not call GarbageCollect() after deleting a corrupt backup, which sometimes lead to test failures as the newly created backup would reuse the same backup ID and files and fail the consistency check. Moved around some of the test logic to ensure that GarbageCollect() is called at the right time. Test Plan: Run backupable_db_test eight times and make sure it passes repeatedly. Also run make check to make sure other tests don't fail. 
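The fix hinges on backup IDs being reused: deleting a corrupt backup only unregisters it, and its leftover files must be removed by GarbageCollect() before a new backup can safely take the same ID. A toy model of that interaction (not RocksDB code; ToyBackupDir and its members are invented for illustration):

#include <cassert>
#include <set>
#include <string>

struct ToyBackupDir {
  std::set<std::string> files;  // what is physically on disk
  std::set<int> live_ids;       // what the backup registry knows about

  void CreateBackup(int id) {
    // A fresh backup must not collide with stale files left under its ID.
    assert(files.count("meta/" + std::to_string(id)) == 0);
    files.insert("meta/" + std::to_string(id));
    live_ids.insert(id);
  }
  void DeleteBackup(int id) { live_ids.erase(id); }  // files are left behind
  void GarbageCollect() {
    for (auto it = files.begin(); it != files.end();) {
      int id = std::stoi(it->substr(std::string("meta/").size()));
      if (live_ids.count(id) == 0) {
        it = files.erase(it);  // remove files of unregistered backups
      } else {
        ++it;
      }
    }
  }
};

int main() {
  ToyBackupDir dir;
  dir.CreateBackup(2);   // later found to be corrupt
  dir.DeleteBackup(2);   // unregistered, but meta/2 is still on disk
  dir.GarbageCollect();  // without this call the next line would assert
  dir.CreateBackup(2);   // ID 2 is reused, now against a clean directory
  return 0;
}
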
Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28863 --- utilities/backupable/backupable_db_test.cc | 31 +++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 7ca5acad8..46fc7cb6f 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -636,16 +636,6 @@ TEST(BackupableDBTest, CorruptionsTest) { ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); ASSERT_TRUE(!s.ok()); - ASSERT_OK(restore_db_->DeleteBackup(2)); - CloseRestoreDB(); - AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); - - // new backup should be 2! - OpenBackupableDB(); - FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2); - ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); - CloseBackupableDB(); - AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5); // make sure that no corrupt backups have actually been deleted! ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/1")); @@ -660,18 +650,29 @@ TEST(BackupableDBTest, CorruptionsTest) { ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5")); // delete the corrupt backups and then make sure they're actually deleted - OpenBackupableDB(); - ASSERT_OK(db_->DeleteBackup(5)); - ASSERT_OK(db_->DeleteBackup(4)); - ASSERT_OK(db_->DeleteBackup(3)); - (void) db_->GarbageCollect(); + ASSERT_OK(restore_db_->DeleteBackup(5)); + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->DeleteBackup(3)); + ASSERT_OK(restore_db_->DeleteBackup(2)); + (void) restore_db_->GarbageCollect(); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == false); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == false); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4") == false); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4") == false); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3") == false); ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2") == false); + + CloseRestoreDB(); + AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); + + // new backup should be 2! + OpenBackupableDB(); + FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2); + ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2))); CloseBackupableDB(); + AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5); } // open DB, write, close DB, backup, restore, repeat From 4161de92a3511bdf999c4d4d95c5a7a1e39f4b84 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 13 Nov 2014 15:21:04 -0800 Subject: [PATCH 476/829] Fix SIGSEGV Summary: As a short-term fix, let's go back to previous way of calculating NeedsCompaction(). SIGSEGV happens because NeedsCompaction() can happen before super_version (and thus MutableCFOptions) is initialized. 
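In other words, the check is moved back onto state that is guaranteed to exist by the time NeedsCompaction() can run. A standalone sketch of the difference (illustration only; the Toy* types are invented stand-ins for VersionStorageInfo and MutableCFOptions):

#include <vector>

struct ToyMutableOptions { int level0_file_num_compaction_trigger = 4; };

struct ToyVersionStorage {
  std::vector<double> compaction_score;  // computed when the version is finalized
  int num_level0_files = 0;
};

// Restored approach: consult only precomputed scores, safe from the start.
bool NeedsCompactionByScore(const ToyVersionStorage& v) {
  for (double score : v.compaction_score) {
    if (score >= 1) return true;
  }
  return false;
}

// Prior approach (simplified): compares a file count against an option read
// through a pointer that may not be installed yet, the source of the SIGSEGV.
bool NeedsCompactionByOptions(const ToyVersionStorage& v,
                              const ToyMutableOptions* opts) {
  return v.num_level0_files >= opts->level0_file_num_compaction_trigger;
}

int main() {
  ToyVersionStorage v;
  v.compaction_score = {0.5, 1.2};
  return NeedsCompactionByScore(v) ? 0 : 1;  // works even before options exist
}
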
Test Plan: make check Reviewers: ljin, sdong, rven, yhchiang Reviewed By: yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28875 --- db/compaction_picker.cc | 37 +++++++----------------------------- db/compaction_picker.h | 24 +++++++++-------------- db/compaction_picker_test.cc | 31 ++++++++++++------------------ db/db_impl.cc | 3 +-- db/internal_stats.cc | 3 +-- 5 files changed, 30 insertions(+), 68 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 5b7e50bfe..e6c5fd8ee 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -677,9 +677,8 @@ Status CompactionPicker::SanitizeCompactionInputFiles( } #endif // ROCKSDB_LITE -bool LevelCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& mutable_cf_options) const { +bool LevelCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) + const { for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { if (vstorage->CompactionScore(i) >= 1) { return true; @@ -843,16 +842,9 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( } bool UniversalCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& mutable_cf_options) const { + const VersionStorageInfo* vstorage) const { const int kLevel0 = 0; - - if (vstorage->LevelFiles(kLevel0).size() < - static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)) { - return false; - } - return true; + return vstorage->CompactionScore(kLevel0) >= 1; } // Universal style of compaction. Pick files that are contiguous in @@ -1254,25 +1246,10 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( return c; } -bool FIFOCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& mutable_cf_options) const { +bool FIFOCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) + const { const int kLevel0 = 0; - const std::vector& level_files = vstorage->LevelFiles(kLevel0); - - if (level_files.size() == 0) { - return false; - } - - uint64_t total_size = 0; - for (const auto& file : level_files) { - total_size += file->fd.file_size; - } - if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { - return false; - } - - return true; + return vstorage->CompactionScore(kLevel0) >= 1; } Compaction* FIFOCompactionPicker::PickCompaction( diff --git a/db/compaction_picker.h b/db/compaction_picker.h index aba70f08d..f5bb2f256 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -73,9 +73,7 @@ class CompactionPicker { return NumberLevels() - 1; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& cf_options) const = 0; + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0; // Sanitize the input set of compaction input files. 
// When the input parameters do not describe a valid compaction, the @@ -191,9 +189,8 @@ class UniversalCompactionPicker : public CompactionPicker { return 0; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& cf_options) const override; + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; private: // Pick Universal compaction to limit read amplification @@ -229,9 +226,8 @@ class LevelCompactionPicker : public CompactionPicker { return current_num_levels - 2; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& cf_options) const override; + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; private: // For the specfied level, pick a compaction. @@ -270,9 +266,8 @@ class FIFOCompactionPicker : public CompactionPicker { return 0; } - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& cf_options) const override; + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; }; class NullCompactionPicker : public CompactionPicker { @@ -306,9 +301,8 @@ class NullCompactionPicker : public CompactionPicker { } // Always returns false. - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage, - const MutableCFOptions& cf_options) const override { + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override { return false; } }; diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 2297b21c9..913e745c8 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -198,9 +198,8 @@ TEST(CompactionPickerTest, NeedsCompactionLevel) { } UpdateVersionStorageInfo(); ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); - ASSERT_EQ(level_compaction_picker.NeedsCompaction( - vstorage_.get(), mutable_cf_options_), - vstorage_->CompactionScore(0) >= 1); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); // release the version storage DeleteVersionStorage(); } @@ -212,20 +211,17 @@ TEST(CompactionPickerTest, NeedsCompactionUniversal) { UniversalCompactionPicker universal_compaction_picker( ioptions_, &icmp_); // must return false when there's no files. - ASSERT_EQ(universal_compaction_picker.NeedsCompaction( - vstorage_.get(), mutable_cf_options_), false); + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); // verify the trigger given different number of L0 files. for (int i = 1; - i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; - ++i) { + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { Add(0, i, std::to_string((i + 100) * 1000).c_str(), - std::to_string((i + 100) * 1000 + 999).c_str(), - 1000000, 0, i * 100, i * 100 + 99); - ASSERT_EQ( - universal_compaction_picker.NeedsCompaction( - vstorage_.get(), mutable_cf_options_), - i >= mutable_cf_options_.level0_file_num_compaction_trigger); + std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + i * 100 + 99); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); } } @@ -241,8 +237,7 @@ TEST(CompactionPickerTest, NeedsCompactionFIFO) { FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); // must return false when there's no files. 
- ASSERT_EQ(fifo_compaction_picker.NeedsCompaction( - vstorage_.get(), mutable_cf_options_), false); + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false); // verify whether compaction is needed based on the current // size of L0 files. @@ -252,10 +247,8 @@ TEST(CompactionPickerTest, NeedsCompactionFIFO) { std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, i * 100 + 99); current_size += kFileSize; - ASSERT_EQ( - fifo_compaction_picker.NeedsCompaction( - vstorage_.get(), mutable_cf_options_), - current_size > fifo_options_.max_table_files_size); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 122a92aae..a4ea5af12 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1681,8 +1681,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // no need to refcount since we're under a mutex for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->compaction_picker()->NeedsCompaction( - cfd->current()->storage_info(), - *cfd->GetCurrentMutableCFOptions())) { + cfd->current()->storage_info())) { is_compaction_needed = true; break; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c5acc3402..1afe31520 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -235,8 +235,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. // 0 otherwise, - *value = (cfd_->compaction_picker()->NeedsCompaction( - vstorage, *cfd_->GetCurrentMutableCFOptions()) ? 1 : 0); + *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); return true; case kBackgroundErrors: // Accumulated number of errors in background flushes or compactions. From f822129b32a7a990f218f3b0efecef55fd991024 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 13 Nov 2014 15:19:57 -0800 Subject: [PATCH 477/829] Add a unit test for behavior when merge operator and compaction filter co-exist. Summary: Add a unit test in db_test to verify the behavior when both of merge operator and compaction filter apply to a key when merging. 
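The behavior the new test pins down is that the filter decision applies to plain value entries, is ignored when a merge folds into that value, and is never consulted for merge operands themselves. A toy model of that rule (illustration only; EntryType, FilterValue and CompactKey are invented names, not the actual compaction code):

#include <cstdint>
#include <iostream>
#include <vector>

enum class EntryType { kValue, kMerge };
struct Entry { EntryType type; uint64_t operand; };

// Counterpart of the ConditionalFilter in the diff below: drop values equal to 2.
bool FilterValue(uint64_t v) { return v == 2; }

// Collapse the history of one key under a uint64-add merge operator.
uint64_t CompactKey(const std::vector<Entry>& history) {
  bool has_merge = false;
  for (const Entry& e : history) {
    if (e.type == EntryType::kMerge) has_merge = true;
  }
  uint64_t result = 0;
  for (const Entry& e : history) {
    if (e.type == EntryType::kValue) {
      // The filter only removes the base value when no merge depends on it.
      if (!has_merge && FilterValue(e.operand)) continue;
      result = e.operand;
    } else {
      result += e.operand;  // merge operands are never filtered
    }
  }
  return result;
}

int main() {
  // Mirrors the "foo" case in the test: Put(2) then Merge(+1) stays 3 even
  // though 2 on its own would have been filtered out.
  std::cout << CompactKey({{EntryType::kValue, 2}, {EntryType::kMerge, 1}}) << "\n";
  // Mirrors the "bar" case: a lone Put(2) is dropped (NOT_FOUND, modeled as 0).
  std::cout << CompactKey({{EntryType::kValue, 2}}) << "\n";
  return 0;
}
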
Test Plan: Run the new test Reviewers: ljin, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28455 --- db/db_test.cc | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index eed7af41c..a42a15a13 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3284,6 +3284,22 @@ class DeleteFilter : public CompactionFilter { virtual const char* Name() const override { return "DeleteFilter"; } }; +class ConditionalFilter : public CompactionFilter { + public: + explicit ConditionalFilter(const std::string* filtered_value) + : filtered_value_(filtered_value) {} + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + return value.ToString() == *filtered_value_; + } + + virtual const char* Name() const override { return "ConditionalFilter"; } + + private: + const std::string* filtered_value_; +}; + class ChangeFilter : public CompactionFilter { public: explicit ChangeFilter() {} @@ -3334,6 +3350,25 @@ class DeleteFilterFactory : public CompactionFilterFactory { virtual const char* Name() const override { return "DeleteFilterFactory"; } }; +class ConditionalFilterFactory : public CompactionFilterFactory { + public: + explicit ConditionalFilterFactory(const Slice& filtered_value) + : filtered_value_(filtered_value.ToString()) {} + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr( + new ConditionalFilter(&filtered_value_)); + } + + virtual const char* Name() const override { + return "ConditionalFilterFactory"; + } + + private: + std::string filtered_value_; +}; + class ChangeFilterFactory : public CompactionFilterFactory { public: explicit ChangeFilterFactory() {} @@ -4721,6 +4756,75 @@ TEST(DBTest, CompactionFilterWithValueChange) { } while (ChangeCompactOptions()); } +TEST(DBTest, CompactionFilterWithMergeOperator) { + std::string one, two, three, four; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + PutFixed64(&four, 4); + + Options options; + options = CurrentOptions(options); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + // Filter out keys with value is 2. + options.compaction_filter_factory = + std::make_shared(two); + DestroyAndReopen(options); + + // In the same compaction, a value type needs to be deleted based on + // compaction filter, and there is a merge type for the key. compaction + // filter result is ignored. + ASSERT_OK(db_->Put(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", one)); + ASSERT_OK(Flush()); + std::string newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + + // value key can be deleted based on compaction filter, leaving only + // merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ("NOT_FOUND", newvalue); + ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ(two, two); + + // Compaction filter never applies to merge keys. 
+ ASSERT_OK(db_->Put(WriteOptions(), "foobar", one)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two)); + ASSERT_OK(Flush()); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + + // In the same compaction, both of value type and merge type keys need to be + // deleted based on compaction filter, and there is a merge type for the key. + // For both keys, compaction filter results are ignored. + ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); +} + TEST(DBTest, CompactionFilterContextManual) { KeepFilterFactory* filter = new KeepFilterFactory(); From 746cfaac5847a087b05446ca56a489e401553b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Thu, 13 Nov 2014 16:58:05 -0800 Subject: [PATCH 478/829] Relax the block count check on deallocation in env_test It seems that on some FS we get more blocks than we ask for. This is already handled when checking the allocated number of blocks, but after the file is closed it checks for an exact number of blocks, which fails on my machine. I changed the test to add one full page to the size, then calculate the expected number of blocks and check if the actual number of blocks is less or equal to that. --- util/env_test.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/util/env_test.cc b/util/env_test.cc index 54e52069a..9819d837a 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -552,6 +552,7 @@ TEST(EnvPosixTest, AllocateTest) { // allocate 100 MB size_t kPreallocateSize = 100 * 1024 * 1024; size_t kBlockSize = 512; + size_t kPageSize = 4096; std::string data(1024 * 1024, 'a'); wfile->SetPreallocationBlockSize(kPreallocateSize); ASSERT_OK(wfile->Append(Slice(data))); @@ -565,8 +566,7 @@ TEST(EnvPosixTest, AllocateTest) { // we only require that number of allocated blocks is at least what we expect. // It looks like some FS give us more blocks that we asked for. That's fine. // It might be worth investigating further. - auto st_blocks = f_stat.st_blocks; - ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks); + ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks); // close the file, should deallocate the blocks wfile.reset(); @@ -574,7 +574,9 @@ TEST(EnvPosixTest, AllocateTest) { stat(fname.c_str(), &f_stat); ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); // verify that preallocated blocks were deallocated on file close - ASSERT_EQ((f_stat.st_size + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks); + // Because the FS might give us more blocks, we add a full page to the size + // and expect the number of blocks to be less or equal to that. + ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks); } #endif // ROCKSDB_FALLOCATE_PRESENT From a177742a9b965f5be72a1457465b1b8ce0413d12 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 12 Nov 2014 13:05:12 -0800 Subject: [PATCH 479/829] Make db_stress built for ROCKSDB_LITE Summary: Make db_stress built for ROCKSDB_LITE. The test doesn't pass tough. It seg fault quickly. But I took a look and it doesn't seem to be related to lite version. 
Likely to be a bug inside RocksDB. Test Plan: make db_stress Reviewers: yhchiang, rven, ljin, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D28797 --- db/db_bench.cc | 24 ++++++++++++++++++++---- db/perf_context_test.cc | 5 +++++ table/table_reader_bench.cc | 10 ++++++++++ tools/blob_store_bench.cc | 8 ++++++++ tools/db_repl_stress.cc | 9 +++++++++ tools/db_sanity_test.cc | 7 +++++-- tools/db_stress.cc | 19 ++++++++++++++++--- tools/ldb.cc | 8 ++++++++ tools/sst_dump.cc | 8 ++++++++ util/sst_dump_tool.cc | 3 +-- 10 files changed, 90 insertions(+), 11 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index c66a1fc1c..54ede5161 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1878,14 +1878,15 @@ class Benchmark { exit(1); } switch (FLAGS_rep_factory) { - case kPrefixHash: - options.memtable_factory.reset(NewHashSkipListRepFactory( - FLAGS_hash_bucket_count)); - break; case kSkipList: options.memtable_factory.reset(new SkipListFactory( FLAGS_skip_list_lookahead)); break; +#ifndef ROCKSDB_LITE + case kPrefixHash: + options.memtable_factory.reset( + NewHashSkipListRepFactory(FLAGS_hash_bucket_count)); + break; case kHashLinkedList: options.memtable_factory.reset(NewHashLinkListRepFactory( FLAGS_hash_bucket_count)); @@ -1899,8 +1900,14 @@ class Benchmark { options.memtable_factory.reset(NewHashCuckooRepFactory( options.write_buffer_size, FLAGS_key_size + FLAGS_value_size)); break; +#else + default: + fprintf(stderr, "Only skip list is supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } if (FLAGS_use_plain_table) { +#ifndef ROCKSDB_LITE if (FLAGS_rep_factory != kPrefixHash && FLAGS_rep_factory != kHashLinkedList) { fprintf(stderr, "Waring: plain table is used with skipList\n"); @@ -1921,7 +1928,12 @@ class Benchmark { plain_table_options.hash_table_ratio = 0.75; options.table_factory = std::shared_ptr( NewPlainTableFactory(plain_table_options)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_use_cuckoo_table) { +#ifndef ROCKSDB_LITE if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); @@ -1931,6 +1943,10 @@ class Benchmark { table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr( NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else { BlockBasedTableOptions block_based_options; if (FLAGS_use_hash_search) { diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 6669aaec3..2d20a0186 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -38,8 +38,13 @@ std::shared_ptr OpenDb(bool read_only = false) { FLAGS_min_write_buffer_number_to_merge; if (FLAGS_use_set_based_memetable) { +#ifndef ROCKSDB_LITE options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0)); options.memtable_factory.reset(NewHashSkipListRepFactory()); +#else + fprintf(stderr, "Prefix hash is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } Status s; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index ea722a8bf..a75424e82 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -257,12 +257,18 @@ int main(int argc, char** argv) { options.compression = rocksdb::CompressionType::kNoCompression; if (FLAGS_table_factory == "cuckoo_hash") { +#ifndef 
ROCKSDB_LITE options.allow_mmap_reads = true; env_options.use_mmap_reads = true; rocksdb::CuckooTableOptions table_options; table_options.hash_table_ratio = 0.75; tf.reset(rocksdb::NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "plain_table") { +#ifndef ROCKSDB_LITE options.allow_mmap_reads = true; env_options.use_mmap_reads = true; @@ -274,6 +280,10 @@ int main(int argc, char** argv) { tf.reset(new rocksdb::PlainTableFactory(plain_table_options)); options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( FLAGS_prefix_len)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "block_based") { tf.reset(new rocksdb::BlockBasedTableFactory()); } else { diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc index 99ca66a37..0daae1a11 100644 --- a/tools/blob_store_bench.cc +++ b/tools/blob_store_bench.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE #include #include #include @@ -282,3 +283,10 @@ int main(int argc, const char** argv) { return 0; } +#else // ROCKSDB_LITE +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index ec18ab512..b745d7b37 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -145,3 +146,11 @@ int main(int argc, const char** argv) { } #endif // GFLAGS + +#else // ROCKSDB_LITE +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 237ef07d0..f994ab38b 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -131,6 +131,7 @@ class SanityTestZlibCompression : public SanityTest { Options options_; }; +#ifndef ROCKSDB_LITE class SanityTestPlainTableFactory : public SanityTest { public: explicit SanityTestPlainTableFactory(const std::string& path) @@ -146,6 +147,7 @@ class SanityTestPlainTableFactory : public SanityTest { private: Options options_; }; +#endif // ROCKSDB_LITE class SanityTestBloomFilter : public SanityTest { public: @@ -165,10 +167,11 @@ class SanityTestBloomFilter : public SanityTest { namespace { bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { - new SanityTestBasic(path), - new SanityTestSpecialComparator(path), + new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), +#ifndef ROCKSDB_LITE new SanityTestPlainTableFactory(path), +#endif // ROCKSDB_LITE new SanityTestBloomFilter(path)}; if (command == "create") { diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 8109c141e..a6d8c9ace 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1789,16 +1789,24 @@ class StressTest { exit(1); } switch (FLAGS_rep_factory) { - case kHashSkipList: - options_.memtable_factory.reset(NewHashSkipListRepFactory(10000)); - break; case kSkipList: // no need to do anything break; +#ifndef ROCKSDB_LITE + case kHashSkipList: + options_.memtable_factory.reset(NewHashSkipListRepFactory(10000)); + break; case kVectorRep: options_.memtable_factory.reset(new VectorRepFactory()); break; +#else + default: + fprintf(stderr, + "RocksdbLite only supports skip list mem table. Skip " + "--rep_factory\n"); +#endif // ROCKSDB_LITE } + static Random purge_percent(1000); // no benefit from non-determinism here if (static_cast(purge_percent.Uniform(100)) < FLAGS_purge_redundant_percent - 1) { @@ -1884,9 +1892,14 @@ class StressTest { assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); } else { +#ifndef ROCKSDB_LITE DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); db_ = db_with_ttl; +#else + fprintf(stderr, "TTL is not supported in RocksDBLite\n"); + exit(1); +#endif } if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); diff --git a/tools/ldb.cc b/tools/ldb.cc index 4581b8011..cb5ef5204 100644 --- a/tools/ldb.cc +++ b/tools/ldb.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // +#ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" @@ -11,3 +12,10 @@ int main(int argc, char** argv) { tool.Run(argc, argv); return 0; } +#else +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 7a83b60b3..403893779 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. // +#ifndef ROCKSDB_LITE #include "rocksdb/sst_dump_tool.h" @@ -11,3 +12,10 @@ int main(int argc, char** argv) { tool.Run(argc, argv); return 0; } +#else +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index be4e54da7..8d2233de8 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // +#ifndef ROCKSDB_LITE #include "rocksdb/sst_dump_tool.h" -#ifndef ROCKSDB_LITE - #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif From 94fa542f82fba2bfb63e2246a2b0f3c50c0c6d6e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 10:43:12 -0800 Subject: [PATCH 480/829] Update HISTROY.md for 3.8 release --- HISTORY.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 00a3dcbc9..2d65e69d1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,11 +1,18 @@ # Rocksdb Change Log -## Unreleased +## 3.8.0 (11/14/2014) +### Public API changes * BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. * BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups. * BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups. +### Cleanup +* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor) + +### New features +* CompactFiles and EventListener, although they are still in experimental state + ## 3.7.0 (11/6/2014) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online From c44a2927817fff1a7a08665bb75b92275879be44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Thu, 13 Nov 2014 16:57:01 -0800 Subject: [PATCH 481/829] Add cuckoo table options to the C interface --- db/c.cc | 47 +++++++++++++++++++++++++++++++++++++++++++++ include/rocksdb/c.h | 21 ++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/db/c.cc b/db/c.cc index e98463b24..64ce9d0a3 100644 --- a/db/c.cc +++ b/db/c.cc @@ -56,6 +56,7 @@ using rocksdb::NewBloomFilterPolicy; using rocksdb::NewLRUCache; using rocksdb::Options; using rocksdb::BlockBasedTableOptions; +using rocksdb::CuckooTableOptions; using rocksdb::RandomAccessFile; using rocksdb::Range; using rocksdb::ReadOptions; @@ -83,6 +84,7 @@ struct rocksdb_readoptions_t { ReadOptions rep; }; struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_options_t { Options rep; }; struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; }; +struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; }; struct rocksdb_seqfile_t { SequentialFile* rep; }; struct rocksdb_randomfile_t { RandomAccessFile* rep; }; struct rocksdb_writablefile_t { WritableFile* rep; }; @@ -1121,6 +1123,51 @@ void rocksdb_options_set_block_based_table_factory( } +rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create() { + return new rocksdb_cuckoo_table_options_t; +} + +void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options) { + delete options; +} + +void rocksdb_cuckoo_options_set_hash_ratio( + 
rocksdb_cuckoo_table_options_t* options, double v) { + options->rep.hash_table_ratio = v; +} + +void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.max_search_depth = v; +} + +void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.cuckoo_block_size = v; +} + +void rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { + options->rep.identity_as_first_hash = v; +} + +void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { + options->rep.use_module_hash = v; +} + +void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t *opt, + rocksdb_cuckoo_table_options_t* table_options) { + if (table_options) { + opt->rep.table_factory.reset( + rocksdb::NewCuckooTableFactory(table_options->rep)); + } +} + + rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; } diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 726a1edc3..b12e4fe5c 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -77,6 +77,8 @@ typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; typedef struct rocksdb_options_t rocksdb_options_t; typedef struct rocksdb_block_based_table_options_t rocksdb_block_based_table_options_t; +typedef struct rocksdb_cuckoo_table_options_t + rocksdb_cuckoo_table_options_t; typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; @@ -376,6 +378,25 @@ extern void rocksdb_block_based_options_set_whole_key_filtering( extern void rocksdb_options_set_block_based_table_factory( rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options); +/* Cuckoo table options */ + +extern rocksdb_cuckoo_table_options_t* + rocksdb_cuckoo_options_create(); +extern void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern void rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t *opt, rocksdb_cuckoo_table_options_t* table_options); + /* Options */ extern rocksdb_options_t* rocksdb_options_create(); From 7fe247080f6fc0fd9c2a55c376e2a10d4a351a1a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 14 Nov 2014 11:19:00 -0800 Subject: [PATCH 482/829] Update HISTORY.md for RocksJava --- HISTORY.md | 1 + java/HISTORY-JAVA.md | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 2d65e69d1..08c26cc2a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ ### New features * CompactFiles and EventListener, although they are still in experimental state +* Full ColumnFamily support in RocksJava. 
## 3.7.0 (11/6/2014) ### Public API changes diff --git a/java/HISTORY-JAVA.md b/java/HISTORY-JAVA.md index 4cf0f7d18..7a293fd3f 100644 --- a/java/HISTORY-JAVA.md +++ b/java/HISTORY-JAVA.md @@ -1,5 +1,12 @@ # RocksJava Change Log +## By 11/14/2014 +### New Features +* Full support for Column Family. +* Slice and Comparator support. +* Default merge operator support. +* RateLimiter support. + ## By 06/15/2014 ### New Features * Added basic Java binding for rocksdb::Env such that multiple RocksDB can share the same thread pool and environment. From e6c3cc65748d90208e66b06229e7bbdf2e5d6d03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Fri, 14 Nov 2014 11:31:52 -0800 Subject: [PATCH 483/829] Add very basic tests to make sure the C cuckoo table options compile and run --- db/c_test.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/db/c_test.c b/db/c_test.c index 4f296f9bd..ed9a62a9d 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -801,8 +801,27 @@ int main(int argc, char** argv) { rocksdb_iter_get_error(iter, &err); CheckNoError(err); rocksdb_iter_destroy(iter); + + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); } + StartPhase("cuckoo_options"); + { + rocksdb_cuckoo_table_options_t* cuckoo_options; + cuckoo_options = rocksdb_cuckoo_options_create(); + rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5); + rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200); + rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10); + rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1); + rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0); + rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_cuckoo_options_destroy(cuckoo_options); + } StartPhase("cleanup"); rocksdb_close(db); From c9fd03ec51f16a108038e4fc78fafb74ed5b7702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Fri, 14 Nov 2014 11:34:32 -0800 Subject: [PATCH 484/829] Update docs for NewAdaptiveTableFactory --- include/rocksdb/table.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 4fddab4b3..da525d4a2 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -369,13 +369,14 @@ class TableFactory { }; #ifndef ROCKSDB_LITE -// Create a special table factory that can open both of block based table format -// and plain table, based on setting inside the SST files. It should be used to +// Create a special table factory that can open either of the supported +// table formats, based on setting inside the SST files. It should be used to // convert a DB from one table format to another. // @table_factory_to_write: the table factory used when writing to new files. // @block_based_table_factory: block based table factory to use. If NULL, use // a default one. // @plain_table_factory: plain table factory to use. If NULL, use a default one. +// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default one. extern TableFactory* NewAdaptiveTableFactory( std::shared_ptr table_factory_to_write = nullptr, std::shared_ptr block_based_table_factory = nullptr, From 9be338cf9d69e4ae9cbd933480ce78a42ac1d1fd Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 11:35:48 -0800 Subject: [PATCH 485/829] CompactionJobTest Summary: This is just a simple test that passes two files though a compaction. 
It shows the framework so that people can continue building new compaction *unit* tests. In the future we might want to move some Compaction* tests from DBTest here. For example, CompactBetweenSnapshot seems a good candidate. Hopefully this test can be simpler when we mock out VersionSet. Test Plan: this is a test Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28449 --- Makefile | 5 +- db/compaction.cc | 11 +++ db/compaction.h | 10 +++ db/compaction_job.cc | 2 + db/compaction_job_test.cc | 176 ++++++++++++++++++++++++++++++++++++++ db/flush_job_test.cc | 4 +- table/mock_table.cc | 25 +++++- table/mock_table.h | 31 ++++--- 8 files changed, 247 insertions(+), 17 deletions(-) create mode 100644 db/compaction_job_test.cc diff --git a/Makefile b/Makefile index e5d823f41..fc80fa377 100644 --- a/Makefile +++ b/Makefile @@ -150,7 +150,7 @@ TESTS = \ flush_job_test \ wal_manager_test \ listener_test \ - write_batch_with_index_test + compaction_job_test TOOLS = \ sst_dump \ @@ -425,6 +425,9 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/compaction.cc b/db/compaction.cc index a29b386b7..3d4c352c9 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -328,4 +328,15 @@ uint64_t Compaction::OutputFilePreallocationSize( return preallocation_size * 1.1; } +Compaction* Compaction::TEST_NewCompaction( + int num_levels, int start_level, int out_level, uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, + CompressionType output_compression, bool seek_compaction, + bool deletion_compaction) { + return new Compaction(num_levels, start_level, out_level, target_file_size, + max_grandparent_overlap_bytes, output_path_id, + output_compression, seek_compaction, + deletion_compaction); +} + } // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h index b17a4a91b..c4c412c40 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -183,6 +183,16 @@ class Compaction { void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual, bool level0_only); + static Compaction* TEST_NewCompaction( + int num_levels, int start_level, int out_level, uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, + CompressionType output_compression, bool seek_compaction = false, + bool deletion_compaction = false); + + CompactionInputFiles* TEST_GetInputFiles(int level) { + return &inputs_[level]; + } + private: friend class CompactionPicker; friend class UniversalCompactionPicker; diff --git a/db/compaction_job.cc b/db/compaction_job.cc index db751775a..bc514a2e8 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -231,6 +231,7 @@ void CompactionJob::Prepare() { // Generate file_levels_ for compaction berfore making Iterator compact_->compaction->GenerateFileLevels(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd != 
nullptr); LogToBuffer( log_buffer_, "[%s] Compacting %d@%d + %d@%d files, score %.2f", cfd->GetName().c_str(), compact_->compaction->num_input_files(0), @@ -990,6 +991,7 @@ Status CompactionJob::InstallCompactionResults(port::Mutex* db_mutex) { inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot( SequenceNumber in, const std::vector& snapshots, SequenceNumber* prev_snapshot) { + assert(snapshots.size()); SequenceNumber prev __attribute__((unused)) = 0; for (const auto cur : snapshots) { assert(prev <= cur); diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc new file mode 100644 index 000000000..cdf1c704a --- /dev/null +++ b/db/compaction_job_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include + +#include "db/compaction_job.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "rocksdb/cache.h" +#include "rocksdb/options.h" +#include "rocksdb/db.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "table/mock_table.h" + +namespace rocksdb { + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest { + public: + CompactionJobTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/compaction_job_test"), + table_cache_(NewLRUCache(50000, 16, 8)), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_controller_)), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()) { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + NewDB(); + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + mutable_cf_options_.RefreshDerivedOptions(ImmutableCFOptions(Options())); + + ASSERT_OK(versions_->Recover(column_families, false)); + } + + std::string GenerateFileName(uint64_t file_number) { + FileMetaData meta; + std::vector db_paths; + db_paths.emplace_back(dbname_, std::numeric_limits::max()); + meta.fd = FileDescriptor(file_number, 0, 0); + return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); + } + + // returns expected result after compaction + mock::MockFileContents CreateTwoFiles() { + mock::MockFileContents expected_results; + const int kKeysPerFile = 10000; + SequenceNumber sequence_number = 0; + for (int i = 0; i < 2; ++i) { + mock::MockFileContents contents; + SequenceNumber smallest_seqno, largest_seqno; + InternalKey smallest, largest; + for (int k = 0; k < kKeysPerFile; ++k) { + auto key = std::to_string(i * (kKeysPerFile / 2) + k); + auto value = std::to_string(i * kKeysPerFile + k); + InternalKey internal_key(key, ++sequence_number, kTypeValue); + if (k == 0) { + smallest = internal_key; + smallest_seqno = sequence_number; + } else if (k == kKeysPerFile - 1) { + largest = internal_key; + largest_seqno = sequence_number; + } + std::pair key_value( + {internal_key.Encode().ToString(), value}); + contents.insert(key_value); + if (i == 1 || k < kKeysPerFile / 2) { + expected_results.insert(key_value); + } + } + + uint64_t file_number = versions_->NewFileNumber(); + ASSERT_OK(mock_table_factory_->CreateMockTable( + env_, GenerateFileName(file_number), 
std::move(contents))); + + VersionEdit edit; + edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno, + largest_seqno); + + mutex_.Lock(); + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + } + versions_->SetLastSequence(sequence_number); + return expected_results; + } + + void NewDB() { + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, nullptr); + } + + Env* env_; + std::string dbname_; + EnvOptions env_options_; + MutableCFOptions mutable_cf_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + DBOptions db_options_; + ColumnFamilyOptions cf_options_; + std::unique_ptr versions_; + port::Mutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; +}; + +TEST(CompactionJobTest, Simple) { + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + auto expected_results = CreateTwoFiles(); + + auto files = cfd->current()->storage_info()->LevelFiles(0); + ASSERT_EQ(2U, files.size()); + + std::unique_ptr compaction(Compaction::TEST_NewCompaction( + 7, 0, 1, 1024 * 1024, 10, 0, kNoCompression)); + compaction->SetInputVersion(cfd->current()); + + auto compaction_input_files = compaction->TEST_GetInputFiles(0); + compaction_input_files->level = 0; + compaction_input_files->files.push_back(files[0]); + compaction_input_files->files.push_back(files[1]); + + SnapshotList snapshots; + int yield_callback_called = 0; + std::function yield_callback = [&]() { + yield_callback_called++; + return 0; + }; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + mutex_.Lock(); + CompactionJob compaction_job( + compaction.get(), db_options_, *cfd->GetLatestMutableCFOptions(), + env_options_, versions_.get(), &shutting_down_, &log_buffer, nullptr, + nullptr, &snapshots, true, table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + mutex_.Unlock(); + ASSERT_OK(compaction_job.Run()); + mutex_.Lock(); + compaction_job.Install(Status::OK(), &mutex_); + mutex_.Unlock(); + + mock_table_factory_->AssertLatestFile(expected_results); + ASSERT_EQ(yield_callback_called, 20000); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index e39916bd6..0fa5b4e57 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -28,7 +28,7 @@ class FlushJobTest { versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_controller_)), shutting_down_(false), - mock_table_factory_(new MockTableFactory()) { + mock_table_factory_(new mock::MockTableFactory()) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); @@ -73,7 +73,7 @@ class FlushJobTest { std::unique_ptr versions_; port::Mutex mutex_; std::atomic shutting_down_; - std::shared_ptr mock_table_factory_; + std::shared_ptr mock_table_factory_; }; TEST(FlushJobTest, Empty) { diff --git 
a/table/mock_table.cc b/table/mock_table.cc index 64a00951c..70adf2da6 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -13,6 +13,7 @@ #include "util/coding.h" namespace rocksdb { +namespace mock { Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) { return new MockTableIterator(table_); @@ -70,6 +71,19 @@ TableBuilder* MockTableFactory::NewTableBuilder( return new MockTableBuilder(id, &file_system_); } +Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, + MockFileContents file_contents) { + std::unique_ptr file; + auto s = env->NewWritableFile(fname, &file, EnvOptions()); + if (!s.ok()) { + return s; + } + + uint32_t id = GetAndWriteNextID(file.get()); + file_system_.files.insert({id, std::move(file_contents)}); + return Status::OK(); +} + uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { uint32_t next_id = next_id_.fetch_add(1); char buf[4]; @@ -86,10 +100,17 @@ uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const { return DecodeFixed32(buf); } -void MockTableFactory::AssertSingleFile( - const std::map& file_contents) { +void MockTableFactory::AssertSingleFile(const MockFileContents& file_contents) { ASSERT_EQ(file_system_.files.size(), 1U); ASSERT_TRUE(file_contents == file_system_.files.begin()->second); } +void MockTableFactory::AssertLatestFile(const MockFileContents& file_contents) { + ASSERT_GE(file_system_.files.size(), 1U); + auto latest = file_system_.files.end(); + --latest; + ASSERT_TRUE(file_contents == latest->second); +} + +} // namespace mock } // namespace rocksdb diff --git a/table/mock_table.h b/table/mock_table.h index 806ab93d4..57481a4bc 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -21,18 +21,19 @@ #include "util/testutil.h" namespace rocksdb { +namespace mock { +typedef std::map MockFileContents; // NOTE this currently only supports bitwise comparator struct MockTableFileSystem { port::Mutex mutex; - std::map> files; + std::map files; }; class MockTableReader : public TableReader { public: - MockTableReader(const std::map& table) - : table_(table) {} + explicit MockTableReader(const MockFileContents& table) : table_(table) {} Iterator* NewIterator(const ReadOptions&, Arena* arena) override; @@ -50,17 +51,16 @@ class MockTableReader : public TableReader { ~MockTableReader() {} private: - const std::map& table_; + const MockFileContents& table_; }; class MockTableIterator : public Iterator { public: - explicit MockTableIterator(const std::map& table) - : table_(table) { + explicit MockTableIterator(const MockFileContents& table) : table_(table) { itr_ = table_.end(); } - bool Valid() const { return itr_ == table_.end(); } + bool Valid() const { return itr_ != table_.end(); } void SeekToFirst() { itr_ = table_.begin(); } @@ -91,8 +91,8 @@ class MockTableIterator : public Iterator { Status status() const { return Status::OK(); } private: - const std::map& table_; - std::map::const_iterator itr_; + const MockFileContents& table_; + MockFileContents::const_iterator itr_; }; class MockTableBuilder : public TableBuilder { @@ -128,7 +128,7 @@ class MockTableBuilder : public TableBuilder { private: uint32_t id_; MockTableFileSystem* file_system_; - std::map table_; + MockFileContents table_; }; class MockTableFactory : public TableFactory { @@ -147,6 +147,12 @@ class MockTableFactory : public TableFactory { const CompressionType compression_type, const CompressionOptions& compression_opts) const; + // This function will directly create mock table instead of 
going through + // MockTableBuilder. MockFileContents has to have a format of . Those key-value pairs will then be inserted into the mock table + Status CreateMockTable(Env* env, const std::string& fname, + MockFileContents file_contents); + virtual Status SanitizeOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { return Status::OK(); @@ -158,8 +164,8 @@ class MockTableFactory : public TableFactory { // This function will assert that only a single file exists and that the // contents are equal to file_contents - void AssertSingleFile( - const std::map& file_contents); + void AssertSingleFile(const MockFileContents& file_contents); + void AssertLatestFile(const MockFileContents& file_contents); private: uint32_t GetAndWriteNextID(WritableFile* file) const; @@ -169,4 +175,5 @@ class MockTableFactory : public TableFactory { mutable std::atomic next_id_; }; +} // namespace mock } // namespace rocksdb From 6c1b040cc999a87781cda042ca4e3b644261ee5d Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Fri, 14 Nov 2014 11:38:26 -0800 Subject: [PATCH 486/829] Provide openable snapshots Summary: Store links to live files in directory on same disk Test Plan: Take snapshot and open it. Added a test GetSnapshotLink in db_test. Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28713 --- db/db_filesnapshot.cc | 109 +++++++++++++++++++++++ db/db_impl.h | 5 ++ db/db_test.cc | 58 ++++++++++++ hdfs/env_hdfs.h | 6 ++ include/rocksdb/db.h | 6 ++ include/rocksdb/env.h | 9 ++ include/rocksdb/utilities/stackable_db.h | 4 + util/env_posix.cc | 11 +++ util/file_util.cc | 59 ++++++++++++ util/file_util.h | 18 ++++ util/mock_env.cc | 13 +++ util/mock_env.h | 2 + 12 files changed, 300 insertions(+) create mode 100644 util/file_util.cc create mode 100644 util/file_util.h diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index dcf54c8c6..64e5e437c 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -25,6 +25,7 @@ #include "port/port.h" #include "util/mutexlock.h" #include "util/sync_point.h" +#include "util/file_util.h" namespace rocksdb { @@ -134,6 +135,114 @@ Status DBImpl::GetLiveFiles(std::vector& ret, Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { return wal_manager_.GetSortedWalFiles(files); } + +// Builds an openable snapshot of RocksDB +Status DBImpl::CreateCheckpoint(const std::string& snapshot_dir) { + Status s; + std::vector live_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = GetLatestSequenceNumber(); + bool same_fs = true; + + if (env_->FileExists(snapshot_dir)) { + return Status::InvalidArgument("Directory exists"); + } + + s = DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = GetLiveFiles(live_files, &manifest_file_size, true); + } + if (!s.ok()) { + EnableFileDeletions(false); + return s; + } + + Log(db_options_.info_log, + "Started the snapshot process -- creating snapshot in directory %s", + snapshot_dir.c_str()); + + std::string full_private_path = snapshot_dir + ".tmp"; + + // create snapshot directory + s = env_->CreateDir(full_private_path); + + // copy/hard link live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + s = Status::Corruption("Can't parse file name. 
This is very bad"); + break; + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || type == kDescriptorFile || + type == kCurrentFile); + assert(live_files[i].size() > 0 && live_files[i][0] == '/'); + std::string src_fname = live_files[i]; + + // rules: + // * if it's kTableFile, then it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + // * always copy if cross-device link + if ((type == kTableFile) && same_fs) { + Log(db_options_.info_log, "Hard Linking %s", src_fname.c_str()); + s = env_->LinkFile(GetName() + src_fname, full_private_path + src_fname); + if (s.IsNotSupported()) { + same_fs = false; + s = Status::OK(); + } + } + if ((type != kTableFile) || (!same_fs)) { + Log(db_options_.info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(env_, GetName() + src_fname, full_private_path + src_fname, + (type == kDescriptorFile) ? manifest_file_size : 0); + } + } + + // we copied all the files, enable file deletions + EnableFileDeletions(false); + + if (s.ok()) { + // move tmp private backup to real snapshot directory + s = env_->RenameFile(full_private_path, snapshot_dir); + } + if (s.ok()) { + unique_ptr snapshot_directory; + env_->NewDirectory(snapshot_dir, &snapshot_directory); + if (snapshot_directory != nullptr) { + s = snapshot_directory->Fsync(); + } + } + + if (!s.ok()) { + // clean all the files we might have created + Log(db_options_.info_log, "Snapshot failed -- %s", s.ToString().c_str()); + // we have to delete the dir and all its children + std::vector subchildren; + env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s1 = env_->DeleteFile(full_private_path + subchild); + if (s1.ok()) { + Log(db_options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s1 = env_->DeleteDir(full_private_path); + Log(db_options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s1.ToString().c_str()); + return s; + } + + // here we know that we succeeded and installed the new snapshot + Log(db_options_.info_log, "Snapshot DONE. 
All is good"); + Log(db_options_.info_log, "Snapshot sequence number: %" PRIu64, + sequence_number); + + return s; +} } #endif // ROCKSDB_LITE diff --git a/db/db_impl.h b/db/db_impl.h index 400f207b8..1106a281d 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -169,6 +169,11 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // (2) a copied manifest files and other files + virtual Status CreateCheckpoint(const std::string& snapshot_dir); #endif // ROCKSDB_LITE // checks if all live files exist on file system and that their file sizes diff --git a/db/db_test.cc b/db/db_test.cc index a42a15a13..a3ad82c51 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1606,6 +1606,60 @@ TEST(DBTest, GetSnapshot) { } while (ChangeOptions(kSkipHashCuckoo)); } +TEST(DBTest, GetSnapshotLink) { + do { + Options options; + const std::string snapshot_name = test::TmpDir(env_) + "/snapshot"; + DB* snapshotDB; + ReadOptions roptions; + std::string result; + + options = CurrentOptions(options); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DestroyDB(snapshot_name, options)); + env_->DeleteDir(snapshot_name); + + // Create a database + Status s; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + // Take a snapshot + ASSERT_OK(db_->CreateCheckpoint(snapshot_name)); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_OK(Flush()); + ASSERT_EQ("v2", Get(key)); + // Open snapshot and verify contents while DB is running + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, snapshot_name, &snapshotDB)); + ASSERT_OK(snapshotDB->Get(roptions, key, &result)); + ASSERT_EQ("v1", result); + delete snapshotDB; + snapshotDB = nullptr; + delete db_; + db_ = nullptr; + + // Destroy original DB + ASSERT_OK(DestroyDB(dbname_, options)); + + // Open snapshot and verify contents + options.create_if_missing = false; + dbname_ = snapshot_name; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_EQ("v1", Get(key)); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + + // Restore DB name + dbname_ = test::TmpDir(env_) + "/db_test"; + } while (ChangeOptions()); +} + TEST(DBTest, GetLevel0Ordering) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -7468,6 +7522,10 @@ class ModelDB: public DB { ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) {} + virtual Status CreateCheckpoint(const std::string& snapshot_dir) { + return Status::NotSupported("Not supported in Model DB"); + } + private: class ModelIter: public Iterator { public: diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index 82f317f73..475ea7cab 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -93,6 +93,8 @@ class HdfsEnv : public Env { virtual Status RenameFile(const std::string& src, const std::string& target); + virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status LockFile(const std::string& fname, FileLock** lock); virtual Status UnlockFile(FileLock* lock); @@ -291,6 +293,10 @@ class HdfsEnv : public Env { virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;} + virtual Status LinkFile(const std::string& src, const 
std::string& target) { + return notsup; + } + virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;} virtual Status UnlockFile(FileLock* lock){return notsup;} diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 04460ad9e..52f157d82 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -521,6 +521,12 @@ class DB { virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { return GetPropertiesOfAllTables(DefaultColumnFamily(), props); } + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // (2) a copied manifest files and other files + virtual Status CreateCheckpoint(const std::string& snapshot_dir) = 0; #endif // ROCKSDB_LITE private: diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 36aa5a604..291676002 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -178,6 +178,10 @@ class Env { virtual Status RenameFile(const std::string& src, const std::string& target) = 0; + // Hard Link file src to target. + virtual Status LinkFile(const std::string& src, + const std::string& target) = 0; + // Lock the specified file. Used to prevent concurrent access to // the same db by multiple processes. On failure, stores nullptr in // *lock and returns non-OK. @@ -747,6 +751,11 @@ class EnvWrapper : public Env { Status RenameFile(const std::string& s, const std::string& t) { return target_->RenameFile(s, t); } + + Status LinkFile(const std::string& s, const std::string& t) { + return target_->LinkFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { return target_->LockFile(f, l); } diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 7bdf9928e..9366bd84f 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -247,6 +247,10 @@ class StackableDB : public DB { return db_->DefaultColumnFamily(); } + virtual Status CreateCheckpoint(const std::string& snapshot_dir) override { + return db_->CreateCheckpoint(snapshot_dir); + } + protected: DB* db_; }; diff --git a/util/env_posix.cc b/util/env_posix.cc index 86343be30..af1801607 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1288,6 +1288,17 @@ class PosixEnv : public Env { return result; } + virtual Status LinkFile(const std::string& src, const std::string& target) { + Status result; + if (link(src.c_str(), target.c_str()) != 0) { + if (errno == EXDEV) { + return Status::NotSupported("No cross FS links allowed"); + } + result = IOError(src, errno); + } + return result; + } + virtual Status LockFile(const std::string& fname, FileLock** lock) { *lock = nullptr; Status result; diff --git a/util/file_util.cc b/util/file_util.cc new file mode 100644 index 000000000..c75d59c5f --- /dev/null +++ b/util/file_util.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
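For orientation, the checkpoint added by this patch is driven entirely through the public DB interface: file deletions are disabled, SST files are hard-linked (or copied when the link crosses filesystems), the manifest is copied up to manifest_file_size, and the temporary directory is renamed and fsynced. A minimal usage sketch under those assumptions; the calling program and paths below are illustrative only, not part of the change:

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;

  // Open (or create) a source database and write a key to checkpoint.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/checkpoint_demo_db", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "foo", "v1");
  assert(s.ok());

  // The target directory must not exist yet. On the same filesystem the SST
  // files are hard-linked; across filesystems LinkFile() reports NotSupported
  // and CreateCheckpoint() falls back to copying.
  s = db->CreateCheckpoint("/tmp/checkpoint_demo_snapshot");
  assert(s.ok());

  // The snapshot directory is itself an openable RocksDB database.
  delete db;
  return 0;
}

The GetSnapshotLink test added to db_test.cc exercises exactly this sequence, reopening the snapshot directory both while the source DB is still running and after the original DB has been destroyed.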
+// +#include +#include +#include "util/file_util.h" +#include "rocksdb/env.h" +#include "db/filename.h" + +namespace rocksdb { + +// Utility function to copy a file up to a specified length +Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size) { + const EnvOptions soptions; + unique_ptr srcfile; + Status s; + s = env->NewSequentialFile(source, &srcfile, soptions); + unique_ptr destfile; + if (s.ok()) { + s = env->NewWritableFile(destination, &destfile, soptions); + } else { + return s; + } + + if (size == 0) { + // default argument means copy everything + if (s.ok()) { + s = env->GetFileSize(source, &size); + } else { + return s; + } + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t bytes_to_read = + std::min(static_cast(sizeof(buffer)), size); + if (s.ok()) { + s = srcfile->Read(bytes_to_read, &slice, buffer); + } + if (s.ok()) { + if (slice.size() == 0) { + return Status::Corruption("file too small"); + } + s = destfile->Append(slice); + } + if (!s.ok()) { + return s; + } + size -= slice.size(); + } + return Status::OK(); +} + +} // namespace rocksdb diff --git a/util/file_util.h b/util/file_util.h new file mode 100644 index 000000000..84b37345b --- /dev/null +++ b/util/file_util.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include + +#pragma once +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +extern Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size = 0); + +} // namespace rocksdb diff --git a/util/mock_env.cc b/util/mock_env.cc index 5a4c2c325..a88db18d5 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -559,6 +559,19 @@ Status MockEnv::RenameFile(const std::string& src, const std::string& dest) { return Status::OK(); } +Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { + auto s = NormalizePath(src); + auto t = NormalizePath(dest); + MutexLock lock(&mutex_); + if (file_map_.find(s) == file_map_.end()) { + return Status::IOError(s, "File not found"); + } + + DeleteFileInternal(t); + file_map_[t] = file_map_[s]; + return Status::OK(); +} + Status MockEnv::NewLogger(const std::string& fname, shared_ptr* result) { auto fn = NormalizePath(fname); diff --git a/util/mock_env.h b/util/mock_env.h index b92caa5cf..bbd191d78 100644 --- a/util/mock_env.h +++ b/util/mock_env.h @@ -69,6 +69,8 @@ class MockEnv : public EnvWrapper { virtual Status RenameFile(const std::string& src, const std::string& target); + virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status NewLogger(const std::string& fname, shared_ptr* result); From 04ca7481d202369a8bae06b253cfd4a7890ea511 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 11:52:17 -0800 Subject: [PATCH 487/829] Fix build --- db/compaction.h | 4 ++-- db/compaction_job_test.cc | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/db/compaction.h b/db/compaction.h index c4c412c40..4333cc208 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -189,8 +189,8 @@ class Compaction { CompressionType output_compression, bool seek_compaction = false, bool deletion_compaction = false); - CompactionInputFiles* 
TEST_GetInputFiles(int level) { - return &inputs_[level]; + CompactionInputFiles* TEST_GetInputFiles(int l) { + return &inputs_[l]; } private: diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index cdf1c704a..e0fffcf2e 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -164,7 +164,9 @@ TEST(CompactionJobTest, Simple) { mutex_.Unlock(); ASSERT_OK(compaction_job.Run()); mutex_.Lock(); - compaction_job.Install(Status::OK(), &mutex_); + Status s; + compaction_job.Install(&s, &mutex_); + ASSERT_OK(s); mutex_.Unlock(); mock_table_factory_->AssertLatestFile(expected_results); From 1fe7a4c62f09d871a9e7fb359e8ff80fa0e09bda Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 2 Nov 2014 01:08:41 +0100 Subject: [PATCH 488/829] [RocksJava] Test-framework integration --- java/Makefile | 79 ++++--- java/org/rocksdb/ColumnFamilyHandle.java | 8 +- java/org/rocksdb/RocksDB.java | 8 +- java/org/rocksdb/RocksIterator.java | 8 +- java/org/rocksdb/WriteBatch.java | 10 - java/org/rocksdb/WriteBatchTest.java | 124 ----------- java/org/rocksdb/test/BackupableDBTest.java | 38 +++- .../test/BlockBasedTableConfigTest.java | 10 +- java/org/rocksdb/test/ColumnFamilyTest.java | 21 +- .../rocksdb/test/ComparatorOptionsTest.java | 21 +- java/org/rocksdb/test/ComparatorTest.java | 20 +- .../rocksdb/test/DirectComparatorTest.java | 19 +- java/org/rocksdb/test/FilterTest.java | 15 +- java/org/rocksdb/test/MemTableTest.java | 45 ++-- java/org/rocksdb/test/MergeTest.java | 71 ++++--- .../rocksdb/test/PlainTableConfigTest.java | 11 +- .../rocksdb/test/PlatformRandomHelper.java | 4 +- java/org/rocksdb/test/ReadOnlyTest.java | 41 ++-- java/org/rocksdb/test/ReadOptionsTest.java | 17 +- java/org/rocksdb/test/RocksIteratorTest.java | 59 +++--- .../org/rocksdb/test/RocksMemoryResource.java | 21 ++ java/org/rocksdb/test/SnapshotTest.java | 133 ++++++------ .../rocksdb/test/StatisticsCollectorTest.java | 28 ++- java/org/rocksdb/test/WriteBatchTest.java | 130 ++++++++++++ java/rocksjni.pom | 194 +++++++++++++++--- java/rocksjni/write_batch.cc | 20 +- 26 files changed, 709 insertions(+), 446 deletions(-) delete mode 100644 java/org/rocksdb/WriteBatchTest.java create mode 100644 java/org/rocksdb/test/RocksMemoryResource.java create mode 100644 java/org/rocksdb/test/WriteBatchTest.java diff --git a/java/Makefile b/java/Makefile index 21066b991..d3bd8d8d4 100644 --- a/java/Makefile +++ b/java/Makefile @@ -32,8 +32,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.StringAppendOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ - org.rocksdb.WriteBatchInternal\ - org.rocksdb.WriteBatchTest\ + org.rocksdb.test.WriteBatchInternal\ + org.rocksdb.test.WriteBatchTest\ org.rocksdb.WriteOptions\ ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -43,18 +43,51 @@ ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar - ifeq ($(PLATFORM), OS_MACOSX) ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar endif +JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\ + org.rocksdb.test.BackupableDBTest\ + org.rocksdb.test.BlockBasedTableConfigTest\ + org.rocksdb.test.ColumnFamilyOptionsTest\ + org.rocksdb.test.ColumnFamilyTest\ + org.rocksdb.test.ComparatorOptionsTest\ + org.rocksdb.test.ComparatorTest\ + 
org.rocksdb.test.DBOptionsTest\ + org.rocksdb.test.DirectComparatorTest\ + org.rocksdb.test.FilterTest\ + org.rocksdb.test.FlushTest\ + org.rocksdb.test.KeyMayExistTest\ + org.rocksdb.test.MemTableTest\ + org.rocksdb.test.MergeTest\ + org.rocksdb.test.MixedOptionsTest\ + org.rocksdb.test.OptionsTest\ + org.rocksdb.test.PlainTableConfigTest\ + org.rocksdb.test.ReadOnlyTest\ + org.rocksdb.test.ReadOptionsTest\ + org.rocksdb.test.RocksIteratorTest\ + org.rocksdb.test.SnapshotTest\ + org.rocksdb.test.StatisticsCollectorTest\ + org.rocksdb.test.WirteBatchHandlerTest\ + org.rocksdb.test.WriteBatchTest\ + +JAVA_TEST_LIBDIR = ./test-libs/ +JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar +JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)hamcrest-core-1.3.jar +JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)mockito-all-1.9.5.jar +JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)cglib-2.2.2.jar +JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)assertj-core-1.7.0.jar +JAVA_TESTCLASSPATH = $(ROCKSDB_JAR):$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR):.:./* + clean: -find . -name "*.class" -exec rm {} \; -find . -name "hs*.log" -exec rm {} \; rm -rf javadoc/* + rm -rf test-libs/ javadocs: - mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org + mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org -exclude org.rocksdb.test java: javadocs javac org/rocksdb/util/*.java org/rocksdb/*.java @@ -76,33 +109,17 @@ column_family_sample: java java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBColumnFamilySample /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni -test: java - @rm -rf /tmp/rocksdbjni_* - javac org/rocksdb/test/*.java - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BlockBasedTableConfigTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DBOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ColumnFamilyOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FilterTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.FlushTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.KeyMayExistTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MemTableTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MergeTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.MixedOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.PlainTableConfigTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOnlyTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.RocksIteratorTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.SnapshotTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest - java -ea 
-Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ComparatorTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.DirectComparatorTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.WriteBatchHandlerTest - @rm -rf /tmp/rocksdbjni_* +resolve_test_deps: + mkdir -p "$(JAVA_TEST_LIBDIR)" + test -s "$(JAVA_JUNIT_JAR)" || curl -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12-beta-2/junit-4.12-beta-2.jar + test -s "$(JAVA_HAMCR_JAR)" || curl -L -o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar + test -s "$(JAVA_MOCKITO_JAR)" || curl -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.9.5/mockito-all-1.9.5.jar + test -s "$(JAVA_CGLIB_JAR)" || curl -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar + test -s "$(JAVA_ASSERTJ_JAR)" || curl -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar + +test: java resolve_test_deps + javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java + java -ea -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.junit.runner.JUnitCore $(JAVA_TESTS) db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/org/rocksdb/ColumnFamilyHandle.java b/java/org/rocksdb/ColumnFamilyHandle.java index 92a4d7cef..ed8417728 100644 --- a/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/org/rocksdb/ColumnFamilyHandle.java @@ -30,9 +30,11 @@ public class ColumnFamilyHandle extends RocksObject { * before freeing the native handle.
            */ @Override protected void disposeInternal() { - assert(isInitialized()); - if (rocksDB_.isInitialized()) { - disposeInternal(nativeHandle_); + synchronized (rocksDB_) { + assert (isInitialized()); + if (rocksDB_.isInitialized()) { + disposeInternal(nativeHandle_); + } } } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 5ebbc609e..730c1940d 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -363,8 +363,10 @@ public class RocksDB extends RocksObject { } @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); + synchronized (this) { + assert (isInitialized()); + disposeInternal(nativeHandle_); + } } /** @@ -1150,6 +1152,8 @@ public class RocksDB extends RocksObject { throws RocksDBException, IllegalArgumentException { // throws RocksDBException if something goes wrong dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_); + // After the drop the native handle is not valid anymore + columnFamilyHandle.nativeHandle_ = 0; } /** diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index fee3f459d..b947b2c83 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -143,9 +143,11 @@ public class RocksIterator extends RocksObject { * before freeing the native handle.
            */ @Override protected void disposeInternal() { - assert(isInitialized()); - if (rocksDB_.isInitialized()) { - disposeInternal(nativeHandle_); + synchronized (rocksDB_) { + assert (isInitialized()); + if (rocksDB_.isInitialized()) { + disposeInternal(nativeHandle_); + } } } diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 5bd1119da..3407033ab 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -221,13 +221,3 @@ public class WriteBatch extends RocksObject { private native void disposeInternal(long handle); } } - -/** - * Package-private class which provides java api to access - * c++ WriteBatchInternal. - */ -class WriteBatchInternal { - static native void setSequence(WriteBatch batch, long sn); - static native long sequence(WriteBatch batch); - static native void append(WriteBatch b1, WriteBatch b2); -} diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java deleted file mode 100644 index 770cd85b8..000000000 --- a/java/org/rocksdb/WriteBatchTest.java +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -package org.rocksdb; - -import java.util.*; -import java.io.UnsupportedEncodingException; - -/** - * This class mimics the db/write_batch_test.cc in the c++ rocksdb library. - */ -public class WriteBatchTest { - static { - RocksDB.loadLibrary(); - } - - public static void main(String args[]) { - System.out.println("Testing WriteBatchTest.Empty ==="); - Empty(); - - System.out.println("Testing WriteBatchTest.Multiple ==="); - Multiple(); - - System.out.println("Testing WriteBatchTest.Append ==="); - Append(); - - System.out.println("Testing WriteBatchTest.Blob ==="); - Blob(); - - // The following tests have not yet ported. 
- // Continue(); - // PutGatherSlices(); - - System.out.println("Passed all WriteBatchTest!"); - } - - static void Empty() { - WriteBatch batch = new WriteBatch(); - assert(batch.count() == 0); - } - - static void Multiple() { - try { - WriteBatch batch = new WriteBatch(); - batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - batch.remove("box".getBytes("US-ASCII")); - batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - WriteBatchInternal.setSequence(batch, 100); - assert(100 == WriteBatchInternal.sequence(batch)); - assert(3 == batch.count()); - assert(("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100") - .equals(new String(getContents(batch), "US-ASCII"))); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static void Append() { - WriteBatch b1 = new WriteBatch(); - WriteBatch b2 = new WriteBatch(); - WriteBatchInternal.setSequence(b1, 200); - WriteBatchInternal.setSequence(b2, 300); - WriteBatchInternal.append(b1, b2); - assert(getContents(b1).length == 0); - assert(b1.count() == 0); - try { - b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); - assert(1 == b1.count()); - b2.clear(); - b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert(("Put(a, va)@200" + - "Put(b, vb)@201") - .equals(new String(getContents(b1), "US-ASCII"))); - assert(2 == b1.count()); - b2.remove("foo".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert(("Put(a, va)@200" + - "Put(b, vb)@202" + - "Put(b, vb)@201" + - "Delete(foo)@203") - .equals(new String(getContents(b1), "US-ASCII"))); - assert(4 == b1.count()); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static void Blob() { - WriteBatch batch = new WriteBatch(); - try { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); - batch.putLogData("blob1".getBytes("US-ASCII")); - batch.remove("k2".getBytes("US-ASCII")); - batch.putLogData("blob2".getBytes("US-ASCII")); - batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - assert(5 == batch.count()); - assert(("Merge(foo, bar)@4" + - "Put(k1, v1)@0" + - "Delete(k2)@3" + - "Put(k2, v2)@1" + - "Put(k3, v3)@2") - .equals(new String(getContents(batch), "US-ASCII"))); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static native byte[] getContents(WriteBatch batch); -} diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 2115e9ca9..f0a6708c1 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -5,28 +5,41 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; import java.util.List; public class BackupableDBTest { - static final String db_path = "/tmp/rocksdbjni_backupable_db_test"; - static final String backup_path = "/tmp/rocksdbjni_backupable_db_backup_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); 
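The converted tests all follow the same JUnit 4 shape shown here for BackupableDBTest: a shared @ClassRule that loads the native library and triggers GC, a per-test TemporaryFolder replacing the hard-coded /tmp paths, and @Test methods replacing main(). A condensed sketch of that pattern (the class and test names are illustrative, not part of the patch):

package org.rocksdb.test;

import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class ExampleDbTest {
  // Loads the native library once and forces GC/finalization after the class.
  @ClassRule
  public static final RocksMemoryResource rocksMemoryResource =
      new RocksMemoryResource();

  // JUnit creates and removes a fresh directory for every test method.
  @Rule
  public TemporaryFolder dbFolder = new TemporaryFolder();

  @Test
  public void shouldPutAndGet() throws RocksDBException {
    Options options = new Options().setCreateIfMissing(true);
    RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
    db.put("key".getBytes(), "value".getBytes());
    assert(new String(db.get("key".getBytes())).equals("value"));
    db.close();
    options.dispose();
  }
}

The Makefile's new test target runs these classes through org.junit.runner.JUnitCore against the classpath assembled by resolve_test_deps.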
+ + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule + public TemporaryFolder backupFolder = new TemporaryFolder(); + + @Test + public void shouldTestBackupableDb() { Options opt = new Options(); opt.setCreateIfMissing(true); - BackupableDBOptions bopt = new BackupableDBOptions(backup_path, false, + BackupableDBOptions bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath(), false, true, false, true, 0, 0); BackupableDB bdb = null; List backupInfos; List restoreInfos; try { - bdb = BackupableDB.open(opt, bopt, db_path); + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); bdb.put("abc".getBytes(), "def".getBytes()); bdb.put("ghi".getBytes(), "jkl".getBytes()); @@ -74,7 +87,9 @@ public class BackupableDBTest { assert(restoreInfos.get(0).numberFiles() == backupInfos.get(0).numberFiles()); - rdb.restoreDBFromLatestBackup(db_path, db_path, + rdb.restoreDBFromLatestBackup( + dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), ropt); // do nothing because there is only one backup rdb.purgeOldBackups(1); @@ -84,7 +99,8 @@ public class BackupableDBTest { ropt.dispose(); // verify that backed up data contains deleted record - bdb = BackupableDB.open(opt, bopt, db_path); + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); value = bdb.get("abc".getBytes()); assert(new String(value).equals("def")); @@ -110,7 +126,6 @@ public class BackupableDBTest { assert(backupInfos.size() == 2); assert(backupInfos.get(0).backupId() == 4); assert(backupInfos.get(1).backupId() == 5); - System.out.println("Backup and restore test passed"); } catch (RocksDBException e) { System.err.format("[ERROR]: %s%n", e); e.printStackTrace(); @@ -121,5 +136,6 @@ public class BackupableDBTest { bdb.close(); } } + System.out.println("Passed BackupableDBTest."); } } diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 3f54d5a78..8c73915ee 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -5,13 +5,19 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.BlockBasedTableConfig; import org.rocksdb.ChecksumType; import org.rocksdb.IndexType; public class BlockBasedTableConfigTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); - public static void main(String[] args) { + @Test + public void shouldTestBlockBasedTableConfig() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setNoBlockCache(true); @@ -42,6 +48,6 @@ public class BlockBasedTableConfigTest { == 4); blockBasedTableConfig.setCacheNumShardBits(5); assert(blockBasedTableConfig.cacheNumShardBits() == 5); - System.out.println("BlockBasedTableConfig test passed"); + System.out.println("Passed BlockBasedTableConfigTest."); } } diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 350c4446c..e52eac589 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -9,16 +9,25 @@ import java.util.HashMap; import java.util.List; import java.util.ArrayList; import java.util.Map; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; public class ColumnFamilyTest { - static final 
String db_path = "/tmp/rocksdbjni_columnfamily_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + @Test + public void shouldTestColumnFamilies() { + String db_path = dbFolder.getRoot().getAbsolutePath(); RocksDB db = null; Options options = new Options(); options.setCreateIfMissing(true); @@ -274,7 +283,6 @@ public class ColumnFamilyTest { assert(false); } - System.out.println("Passed ColumnFamilyTest"); // free cf handles before database close for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.dispose(); @@ -283,5 +291,6 @@ public class ColumnFamilyTest { db.close(); // be sure to dispose c++ pointers options.dispose(); + System.out.println("Passed ColumnFamilyTest."); } } diff --git a/java/org/rocksdb/test/ComparatorOptionsTest.java b/java/org/rocksdb/test/ComparatorOptionsTest.java index e25209392..21f4fc2a1 100644 --- a/java/org/rocksdb/test/ComparatorOptionsTest.java +++ b/java/org/rocksdb/test/ComparatorOptionsTest.java @@ -5,27 +5,30 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.ComparatorOptions; -import org.rocksdb.RocksDB; -import java.util.Random; +import static org.assertj.core.api.Assertions.assertThat; public class ComparatorOptionsTest { - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); - public static void main(String[] args) { + @Test + public void shouldTestComparatorOptions() { final ComparatorOptions copt = new ComparatorOptions(); - Random rand = new Random(); + + assertThat(copt).isNotNull(); { // UseAdaptiveMutex test copt.setUseAdaptiveMutex(true); - assert(copt.useAdaptiveMutex() == true); + assertThat(copt.useAdaptiveMutex()).isTrue(); copt.setUseAdaptiveMutex(false); - assert(copt.useAdaptiveMutex() == false); + assertThat(copt.useAdaptiveMutex()).isFalse(); } copt.dispose(); diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index 34d7c78df..d65a0653a 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -5,19 +5,26 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; import java.io.IOException; import java.nio.file.FileSystems; public class ComparatorTest { - private static final String db_path = "/tmp/comparator_db"; - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); - public static void main(String[] args) throws IOException { + @Test + public void shouldTestComparator() throws IOException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override @@ -38,7 +45,8 @@ public class ComparatorTest { }; // test the round-tripability of keys written and read with the Comparator - comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); System.out.println("Passed ComparatorTest"); } diff --git a/java/org/rocksdb/test/DirectComparatorTest.java 
b/java/org/rocksdb/test/DirectComparatorTest.java index 9df06eb73..562038897 100644 --- a/java/org/rocksdb/test/DirectComparatorTest.java +++ b/java/org/rocksdb/test/DirectComparatorTest.java @@ -5,19 +5,25 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; import java.io.IOException; import java.nio.file.FileSystems; public class DirectComparatorTest { - private static final String db_path = "/tmp/direct_comparator_db"; + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); - static { - RocksDB.loadLibrary(); - } + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); - public static void main(String[] args) throws IOException { + @Test + public void shouldTestDirectComparator() throws IOException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override @@ -41,7 +47,8 @@ public class DirectComparatorTest { }; // test the round-tripability of keys written and read with the DirectComparator - comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(db_path)); + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); System.out.println("Passed DirectComparatorTest"); } diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java index fc4fabf56..3894167b0 100644 --- a/java/org/rocksdb/test/FilterTest.java +++ b/java/org/rocksdb/test/FilterTest.java @@ -5,13 +5,18 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.*; public class FilterTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void shouldTestFilter() { Options options = new Options(); // test table config BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); @@ -37,6 +42,6 @@ public class FilterTest { blockConfig = null; System.gc(); System.runFinalization(); - System.out.println("Filter test passed"); + System.out.println("Passed FilterTest."); } } diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/org/rocksdb/test/MemTableTest.java index 0d1e4d54a..0b1244fc2 100644 --- a/java/org/rocksdb/test/MemTableTest.java +++ b/java/org/rocksdb/test/MemTableTest.java @@ -5,13 +5,18 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.*; public class MemTableTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void shouldTestMemTable() throws RocksDBException { Options options = new Options(); // Test HashSkipListMemTableConfig HashSkipListMemTableConfig memTableConfig = @@ -25,11 +30,7 @@ public class MemTableTest { assert(memTableConfig.branchingFactor() == 4); memTableConfig.setBranchingFactor(6); assert(memTableConfig.branchingFactor() == 6); - try { - options.setMemTableConfig(memTableConfig); - } catch (RocksDBException e) { - assert(false); - } + options.setMemTableConfig(memTableConfig); memTableConfig = null; options.dispose(); System.gc(); @@ -41,11 +42,7 @@ public class MemTableTest { assert(skipMemTableConfig.lookahead() == 0); skipMemTableConfig.setLookahead(20); assert(skipMemTableConfig.lookahead() 
== 20); - try { - options.setMemTableConfig(skipMemTableConfig); - } catch (RocksDBException e) { - assert(false); - } + options.setMemTableConfig(skipMemTableConfig); skipMemTableConfig = null; options.dispose(); System.gc(); @@ -67,21 +64,17 @@ public class MemTableTest { assert(hashLinkedListMemTableConfig. bucketEntriesLoggingThreshold() == 200); assert(hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush() == true); + ifLogBucketDistWhenFlush()); hashLinkedListMemTableConfig. setIfLogBucketDistWhenFlush(false); - assert(hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush() == false); + assert(!hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()); assert(hashLinkedListMemTableConfig. thresholdUseSkiplist() == 256); hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); assert(hashLinkedListMemTableConfig. thresholdUseSkiplist() == 29); - try { - options.setMemTableConfig(hashLinkedListMemTableConfig); - } catch (RocksDBException e) { - assert(false); - } + options.setMemTableConfig(hashLinkedListMemTableConfig); hashLinkedListMemTableConfig = null; options.dispose(); System.gc(); @@ -93,15 +86,11 @@ public class MemTableTest { assert(vectorMemTableConfig.reservedSize() == 0); vectorMemTableConfig.setReservedSize(123); assert(vectorMemTableConfig.reservedSize() == 123); - try { - options.setMemTableConfig(vectorMemTableConfig); - } catch (RocksDBException e) { - assert(false); - } + options.setMemTableConfig(vectorMemTableConfig); vectorMemTableConfig = null; options.dispose(); System.gc(); System.runFinalization(); - System.out.println("Mem-table test passed"); + System.out.println("Passed MemTableTest."); } } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index d802559e1..31a3fe5cb 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -7,19 +7,33 @@ package org.rocksdb.test; import java.util.List; import java.util.ArrayList; + +import org.junit.AfterClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; public class MergeTest { - static final String db_path_string = "/tmp/rocksdbjni_mergestring_db"; - static final String db_cf_path_string = "/tmp/rocksdbjni_mergecfstring_db"; - static final String db_path_operator = "/tmp/rocksdbjni_mergeoperator_db"; - static { - RocksDB.loadLibrary(); + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @AfterClass + public static void printMergePass(){ + System.out.println("Passed MergeTest."); } - public static void testStringOption() + @Test + public void shouldTestStringOption() throws InterruptedException, RocksDBException { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); Options opt = new Options(); opt.setCreateIfMissing(true); opt.setMergeOperatorName("stringappend"); @@ -38,23 +52,26 @@ public class MergeTest { assert(strValue.equals("aa,bb")); } - public static void testCFStringOption() + @Test + public void shouldTestCFStringOption() throws InterruptedException, RocksDBException { DBOptions opt = new DBOptions(); + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); opt.setCreateIfMissing(true); opt.setCreateMissingColumnFamilies(true); List cfDescr = - new ArrayList(); + new ArrayList<>(); List columnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); cfDescr.add(new 
ColumnFamilyDescriptor("default", new ColumnFamilyOptions().setMergeOperatorName( "stringappend"))); cfDescr.add(new ColumnFamilyDescriptor("default", new ColumnFamilyOptions().setMergeOperatorName( "stringappend"))); - RocksDB db = RocksDB.open(opt, db_cf_path_string, + RocksDB db = RocksDB.open(opt, db_path_string, cfDescr, columnFamilyHandleList); // writing aa under key @@ -75,8 +92,11 @@ public class MergeTest { assert(strValue.equals("aa,bb")); } - public static void testOperatorOption() + @Test + public void shouldTestOperatorOption() throws InterruptedException, RocksDBException { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); Options opt = new Options(); opt.setCreateIfMissing(true); @@ -98,26 +118,29 @@ public class MergeTest { assert(strValue.equals("aa,bb")); } - public static void testCFOperatorOption() + @Test + public void shouldTestCFOperatorOption() throws InterruptedException, RocksDBException { DBOptions opt = new DBOptions(); + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt.setCreateIfMissing(true); opt.setCreateMissingColumnFamilies(true); StringAppendOperator stringAppendOperator = new StringAppendOperator(); List cfDescr = - new ArrayList(); + new ArrayList<>(); List columnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); cfDescr.add(new ColumnFamilyDescriptor("default", new ColumnFamilyOptions().setMergeOperator( stringAppendOperator))); cfDescr.add(new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions().setMergeOperator( stringAppendOperator))); - RocksDB db = RocksDB.open(opt, db_path_operator, + RocksDB db = RocksDB.open(opt, db_path_string, cfDescr, columnFamilyHandleList); - // writing aa under key db.put(columnFamilyHandleList.get(1), "cfkey".getBytes(), "aa".getBytes()); @@ -139,14 +162,18 @@ public class MergeTest { value = db.get(columnFamilyHandle, "cfkey2".getBytes()); String strValueTmpCf = new String(value); + columnFamilyHandle.dispose(); db.close(); opt.dispose(); assert(strValue.equals("aa,bb")); assert(strValueTmpCf.equals("xx,yy")); } - public static void testOperatorGcBehaviour() + @Test + public void shouldTestOperatorGcBehaviour() throws RocksDBException { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); Options opt = new Options(); opt.setCreateIfMissing(true); StringAppendOperator stringAppendOperator = new StringAppendOperator(); @@ -185,14 +212,4 @@ public class MergeTest { System.gc(); System.runFinalization(); } - - public static void main(String[] args) - throws InterruptedException, RocksDBException { - testStringOption(); - testCFStringOption(); - testOperatorOption(); - testCFOperatorOption(); - testOperatorGcBehaviour(); - System.out.println("Passed MergeTest."); - } } diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index 888f35d81..f4cebb155 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -5,12 +5,19 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.EncodingType; import org.rocksdb.PlainTableConfig; public class PlainTableConfigTest { - public static void main(String[] args) { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void shouldTestPlainTableConfig() { PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setKeySize(5); assert(plainTableConfig.keySize() == 5); @@ -29,6 +36,6 
@@ public class PlainTableConfigTest { assert(plainTableConfig.fullScanMode()); plainTableConfig.setStoreIndexInFile(true); assert(plainTableConfig.storeIndexInFile()); - System.out.println("PlainTableConfig test passed"); + System.out.println("Passed PlainTableConfigTest."); } } diff --git a/java/org/rocksdb/test/PlatformRandomHelper.java b/java/org/rocksdb/test/PlatformRandomHelper.java index 7112fc4f1..d43f4a4f0 100644 --- a/java/org/rocksdb/test/PlatformRandomHelper.java +++ b/java/org/rocksdb/test/PlatformRandomHelper.java @@ -18,11 +18,11 @@ public class PlatformRandomHelper { * @return boolean value indicating if operating system is 64 Bit. */ public static boolean isOs64Bit(){ - boolean is64Bit = false; + boolean is64Bit; if (System.getProperty("os.name").contains("Windows")) { is64Bit = (System.getenv("ProgramFiles(x86)") != null); } else { - is64Bit = (System.getProperty("os.arch").indexOf("64") != -1); + is64Bit = (System.getProperty("os.arch").contains("64")); } return is64Bit; } diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java index 21b5eb9ae..057d2d4b8 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/org/rocksdb/test/ReadOnlyTest.java @@ -4,31 +4,41 @@ // of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; import java.util.ArrayList; import java.util.List; public class ReadOnlyTest { - static final String DB_PATH = "/tmp/rocksdbjni_readonly_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args){ + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void shouldTestReadOnlyOpen() { RocksDB db = null, db2 = null, db3 = null; List columnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); List db2ColumnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); List db3ColumnFamilyHandleList = - new ArrayList(); + new ArrayList<>(); Options options = new Options(); options.setCreateIfMissing(true); try { - db = RocksDB.open(options, DB_PATH); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); db.put("key".getBytes(), "value".getBytes()); - db2 = RocksDB.openReadOnly(DB_PATH); + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath()); assert("value".equals(new String(db2.get("key".getBytes())))); db.close(); db2.close(); @@ -38,7 +48,7 @@ public class ReadOnlyTest { new ArrayList(); cfNames.add(new ColumnFamilyDescriptor("default")); - db = RocksDB.open(DB_PATH, cfNames, columnFamilyHandleList); + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); columnFamilyHandleList.add(db.createColumnFamily( new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); columnFamilyHandleList.add(db.createColumnFamily( @@ -46,15 +56,16 @@ public class ReadOnlyTest { db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); - db2 = RocksDB.openReadOnly(DB_PATH, cfNames, db2ColumnFamilyHandleList); + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfNames, db2ColumnFamilyHandleList); assert(db2.get("key2".getBytes())==null); assert(db2.get(columnFamilyHandleList.get(0), "key2".getBytes())==null); List cfNewName = - new ArrayList(); + new ArrayList<>(); 
cfNewName.add(new ColumnFamilyDescriptor("default")); cfNewName.add(new ColumnFamilyDescriptor("new_cf2")); - db3 = RocksDB.openReadOnly(DB_PATH, cfNewName, db3ColumnFamilyHandleList); + db3 = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), cfNewName, db3ColumnFamilyHandleList); assert(new String(db3.get(db3ColumnFamilyHandleList.get(1), "key2".getBytes())).equals("value2")); }catch (RocksDBException e){ @@ -125,6 +136,6 @@ public class ReadOnlyTest { columnFamilyHandle.dispose(); } db3.close(); - System.out.println("Passed ReadOnlyTest"); + System.out.println("Passed ReadOnlyTest."); } } diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java index b3b5b2690..27d757a10 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -6,14 +6,20 @@ package org.rocksdb.test; import java.util.Random; + +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.RocksDB; import org.rocksdb.ReadOptions; public class ReadOptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void shouldTestReadOptions() { ReadOptions opt = new ReadOptions(); Random rand = new Random(); { // VerifyChecksums test @@ -33,8 +39,7 @@ public class ReadOptionsTest { opt.setTailing(boolValue); assert(opt.tailing() == boolValue); } - opt.dispose(); - System.out.println("Passed ReadOptionsTest"); + System.out.println("Passed ReadOptionsTest."); } } diff --git a/java/org/rocksdb/test/RocksIteratorTest.java b/java/org/rocksdb/test/RocksIteratorTest.java index 1e2fa8c6d..7de27cad9 100644 --- a/java/org/rocksdb/test/RocksIteratorTest.java +++ b/java/org/rocksdb/test/RocksIteratorTest.java @@ -4,45 +4,46 @@ // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb.test; -import org.rocksdb.ColumnFamilyHandle; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.Options; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; -import java.util.ArrayList; -import java.util.List; - public class RocksIteratorTest { - static final String DB_PATH = "/tmp/rocksdbjni_iterator_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args){ + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void shouldTestRocksIteratorGc() + throws RocksDBException { RocksDB db; Options options = new Options(); options.setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); - try { - db = RocksDB.open(options, DB_PATH); - db.put("key".getBytes(), "value".getBytes()); - RocksIterator iter = db.newIterator(); - RocksIterator iter2 = db.newIterator(); - RocksIterator iter3 = db.newIterator(); - iter = null; - db.close(); - db = null; - iter2 = null; - System.gc(); - System.runFinalization(); - System.out.println("Passed RocksIterator Test"); - iter3.dispose(); - System.gc(); - System.runFinalization(); - }catch (RocksDBException e){ - e.printStackTrace(); - assert(false); - } + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + RocksIterator iter = db.newIterator(); + RocksIterator iter2 = db.newIterator(); + RocksIterator iter3 = db.newIterator(); + iter = null; + db.close(); + db = null; + iter2 = null; + System.gc(); + System.runFinalization(); + iter3.dispose(); + System.gc(); + System.runFinalization(); + System.out.println("Passed RocksIteratorTest."); } } diff --git a/java/org/rocksdb/test/RocksMemoryResource.java b/java/org/rocksdb/test/RocksMemoryResource.java new file mode 100644 index 000000000..eabbc822e --- /dev/null +++ b/java/org/rocksdb/test/RocksMemoryResource.java @@ -0,0 +1,21 @@ +package org.rocksdb.test; + +import org.junit.rules.ExternalResource; +import org.rocksdb.RocksDB; + +/** + * Resource to trigger garbage collection after each test + * run. + */ +public class RocksMemoryResource extends ExternalResource { + + static { + RocksDB.loadLibrary(); + } + + @Override + protected void after() { + System.gc(); + System.runFinalization(); + } +} \ No newline at end of file diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index 67d0a83ef..ad3546de3 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -4,84 +4,79 @@ // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb.test; -import java.util.ArrayList; -import java.util.List; - -import org.rocksdb.ColumnFamilyHandle; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.Options; import org.rocksdb.ReadOptions; import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.Snapshot; -import org.rocksdb.WriteBatch; -import org.rocksdb.WriteOptions; +public class SnapshotTest { -public class SnapshotTest -{ - static final String DB_PATH = "/tmp/rocksdbjni_snapshot_test"; - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); - public static void main(String[] args){ - RocksDB db = null; + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void shouldTestSnapshots() throws RocksDBException { + RocksDB db; Options options = new Options(); options.setCreateIfMissing(true); - try { - db = RocksDB.open(options, DB_PATH); - db.put("key".getBytes(), "value".getBytes()); - // Get new Snapshot of database - Snapshot snapshot = db.getSnapshot(); - ReadOptions readOptions = new ReadOptions(); - // set snapshot in ReadOptions - readOptions.setSnapshot(snapshot); - // retrieve key value pair - assert(new String(db.get("key".getBytes())) - .equals("value")); - // retrieve key value pair created before - // the snapshot was made - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); - // add new key/value pair - db.put("newkey".getBytes(), "newvalue".getBytes()); - // using no snapshot the latest db entries - // will be taken into account - assert(new String(db.get("newkey".getBytes())) - .equals("newvalue")); - // snapshopot was created before newkey - assert(db.get(readOptions, "newkey".getBytes()) - == null); - // Retrieve snapshot from read options - Snapshot sameSnapshot = readOptions.snapshot(); - readOptions.setSnapshot(sameSnapshot); - // results must be the same with new Snapshot - // instance using the same native pointer - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); - // update key value pair to newvalue - db.put("key".getBytes(), "newvalue".getBytes()); - // read with previously created snapshot will - // read previous version of key value pair - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); - // read for newkey using the snapshot must be - // null - assert(db.get(readOptions, "newkey".getBytes()) - == null); - // setting null to snapshot in ReadOptions leads - // to no Snapshot being used. 
- readOptions.setSnapshot(null); - assert(new String(db.get(readOptions, - "newkey".getBytes())).equals("newvalue")); - // release Snapshot - db.releaseSnapshot(snapshot); - // Close database - db.close(); - }catch (RocksDBException e){ - e.printStackTrace(); - assert(false); - } - System.out.println("Passed SnapshotTest"); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + ReadOptions readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + // retrieve key value pair + assert(new String(db.get("key".getBytes())) + .equals("value")); + // retrieve key value pair created before + // the snapshot was made + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // add new key/value pair + db.put("newkey".getBytes(), "newvalue".getBytes()); + // using no snapshot the latest db entries + // will be taken into account + assert(new String(db.get("newkey".getBytes())) + .equals("newvalue")); + // snapshopot was created before newkey + assert(db.get(readOptions, "newkey".getBytes()) + == null); + // Retrieve snapshot from read options + Snapshot sameSnapshot = readOptions.snapshot(); + readOptions.setSnapshot(sameSnapshot); + // results must be the same with new Snapshot + // instance using the same native pointer + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // update key value pair to newvalue + db.put("key".getBytes(), "newvalue".getBytes()); + // read with previously created snapshot will + // read previous version of key value pair + assert(new String(db.get(readOptions, + "key".getBytes())).equals("value")); + // read for newkey using the snapshot must be + // null + assert(db.get(readOptions, "newkey".getBytes()) + == null); + // setting null to snapshot in ReadOptions leads + // to no Snapshot being used. 
+ readOptions.setSnapshot(null); + assert(new String(db.get(readOptions, + "newkey".getBytes())).equals("newvalue")); + // release Snapshot + db.releaseSnapshot(snapshot); + // Close database + db.close(); } } diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java index 5298aa46a..b748c21ce 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/org/rocksdb/test/StatisticsCollectorTest.java @@ -6,20 +6,32 @@ package org.rocksdb.test; import java.util.Collections; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; +import static org.assertj.core.api.Assertions.assertThat; + public class StatisticsCollectorTest { - static final String db_path = "/tmp/rocksdbjni_statistics_collector_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void shouldTestStatisticsCollector() throws InterruptedException, RocksDBException { Options opt = new Options().createStatistics().setCreateIfMissing(true); Statistics stats = opt.statisticsPtr(); - RocksDB db = RocksDB.open(opt, db_path); + RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); StatsCallbackMock callback = new StatsCallbackMock(); StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); @@ -30,8 +42,8 @@ public class StatisticsCollectorTest { Thread.sleep(1000); - assert(callback.tickerCallbackCount > 0); - assert(callback.histCallbackCount > 0); + assertThat(callback.tickerCallbackCount).isGreaterThan(0); + assertThat(callback.histCallbackCount).isGreaterThan(0); statsCollector.shutDown(1000); diff --git a/java/org/rocksdb/test/WriteBatchTest.java b/java/org/rocksdb/test/WriteBatchTest.java new file mode 100644 index 000000000..72e0e464e --- /dev/null +++ b/java/org/rocksdb/test/WriteBatchTest.java @@ -0,0 +1,130 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +package org.rocksdb.test; + +import org.junit.AfterClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.WriteBatch; + +import java.io.UnsupportedEncodingException; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * This class mimics the db/write_batch_test.cc + * in the c++ rocksdb library. 
+ * + * Not ported yet: + * + * Continue(); + * PutGatherSlices(); + */ +public class WriteBatchTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @AfterClass + public static void printMergePass(){ + System.out.println("Passed WriteBatchTest."); + } + + @Test + public void shouldTestEmptyWriteBatch() { + WriteBatch batch = new WriteBatch(); + assertThat(batch.count()).isEqualTo(0); + } + + @Test + public void shouldTestMultipleBatchOperations() + throws UnsupportedEncodingException { + WriteBatch batch = new WriteBatch(); + batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.remove("box".getBytes("US-ASCII")); + batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); + WriteBatchInternal.setSequence(batch, 100); + assertThat(WriteBatchInternal.sequence(batch)). + isNotNull(). + isEqualTo(100); + assertThat(batch.count()).isEqualTo(3); + assertThat(new String(getContents(batch), "US-ASCII")). + isEqualTo("Put(baz, boo)@102" + + "Delete(box)@101" + + "Put(foo, bar)@100"); + } + + @Test + public void shouldTestAppendOperation() + throws UnsupportedEncodingException { + WriteBatch b1 = new WriteBatch(); + WriteBatch b2 = new WriteBatch(); + WriteBatchInternal.setSequence(b1, 200); + WriteBatchInternal.setSequence(b2, 300); + WriteBatchInternal.append(b1, b2); + assertThat(getContents(b1).length).isEqualTo(0); + assertThat(b1.count()).isEqualTo(0); + b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assertThat("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(1); + b2.clear(); + b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@201") + .equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(2); + b2.remove("foo".getBytes("US-ASCII")); + WriteBatchInternal.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@202" + + "Put(b, vb)@201" + + "Delete(foo)@203") + .equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(4); + } + + @Test + public void shouldTestBlobOperation() + throws UnsupportedEncodingException { + WriteBatch batch = new WriteBatch(); + batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); + batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.putLogData("blob1".getBytes("US-ASCII")); + batch.remove("k2".getBytes("US-ASCII")); + batch.putLogData("blob2".getBytes("US-ASCII")); + batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + assertThat(batch.count()).isEqualTo(5); + assertThat(("Merge(foo, bar)@4" + + "Put(k1, v1)@0" + + "Delete(k2)@3" + + "Put(k2, v2)@1" + + "Put(k3, v3)@2") + .equals(new String(getContents(batch), "US-ASCII"))); + } + + static native byte[] getContents(WriteBatch batch); +} + +/** + * Package-private class which provides java api to access + * c++ WriteBatchInternal. 
+ */ +class WriteBatchInternal { + static native void setSequence(WriteBatch batch, long sn); + static native long sequence(WriteBatch batch); + static native void append(WriteBatch b1, WriteBatch b2); +} diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 554357031..d8fe09fe9 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -1,34 +1,164 @@ - - 4.0.0 - RocksDB JNI - http://rocksdb.org/ - org.rocksdb - rocksdbjni - 3.6.0 - RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files for Mac OSX. - - - Apache License 2.0 - http://www.apache.org/licenses/LICENSE-2.0.html - repo - - - - scm:git:git://github.com/dropwizard/metrics.git - scm:git:git@github.com:dropwizard/metrics.git - http://github.com/dropwizard/metrics/ - HEAD - - - - Facebook - help@facebook.com - America/New_York - - architect - - - + + 4.0.0 + RocksDB JNI + http://rocksdb.org/ + org.rocksdb + rocksdbjni + 3.6.0 + RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files + for Mac OSX. + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html + repo + + + + scm:git:git://github.com/dropwizard/metrics.git + scm:git:git@github.com:dropwizard/metrics.git + http://github.com/dropwizard/metrics/ + HEAD + + + + Facebook + help@facebook.com + America/New_York + + architect + + + + + + UTF-8 + + + + + + ${project.basedir} + + + + ${project.basedir} + + **/* + + + + + ${project.basedir} + + + + ${project.basedir} + + *.so + *.jar + *.jnilib + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.0.2 + + 1.7 + 1.7 + + + + *.java + + org/rocksdb/benchmark/*.java + + org/rocksdb/test/*.java + org/rocksdb/WriteBatchTest.java + + + + + default-testCompile + test-compile + + + + + %regex[org/rocksdb/[^WriteBatchTest].*java] + + *.java + + org/rocksdb/benchmark/*.java + + org/rocksdb/util/*.java + + + + org/rocksdb/test/*.java + + + + testCompile + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.17 + + ${argLine} + + + + org.jacoco + jacoco-maven-plugin + 0.7.1.201405082137 + + + + prepare-agent + + + + report + prepare-package + + report + + + + + + + + + + junit + junit + 4.12-beta-2 + test + + + org.assertj + assertj-core + 1.7.0 + test + + + org.mockito + mockito-all + 1.9.5 + + diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index f1d0a89d6..d243c87a0 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -9,8 +9,8 @@ #include "include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" -#include "include/org_rocksdb_WriteBatchInternal.h" -#include "include/org_rocksdb_WriteBatchTest.h" +#include "include/org_rocksdb_test_WriteBatchInternal.h" +#include "include/org_rocksdb_test_WriteBatchTest.h" #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "rocksdb/db.h" @@ -257,11 +257,11 @@ void Java_org_rocksdb_WriteBatch_disposeInternal( } /* - * Class: org_rocksdb_WriteBatchInternal + * Class: org_rocksdb_test_WriteBatchInternal * Method: setSequence * Signature: (Lorg/rocksdb/WriteBatch;J)V */ -void Java_org_rocksdb_WriteBatchInternal_setSequence( +void Java_org_rocksdb_test_WriteBatchInternal_setSequence( JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -271,11 +271,11 @@ void Java_org_rocksdb_WriteBatchInternal_setSequence( } /* - * Class: org_rocksdb_WriteBatchInternal + * Class: org_rocksdb_test_WriteBatchInternal * 
Method: sequence * Signature: (Lorg/rocksdb/WriteBatch;)J */ -jlong Java_org_rocksdb_WriteBatchInternal_sequence( +jlong Java_org_rocksdb_test_WriteBatchInternal_sequence( JNIEnv* env, jclass jclazz, jobject jobj) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -284,11 +284,11 @@ jlong Java_org_rocksdb_WriteBatchInternal_sequence( } /* - * Class: org_rocksdb_WriteBatchInternal + * Class: org_rocksdb_test_WriteBatchInternal * Method: append * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V */ -void Java_org_rocksdb_WriteBatchInternal_append( +void Java_org_rocksdb_test_WriteBatchInternal_append( JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); assert(wb1 != nullptr); @@ -321,11 +321,11 @@ void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( } /* - * Class: org_rocksdb_WriteBatchTest + * Class: org_rocksdb_test_WriteBatchTest * Method: getContents * Signature: (Lorg/rocksdb/WriteBatch;)[B */ -jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( +jbyteArray Java_org_rocksdb_test_WriteBatchTest_getContents( JNIEnv* env, jclass jclazz, jobject jobj) { rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(b != nullptr); From f617135d5fd148206da8399c64c236ccafb08017 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 2 Nov 2014 23:39:01 +0100 Subject: [PATCH 489/829] [RocksJava] Testcase improvements --- java/org/rocksdb/ReadOptions.java | 1 - .../test/BlockBasedTableConfigTest.java | 70 ++++++++++++---- .../rocksdb/test/PlainTableConfigTest.java | 7 +- java/org/rocksdb/test/ReadOptionsTest.java | 83 +++++++++++++++++-- 4 files changed, 137 insertions(+), 24 deletions(-) diff --git a/java/org/rocksdb/ReadOptions.java b/java/org/rocksdb/ReadOptions.java index aa6977e98..4a64f288b 100644 --- a/java/org/rocksdb/ReadOptions.java +++ b/java/org/rocksdb/ReadOptions.java @@ -155,7 +155,6 @@ public class ReadOptions extends RocksObject { @Override protected void disposeInternal() { - assert(isInitialized()); disposeInternal(nativeHandle_); } private native void disposeInternal(long handle); diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 8c73915ee..6e90f340a 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -5,49 +5,85 @@ package org.rocksdb.test; +import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.BlockBasedTableConfig; -import org.rocksdb.ChecksumType; -import org.rocksdb.IndexType; +import org.rocksdb.*; + +import static org.assertj.core.api.Assertions.assertThat; public class BlockBasedTableConfigTest { + @ClassRule public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + @AfterClass + public static void printMessage(){ + System.out.println("Passed BlockBasedTableConfigTst."); + } + @Test public void shouldTestBlockBasedTableConfig() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setNoBlockCache(true); - assert(blockBasedTableConfig.noBlockCache()); - blockBasedTableConfig.setBlockCacheSize(8*1024); - assert(blockBasedTableConfig.blockCacheSize() == (8*1024)); + assertThat(blockBasedTableConfig.noBlockCache()).isTrue(); + blockBasedTableConfig.setBlockCacheSize(8 * 1024); + assertThat(blockBasedTableConfig.blockCacheSize()). 
+ isEqualTo(8 * 1024); blockBasedTableConfig.setBlockSizeDeviation(12); - assert(blockBasedTableConfig.blockSizeDeviation() == 12); + assertThat(blockBasedTableConfig.blockSizeDeviation()). + isEqualTo(12); blockBasedTableConfig.setBlockRestartInterval(15); - assert(blockBasedTableConfig.blockRestartInterval() == 15); + assertThat(blockBasedTableConfig.blockRestartInterval()). + isEqualTo(15); blockBasedTableConfig.setWholeKeyFiltering(false); - assert(!blockBasedTableConfig.wholeKeyFiltering()); + assertThat(blockBasedTableConfig.wholeKeyFiltering()). + isFalse(); blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); - assert(blockBasedTableConfig.cacheIndexAndFilterBlocks()); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()). + isTrue(); blockBasedTableConfig.setHashIndexAllowCollision(false); - assert(!blockBasedTableConfig.hashIndexAllowCollision()); + assertThat(blockBasedTableConfig.hashIndexAllowCollision()). + isFalse(); blockBasedTableConfig.setBlockCacheCompressedSize(40); - assert(blockBasedTableConfig.blockCacheCompressedSize() == 40); + assertThat(blockBasedTableConfig.blockCacheCompressedSize()). + isEqualTo(40); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); - assert(blockBasedTableConfig.checksumType().equals( + assertThat(blockBasedTableConfig.checksumType().equals( ChecksumType.kxxHash)); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); - assert(blockBasedTableConfig.indexType().equals( + assertThat(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); - assert(blockBasedTableConfig.blockCacheCompressedNumShardBits() - == 4); + assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). + isEqualTo(4); blockBasedTableConfig.setCacheNumShardBits(5); - assert(blockBasedTableConfig.cacheNumShardBits() == 5); + assertThat(blockBasedTableConfig.cacheNumShardBits()). + isEqualTo(5); + blockBasedTableConfig.setBlockSize(10); + assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10); System.out.println("Passed BlockBasedTableConfigTest."); } + + @Test + public void shouldTestBlockBasedTableWithFilter() { + Options options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter( + new BloomFilter(10))); + assertThat(options.tableFactoryName()). + isEqualTo("BlockBasedTable"); + } + + @Test + public void shouldTestBlockBasedTableWithoutFilter() { + Options options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter(null)); + assertThat(options.tableFactoryName()). 
+ isEqualTo("BlockBasedTable"); + } } diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index f4cebb155..1891cd19a 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -5,6 +5,7 @@ package org.rocksdb.test; +import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.EncodingType; @@ -16,6 +17,11 @@ public class PlainTableConfigTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + @AfterClass + public static void printMessage(){ + System.out.println("Passed PlainTableConfigTest."); + } + @Test public void shouldTestPlainTableConfig() { PlainTableConfig plainTableConfig = new PlainTableConfig(); @@ -36,6 +42,5 @@ public class PlainTableConfigTest { assert(plainTableConfig.fullScanMode()); plainTableConfig.setStoreIndexInFile(true); assert(plainTableConfig.storeIndexInFile()); - System.out.println("Passed PlainTableConfigTest."); } } diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java index 27d757a10..e00e143cf 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -7,17 +7,29 @@ package org.rocksdb.test; import java.util.Random; +import org.junit.AfterClass; import org.junit.ClassRule; +import org.junit.Rule; import org.junit.Test; -import org.rocksdb.RocksDB; +import org.junit.rules.ExpectedException; import org.rocksdb.ReadOptions; +import static org.assertj.core.api.Assertions.assertThat; + public class ReadOptionsTest { @ClassRule public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + @Rule + public ExpectedException exception = ExpectedException.none(); + + @AfterClass + public static void printMessage(){ + System.out.println("Passed ReadOptionsTest."); + } + @Test public void shouldTestReadOptions() { ReadOptions opt = new ReadOptions(); @@ -25,21 +37,82 @@ public class ReadOptionsTest { { // VerifyChecksums test boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksums(boolValue); - assert(opt.verifyChecksums() == boolValue); + assertThat(opt.verifyChecksums()).isEqualTo(boolValue); } { // FillCache test boolean boolValue = rand.nextBoolean(); opt.setFillCache(boolValue); - assert(opt.fillCache() == boolValue); + assertThat(opt.fillCache()).isEqualTo(boolValue); } { // Tailing test boolean boolValue = rand.nextBoolean(); opt.setTailing(boolValue); - assert(opt.tailing() == boolValue); + assertThat(opt.tailing()).isEqualTo(boolValue); + } + + { // Snapshot null test + opt.setSnapshot(null); + assertThat(opt.snapshot()).isNull(); } opt.dispose(); - System.out.println("Passed ReadOptionsTest."); + } + + @Test + public void shouldFailVerifyChecksumUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setVerifyChecksums(true); + } + + @Test + public void shouldFailSetFillCacheUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setFillCache(true); + } + + @Test + public void shouldFailFillCacheUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.fillCache(); + } + + @Test + public void shouldFailSetTailingUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setTailing(true); + } + + @Test + public void shouldFailTailingUninitialized(){ + 
ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.tailing(); + } + + @Test + public void shouldFailSetSnapshotUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setSnapshot(null); + } + + @Test + public void shouldFailSnapshotUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.snapshot(); + } + + private ReadOptions setupUninitializedReadOptions( + ExpectedException exception) { + ReadOptions readOptions = new ReadOptions(); + readOptions.dispose(); + exception.expect(AssertionError.class); + return readOptions; } } From 9bec23c413ab3b2ab59aafc7d8664155b76fb018 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 3 Nov 2014 00:11:01 +0100 Subject: [PATCH 490/829] [RocksJava] Test-framework integration Summary: As we had the discussion some weeks ago. Java needs a test framework and should support code coverage analysis. This pull request includes: Move Tests from main method functionality to Junit4 Move WriteBatchTest to test package Adjust the Makefile to run Junit4 Download dependencies from Make (once if not-present) Adjustment of the rocksjni.pom to run coverage analysis using jacoco Javadoc excludes now tests Two bugfixes regarding GC cleanup which came up within the test runs Make can be used as beforehand to build and run RocksJava. make test runs tests using the command-line version of Junit4. Maven can be used to retrieve code coverage reports using mvn -f rocksjni.pom package. Code coverage reports can then be found as usual in the site folder. Testing libraries available within Java Junit4 (incl. hamcrest-core dependency) AssertJ (providing fluent syntax for assertions, cglib dependency) Mockito to provide mocktests Libraries as said before are not statically within this commit or filesystem instead they are downloaded using curl. Make checks if files are present, if so it will perform tests without downloading the libraries again. Note: Libraries are only necessary to compile & run tests. 
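For illustration only (not part of this patch), the per-test layout that the migration converts each test class to looks roughly like the sketch below; the class name and key/value literals are made up, while RocksMemoryResource, the TemporaryFolder rule and the AssertJ assertions are the pieces this series actually introduces:

    package org.rocksdb.test;

    import org.junit.ClassRule;
    import org.junit.Rule;
    import org.junit.Test;
    import org.junit.rules.TemporaryFolder;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    import static org.assertj.core.api.Assertions.assertThat;

    public class ExampleDbTest {

      // Loads the native library once per class and triggers GC/finalization afterwards.
      @ClassRule
      public static final RocksMemoryResource rocksMemoryResource =
          new RocksMemoryResource();

      // Each test gets its own throw-away database directory instead of a fixed /tmp path.
      @Rule
      public TemporaryFolder dbFolder = new TemporaryFolder();

      @Test
      public void putAndGet() throws RocksDBException {
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options,
            dbFolder.getRoot().getAbsolutePath());
        db.put("key".getBytes(), "value".getBytes());
        assertThat(new String(db.get("key".getBytes()))).isEqualTo("value");
        db.close();
        options.dispose();
      }
    }

A class of this shape is picked up both by "make test" (through the command-line JUnit4 runner) and by the Maven build used for the jacoco coverage report.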
Next steps after merge: Get the maven build into travis-ci and coveralls.io Filling up the missing test spots (based on coverage data) Test Plan: make rocksdbjava make jtest Reviewers: yhchiang, ankgup87, adamretter Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28209 --- java/org/rocksdb/test/RocksMemoryResource.java | 2 +- java/rocksjni.pom | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/test/RocksMemoryResource.java b/java/org/rocksdb/test/RocksMemoryResource.java index eabbc822e..51164ad65 100644 --- a/java/org/rocksdb/test/RocksMemoryResource.java +++ b/java/org/rocksdb/test/RocksMemoryResource.java @@ -18,4 +18,4 @@ public class RocksMemoryResource extends ExternalResource { System.gc(); System.runFinalization(); } -} \ No newline at end of file +} diff --git a/java/rocksjni.pom b/java/rocksjni.pom index d8fe09fe9..2966a8a6b 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -123,7 +123,7 @@ org.jacoco jacoco-maven-plugin - 0.7.1.201405082137 + 0.7.2.201409121644 From b092686959dffec61f5afff5c29b15215239dfb5 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 3 Nov 2014 09:59:52 +0100 Subject: [PATCH 491/829] [RocksJava] Extended testcases + 7% coverage + 3% branch coverage --- java/Makefile | 1 + .../test/BlockBasedTableConfigTest.java | 3 +- java/org/rocksdb/test/ComparatorTest.java | 129 +++++++++++++++++- java/org/rocksdb/test/WriteOptionsTest.java | 38 ++++++ 4 files changed, 168 insertions(+), 3 deletions(-) create mode 100644 java/org/rocksdb/test/WriteOptionsTest.java diff --git a/java/Makefile b/java/Makefile index d3bd8d8d4..d99821f05 100644 --- a/java/Makefile +++ b/java/Makefile @@ -71,6 +71,7 @@ JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\ org.rocksdb.test.StatisticsCollectorTest\ org.rocksdb.test.WirteBatchHandlerTest\ org.rocksdb.test.WriteBatchTest\ + org.rocksdb.test.WriteOptionsTest\ JAVA_TEST_LIBDIR = ./test-libs/ JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 6e90f340a..0b1961569 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -20,7 +20,7 @@ public class BlockBasedTableConfigTest { @AfterClass public static void printMessage(){ - System.out.println("Passed BlockBasedTableConfigTst."); + System.out.println("Passed BlockBasedTableConfigTest."); } @Test @@ -65,7 +65,6 @@ public class BlockBasedTableConfigTest { isEqualTo(5); blockBasedTableConfig.setBlockSize(10); assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10); - System.out.println("Passed BlockBasedTableConfigTest."); } @Test diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index d65a0653a..26523c719 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -5,6 +5,7 @@ package org.rocksdb.test; +import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -14,6 +15,8 @@ import org.rocksdb.*; import java.io.IOException; import java.nio.file.FileSystems; +import static org.assertj.core.api.Assertions.assertThat; + public class ComparatorTest { @ClassRule @@ -23,6 +26,11 @@ public class ComparatorTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @AfterClass + public static void printMessage(){ + System.out.println("Passed ComparatorTest."); + } + @Test 
public void shouldTestComparator() throws IOException { @@ -47,7 +55,126 @@ public class ComparatorTest { // test the round-tripability of keys written and read with the Comparator comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( dbFolder.getRoot().getAbsolutePath())); + } - System.out.println("Passed ComparatorTest"); + @Test + public void shouldTestBuiltinForwardComparator() + throws RocksDBException { + Options options = new Options(); + options.setCreateIfMissing(true); + options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + RocksDB rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + RocksIterator rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + // Seek for abc + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.dispose(); + rocksDB.close(); + options.dispose(); + } + + @Test + public void shouldTestBuiltinReverseComparator() + throws RocksDBException { + Options options = new Options(); + options.setCreateIfMissing(true); + options.setComparator( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); + RocksDB rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + RocksIterator rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + 
assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + // Will be invalid because abc is after abc1 + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isFalse(); + // Will be abc3 because the next one after abc999 + // is abc3 + rocksIterator.seek("abc999".getBytes()); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.dispose(); + rocksDB.close(); + options.dispose(); + } + + @Test + public void shouldTestBuiltinComparatorEnum(){ + assertThat(BuiltinComparator.BYTEWISE_COMPARATOR.ordinal()) + .isEqualTo(0); + assertThat( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR.ordinal()) + .isEqualTo(1); } } diff --git a/java/org/rocksdb/test/WriteOptionsTest.java b/java/org/rocksdb/test/WriteOptionsTest.java new file mode 100644 index 000000000..7a92bf9fa --- /dev/null +++ b/java/org/rocksdb/test/WriteOptionsTest.java @@ -0,0 +1,38 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.AfterClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.WriteOptions; + +import static org.assertj.core.api.Assertions.assertThat; + +public class WriteOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @AfterClass + public static void printMessage(){ + System.out.println("Passed WriteOptionsTest."); + } + + @Test + public void shouldTestWriteOptions(){ + WriteOptions writeOptions = new WriteOptions(); + writeOptions.setDisableWAL(true); + assertThat(writeOptions.disableWAL()).isTrue(); + writeOptions.setDisableWAL(false); + assertThat(writeOptions.disableWAL()).isFalse(); + writeOptions.setSync(true); + assertThat(writeOptions.sync()).isTrue(); + writeOptions.setSync(false); + assertThat(writeOptions.sync()).isFalse(); + } +} From 36f3a0bb8e19ddef1d5ae7e9c88bf9cc5b67cb47 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 5 Nov 2014 18:46:36 +0100 Subject: [PATCH 492/829] [RocksJava] Integrated review comments from adamretter in D28209 --- java/Makefile | 2 +- java/org/rocksdb/test/BackupableDBTest.java | 208 ++++++------ .../test/BlockBasedTableConfigTest.java | 12 +- java/org/rocksdb/test/ColumnFamilyTest.java | 3 +- .../rocksdb/test/ComparatorOptionsTest.java | 3 +- java/org/rocksdb/test/ComparatorTest.java | 14 +- java/org/rocksdb/test/DBOptionsTest.java | 162 ++++++---- .../rocksdb/test/DirectComparatorTest.java | 4 +- java/org/rocksdb/test/FilterTest.java | 10 +- java/org/rocksdb/test/KeyMayExistTest.java | 118 +++---- java/org/rocksdb/test/MemTableTest.java | 79 +++-- java/org/rocksdb/test/MergeTest.java | 32 +- java/org/rocksdb/test/OptionsTest.java | 289 ++++++++++++++++- .../rocksdb/test/PlainTableConfigTest.java | 34 +- java/org/rocksdb/test/ReadOnlyTest.java | 295 ++++++++++++------ java/org/rocksdb/test/ReadOptionsTest.java | 22 +- java/org/rocksdb/test/RocksIteratorTest.java | 3 +- java/org/rocksdb/test/RocksJunitRunner.java | 65 ++++ java/org/rocksdb/test/SnapshotTest.java | 36 ++- .../rocksdb/test/StatisticsCollectorTest.java | 4 +- java/org/rocksdb/test/WriteBatchTest.java | 14 +- 
java/org/rocksdb/test/WriteOptionsTest.java | 8 +- java/rocksjni.pom | 1 + 23 files changed, 925 insertions(+), 493 deletions(-) create mode 100644 java/org/rocksdb/test/RocksJunitRunner.java diff --git a/java/Makefile b/java/Makefile index d99821f05..83bf34afc 100644 --- a/java/Makefile +++ b/java/Makefile @@ -120,7 +120,7 @@ resolve_test_deps: test: java resolve_test_deps javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java - java -ea -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.junit.runner.JUnitCore $(JAVA_TESTS) + java -ea -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index f0a6708c1..aa6c07c14 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -13,6 +13,8 @@ import org.rocksdb.*; import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; + public class BackupableDBTest { @ClassRule @@ -26,7 +28,7 @@ public class BackupableDBTest { public TemporaryFolder backupFolder = new TemporaryFolder(); @Test - public void shouldTestBackupableDb() { + public void backupableDb() throws RocksDBException { Options opt = new Options(); opt.setCreateIfMissing(true); @@ -34,108 +36,110 @@ public class BackupableDBTest { BackupableDBOptions bopt = new BackupableDBOptions( backupFolder.getRoot().getAbsolutePath(), false, true, false, true, 0, 0); - BackupableDB bdb = null; + BackupableDB bdb; List backupInfos; List restoreInfos; - try { - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - - bdb.put("abc".getBytes(), "def".getBytes()); - bdb.put("ghi".getBytes(), "jkl".getBytes()); - - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 0); - - bdb.createNewBackup(true); - - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 1); - - // Retrieving backup infos twice shall not - // lead to different results - List tmpBackupInfo = bdb.getBackupInfos(); - assert(tmpBackupInfo.get(0).backupId() == - backupInfos.get(0).backupId()); - assert(tmpBackupInfo.get(0).timestamp() == - backupInfos.get(0).timestamp()); - assert(tmpBackupInfo.get(0).size() == - backupInfos.get(0).size()); - assert(tmpBackupInfo.get(0).numberFiles() == - backupInfos.get(0).numberFiles()); - - // delete record after backup - bdb.remove("abc".getBytes()); - byte[] value = bdb.get("abc".getBytes()); - assert(value == null); - bdb.close(); - - // restore from backup - RestoreOptions ropt = new RestoreOptions(false); - RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); - - // getting backup infos from restorable db should - // lead to the same infos as from backupable db - restoreInfos = rdb.getBackupInfos(); - assert(restoreInfos.size() == backupInfos.size()); - assert(restoreInfos.get(0).backupId() == - backupInfos.get(0).backupId()); - assert(restoreInfos.get(0).timestamp() == - backupInfos.get(0).timestamp()); - assert(restoreInfos.get(0).size() == - backupInfos.get(0).size()); - assert(restoreInfos.get(0).numberFiles() == - backupInfos.get(0).numberFiles()); - - rdb.restoreDBFromLatestBackup( - dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - ropt); - // do nothing because there is only one backup - rdb.purgeOldBackups(1); - restoreInfos = rdb.getBackupInfos(); - assert(restoreInfos.size() == 1); - rdb.dispose(); - ropt.dispose(); - - // verify 
that backed up data contains deleted record - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - value = bdb.get("abc".getBytes()); - assert(new String(value).equals("def")); - - bdb.createNewBackup(false); - // after new backup there must be two backup infos - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 2); - // deleting the backup must be possible using the - // id provided by backup infos - bdb.deleteBackup(backupInfos.get(1).backupId()); - // after deletion there should only be one info - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 1); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 4); - // purge everything and keep two - bdb.purgeOldBackups(2); - // backup infos need to be two - backupInfos = bdb.getBackupInfos(); - assert(backupInfos.size() == 2); - assert(backupInfos.get(0).backupId() == 4); - assert(backupInfos.get(1).backupId() == 5); - } catch (RocksDBException e) { - System.err.format("[ERROR]: %s%n", e); - e.printStackTrace(); - } finally { - opt.dispose(); - bopt.dispose(); - if (bdb != null) { - bdb.close(); - } - } - System.out.println("Passed BackupableDBTest."); + + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + bdb.put("abc".getBytes(), "def".getBytes()); + bdb.put("ghi".getBytes(), "jkl".getBytes()); + + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(0); + + bdb.createNewBackup(true); + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(1); + + // Retrieving backup infos twice shall not + // lead to different results + List tmpBackupInfo = bdb.getBackupInfos(); + assertThat(tmpBackupInfo.get(0).backupId()). + isEqualTo(backupInfos.get(0).backupId()); + assertThat(tmpBackupInfo.get(0).timestamp()). + isEqualTo(backupInfos.get(0).timestamp()); + assertThat(tmpBackupInfo.get(0).size()). + isEqualTo(backupInfos.get(0).size()); + assertThat(tmpBackupInfo.get(0).numberFiles()). + isEqualTo(backupInfos.get(0).numberFiles()); + + // delete record after backup + bdb.remove("abc".getBytes()); + byte[] value = bdb.get("abc".getBytes()); + assertThat(value).isNull(); + bdb.close(); + + // restore from backup + RestoreOptions ropt = new RestoreOptions(false); + RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); + + // getting backup infos from restorable db should + // lead to the same infos as from backupable db + restoreInfos = rdb.getBackupInfos(); + assertThat(restoreInfos.size()). + isEqualTo(backupInfos.size()); + assertThat(restoreInfos.get(0).backupId()). + isEqualTo(backupInfos.get(0).backupId()); + assertThat(restoreInfos.get(0).timestamp()). + isEqualTo(backupInfos.get(0).timestamp()); + assertThat(restoreInfos.get(0).size()). + isEqualTo(backupInfos.get(0).size()); + assertThat(restoreInfos.get(0).numberFiles()). + isEqualTo(backupInfos.get(0).numberFiles()); + + rdb.restoreDBFromLatestBackup( + dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), + ropt); + // do nothing because there is only one backup + rdb.purgeOldBackups(1); + restoreInfos = rdb.getBackupInfos(); + assertThat(restoreInfos.size()). + isEqualTo(1); + rdb.dispose(); + ropt.dispose(); + + // verify that backed up data contains deleted record + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + value = bdb.get("abc".getBytes()); + assertThat(new String(value)). 
+ isEqualTo("def"); + + bdb.createNewBackup(false); + // after new backup there must be two backup infos + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(2); + // deleting the backup must be possible using the + // id provided by backup infos + bdb.deleteBackup(backupInfos.get(1).backupId()); + // after deletion there should only be one info + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(1); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(4); + // purge everything and keep two + bdb.purgeOldBackups(2); + // backup infos need to be two + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(2); + assertThat(backupInfos.get(0).backupId()). + isEqualTo(4); + assertThat(backupInfos.get(1).backupId()). + isEqualTo(5); + + opt.dispose(); + bopt.dispose(); + bdb.close(); } } diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 0b1961569..143a3fa14 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -5,7 +5,6 @@ package org.rocksdb.test; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.*; @@ -18,13 +17,8 @@ public class BlockBasedTableConfigTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); - @AfterClass - public static void printMessage(){ - System.out.println("Passed BlockBasedTableConfigTest."); - } - @Test - public void shouldTestBlockBasedTableConfig() { + public void blockBasedTableConfig() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setNoBlockCache(true); @@ -68,7 +62,7 @@ public class BlockBasedTableConfigTest { } @Test - public void shouldTestBlockBasedTableWithFilter() { + public void blockBasedTableWithFilter() { Options options = new Options(); options.setTableFormatConfig( new BlockBasedTableConfig().setFilter( @@ -78,7 +72,7 @@ public class BlockBasedTableConfigTest { } @Test - public void shouldTestBlockBasedTableWithoutFilter() { + public void blockBasedTableWithoutFilter() { Options options = new Options(); options.setTableFormatConfig( new BlockBasedTableConfig().setFilter(null)); diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index e52eac589..fc5b4ba6e 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -26,7 +26,7 @@ public class ColumnFamilyTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestColumnFamilies() { + public void columnFamilies() { String db_path = dbFolder.getRoot().getAbsolutePath(); RocksDB db = null; Options options = new Options(); @@ -291,6 +291,5 @@ public class ColumnFamilyTest { db.close(); // be sure to dispose c++ pointers options.dispose(); - System.out.println("Passed ColumnFamilyTest."); } } diff --git a/java/org/rocksdb/test/ComparatorOptionsTest.java b/java/org/rocksdb/test/ComparatorOptionsTest.java index 21f4fc2a1..1064910df 100644 --- a/java/org/rocksdb/test/ComparatorOptionsTest.java +++ b/java/org/rocksdb/test/ComparatorOptionsTest.java @@ -18,7 +18,7 @@ public class ComparatorOptionsTest { new RocksMemoryResource(); @Test - public void shouldTestComparatorOptions() { + 
public void comparatorOptions() { final ComparatorOptions copt = new ComparatorOptions(); assertThat(copt).isNotNull(); @@ -32,6 +32,5 @@ public class ComparatorOptionsTest { } copt.dispose(); - System.out.println("Passed ComparatorOptionsTest"); } } diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index 26523c719..c9037954e 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -5,7 +5,6 @@ package org.rocksdb.test; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -26,13 +25,8 @@ public class ComparatorTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - @AfterClass - public static void printMessage(){ - System.out.println("Passed ComparatorTest."); - } - @Test - public void shouldTestComparator() throws IOException { + public void javaComparator() throws IOException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override @@ -58,7 +52,7 @@ public class ComparatorTest { } @Test - public void shouldTestBuiltinForwardComparator() + public void builtinForwardComparator() throws RocksDBException { Options options = new Options(); options.setCreateIfMissing(true); @@ -112,7 +106,7 @@ public class ComparatorTest { } @Test - public void shouldTestBuiltinReverseComparator() + public void builtinReverseComparator() throws RocksDBException { Options options = new Options(); options.setCreateIfMissing(true); @@ -170,7 +164,7 @@ public class ComparatorTest { } @Test - public void shouldTestBuiltinComparatorEnum(){ + public void builtinComparatorEnum(){ assertThat(BuiltinComparator.BYTEWISE_COMPARATOR.ordinal()) .isEqualTo(0); assertThat( diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/org/rocksdb/test/DBOptionsTest.java index 0cd2468ea..529a9b09b 100644 --- a/java/org/rocksdb/test/DBOptionsTest.java +++ b/java/org/rocksdb/test/DBOptionsTest.java @@ -5,224 +5,252 @@ package org.rocksdb.test; -import org.rocksdb.DBOptions; -import org.rocksdb.DBOptionsInterface; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.*; import java.util.Random; +import static org.assertj.core.api.Assertions.assertThat; + public class DBOptionsTest { - static { - RocksDB.loadLibrary(); + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void dbOptions() throws RocksDBException { + testDBOptions(new DBOptions()); } - public static void testDBOptions(DBOptionsInterface opt) { + static void testDBOptions(DBOptionsInterface opt) throws RocksDBException { Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); { // CreateIfMissing test boolean boolValue = rand.nextBoolean(); opt.setCreateIfMissing(boolValue); - assert(opt.createIfMissing() == boolValue); + assertThat(opt.createIfMissing()). + isEqualTo(boolValue); } { // CreateMissingColumnFamilies test boolean boolValue = rand.nextBoolean(); opt.setCreateMissingColumnFamilies(boolValue); - assert(opt.createMissingColumnFamilies() == boolValue); + assertThat(opt.createMissingColumnFamilies()). 
+ isEqualTo(boolValue); } { // ErrorIfExists test boolean boolValue = rand.nextBoolean(); opt.setErrorIfExists(boolValue); - assert(opt.errorIfExists() == boolValue); + assertThat(opt.errorIfExists()).isEqualTo(boolValue); } { // ParanoidChecks test boolean boolValue = rand.nextBoolean(); opt.setParanoidChecks(boolValue); - assert(opt.paranoidChecks() == boolValue); + assertThat(opt.paranoidChecks()). + isEqualTo(boolValue); } { // MaxTotalWalSize test long longValue = rand.nextLong(); opt.setMaxTotalWalSize(longValue); - assert(opt.maxTotalWalSize() == longValue); + assertThat(opt.maxTotalWalSize()). + isEqualTo(longValue); } { // MaxOpenFiles test int intValue = rand.nextInt(); opt.setMaxOpenFiles(intValue); - assert(opt.maxOpenFiles() == intValue); + assertThat(opt.maxOpenFiles()).isEqualTo(intValue); } { // DisableDataSync test boolean boolValue = rand.nextBoolean(); opt.setDisableDataSync(boolValue); - assert(opt.disableDataSync() == boolValue); + assertThat(opt.disableDataSync()). + isEqualTo(boolValue); } { // UseFsync test boolean boolValue = rand.nextBoolean(); opt.setUseFsync(boolValue); - assert(opt.useFsync() == boolValue); + assertThat(opt.useFsync()).isEqualTo(boolValue); } { // DbLogDir test String str = "path/to/DbLogDir"; opt.setDbLogDir(str); - assert(opt.dbLogDir().equals(str)); + assertThat(opt.dbLogDir()).isEqualTo(str); } { // WalDir test String str = "path/to/WalDir"; opt.setWalDir(str); - assert(opt.walDir().equals(str)); + assertThat(opt.walDir()).isEqualTo(str); } { // DeleteObsoleteFilesPeriodMicros test long longValue = rand.nextLong(); opt.setDeleteObsoleteFilesPeriodMicros(longValue); - assert(opt.deleteObsoleteFilesPeriodMicros() == longValue); + assertThat(opt.deleteObsoleteFilesPeriodMicros()). + isEqualTo(longValue); } { // MaxBackgroundCompactions test int intValue = rand.nextInt(); opt.setMaxBackgroundCompactions(intValue); - assert(opt.maxBackgroundCompactions() == intValue); + assertThat(opt.maxBackgroundCompactions()). + isEqualTo(intValue); } { // MaxBackgroundFlushes test int intValue = rand.nextInt(); opt.setMaxBackgroundFlushes(intValue); - assert(opt.maxBackgroundFlushes() == intValue); + assertThat(opt.maxBackgroundFlushes()). + isEqualTo(intValue); } { // MaxLogFileSize test - try { - long longValue = rand.nextLong(); - opt.setMaxLogFileSize(longValue); - assert(opt.maxLogFileSize() == longValue); - } catch (RocksDBException e) { - System.out.println(e.getMessage()); - assert(false); - } + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assertThat(opt.maxLogFileSize()).isEqualTo(longValue); } { // LogFileTimeToRoll test - try { - long longValue = rand.nextLong(); - opt.setLogFileTimeToRoll(longValue); - assert(opt.logFileTimeToRoll() == longValue); - } catch (RocksDBException e) { - assert(false); - } + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assertThat(opt.logFileTimeToRoll()). + isEqualTo(longValue); } { // KeepLogFileNum test - try { - long longValue = rand.nextLong(); - opt.setKeepLogFileNum(longValue); - assert(opt.keepLogFileNum() == longValue); - } catch (RocksDBException e) { - assert(false); - } + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assertThat(opt.keepLogFileNum()).isEqualTo(longValue); } { // MaxManifestFileSize test long longValue = rand.nextLong(); opt.setMaxManifestFileSize(longValue); - assert(opt.maxManifestFileSize() == longValue); + assertThat(opt.maxManifestFileSize()). 
+ isEqualTo(longValue); } { // TableCacheNumshardbits test int intValue = rand.nextInt(); opt.setTableCacheNumshardbits(intValue); - assert(opt.tableCacheNumshardbits() == intValue); + assertThat(opt.tableCacheNumshardbits()). + isEqualTo(intValue); } { // TableCacheRemoveScanCountLimit test int intValue = rand.nextInt(); opt.setTableCacheRemoveScanCountLimit(intValue); - assert(opt.tableCacheRemoveScanCountLimit() == intValue); + assertThat(opt.tableCacheRemoveScanCountLimit()). + isEqualTo(intValue); + } + + { // WalSizeLimitMB test + long longValue = rand.nextLong(); + opt.setWalSizeLimitMB(longValue); + assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); } { // WalTtlSeconds test long longValue = rand.nextLong(); opt.setWalTtlSeconds(longValue); - assert(opt.walTtlSeconds() == longValue); + assertThat(opt.walTtlSeconds()).isEqualTo(longValue); } { // ManifestPreallocationSize test - try { - long longValue = rand.nextLong(); - opt.setManifestPreallocationSize(longValue); - assert(opt.manifestPreallocationSize() == longValue); - } catch (RocksDBException e) { - assert(false); - } + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assertThat(opt.manifestPreallocationSize()). + isEqualTo(longValue); } { // AllowOsBuffer test boolean boolValue = rand.nextBoolean(); opt.setAllowOsBuffer(boolValue); - assert(opt.allowOsBuffer() == boolValue); + assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); } { // AllowMmapReads test boolean boolValue = rand.nextBoolean(); opt.setAllowMmapReads(boolValue); - assert(opt.allowMmapReads() == boolValue); + assertThat(opt.allowMmapReads()).isEqualTo(boolValue); } { // AllowMmapWrites test boolean boolValue = rand.nextBoolean(); opt.setAllowMmapWrites(boolValue); - assert(opt.allowMmapWrites() == boolValue); + assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); } { // IsFdCloseOnExec test boolean boolValue = rand.nextBoolean(); opt.setIsFdCloseOnExec(boolValue); - assert(opt.isFdCloseOnExec() == boolValue); + assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); } { // SkipLogErrorOnRecovery test boolean boolValue = rand.nextBoolean(); opt.setSkipLogErrorOnRecovery(boolValue); - assert(opt.skipLogErrorOnRecovery() == boolValue); + assertThat(opt.skipLogErrorOnRecovery()).isEqualTo(boolValue); } { // StatsDumpPeriodSec test int intValue = rand.nextInt(); opt.setStatsDumpPeriodSec(intValue); - assert(opt.statsDumpPeriodSec() == intValue); + assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); } { // AdviseRandomOnOpen test boolean boolValue = rand.nextBoolean(); opt.setAdviseRandomOnOpen(boolValue); - assert(opt.adviseRandomOnOpen() == boolValue); + assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); } { // UseAdaptiveMutex test boolean boolValue = rand.nextBoolean(); opt.setUseAdaptiveMutex(boolValue); - assert(opt.useAdaptiveMutex() == boolValue); + assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); } { // BytesPerSync test long longValue = rand.nextLong(); opt.setBytesPerSync(longValue); - assert(opt.bytesPerSync() == longValue); + assertThat(opt.bytesPerSync()).isEqualTo(longValue); } } - public static void main(String[] args) { - DBOptions opt = new DBOptions(); - testDBOptions(opt); - opt.dispose(); - System.out.println("Passed DBOptionsTest"); + @Test + public void rateLimiterConfig() { + DBOptions options = new DBOptions(); + RateLimiterConfig rateLimiterConfig = + new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + options.dispose(); + // Test with 
parameter initialization + DBOptions anotherOptions = new DBOptions(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + anotherOptions.dispose(); + } + + @Test + public void statistics() { + DBOptions options = new DBOptions(); + Statistics statistics = options.createStatistics(). + statisticsPtr(); + assertThat(statistics).isNotNull(); + + DBOptions anotherOptions = new DBOptions(); + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); } } diff --git a/java/org/rocksdb/test/DirectComparatorTest.java b/java/org/rocksdb/test/DirectComparatorTest.java index 562038897..f09d94843 100644 --- a/java/org/rocksdb/test/DirectComparatorTest.java +++ b/java/org/rocksdb/test/DirectComparatorTest.java @@ -23,7 +23,7 @@ public class DirectComparatorTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestDirectComparator() throws IOException { + public void directComparator() throws IOException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override @@ -49,7 +49,5 @@ public class DirectComparatorTest { // test the round-tripability of keys written and read with the DirectComparator comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( dbFolder.getRoot().getAbsolutePath())); - - System.out.println("Passed DirectComparatorTest"); } } diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java index 3894167b0..c183f8d95 100644 --- a/java/org/rocksdb/test/FilterTest.java +++ b/java/org/rocksdb/test/FilterTest.java @@ -16,10 +16,9 @@ public class FilterTest { new RocksMemoryResource(); @Test - public void shouldTestFilter() { + public void filter() { Options options = new Options(); // test table config - BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); options.setTableFormatConfig(new BlockBasedTableConfig(). setFilter(new BloomFilter())); options.dispose(); @@ -27,7 +26,7 @@ public class FilterTest { System.runFinalization(); // new Bloom filter options = new Options(); - blockConfig = new BlockBasedTableConfig(); + BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); blockConfig.setFilter(new BloomFilter()); options.setTableFormatConfig(blockConfig); BloomFilter bloomFilter = new BloomFilter(10); @@ -38,10 +37,5 @@ public class FilterTest { blockConfig.setFilter(new BloomFilter(10, false)); options.setTableFormatConfig(blockConfig); options.dispose(); - options = null; - blockConfig = null; - System.gc(); - System.runFinalization(); - System.out.println("Passed FilterTest."); } } diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index 03be46fbe..5f6d6225a 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -4,73 +4,79 @@ // of patent rights can be found in the PATENTS file in the same directory. 
package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; import java.util.ArrayList; import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; + public class KeyMayExistTest { - static final String DB_PATH = "/tmp/rocksdbjni_keymayexit_test"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args){ + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void keyMayExist() throws RocksDBException { RocksDB db; DBOptions options = new DBOptions(); options.setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); - try { - // open database using cf names - List cfNames = - new ArrayList(); - List columnFamilyHandleList = - new ArrayList(); - cfNames.add(new ColumnFamilyDescriptor("default")); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); - db = RocksDB.open(options, DB_PATH, cfNames, columnFamilyHandleList); - assert(columnFamilyHandleList.size()==2); + // open database using cf names + List cfDescriptors = + new ArrayList(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("default")); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + assertThat(columnFamilyHandleList.size()). + isEqualTo(2); + db.put("key".getBytes(), "value".getBytes()); + // Test without column family + StringBuffer retValue = new StringBuffer(); + boolean exists = db.keyMayExist("key".getBytes(), retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // Test without column family but with readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // Test with column family + retValue = new StringBuffer(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // Test with column family and readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), + columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). 
+ isEqualTo("value"); - db.put("key".getBytes(), "value".getBytes()); - // Test without column family - StringBuffer retValue = new StringBuffer(); - if (db.keyMayExist("key".getBytes(), retValue)) { - assert(retValue.toString().equals("value")); - } else { - assert(false); - } - // Test without column family but with readOptions - retValue = new StringBuffer(); - if (db.keyMayExist(new ReadOptions(), "key".getBytes(), - retValue)) { - assert(retValue.toString().equals("value")); - } else { - assert(false); - } - // Test with column family - retValue = new StringBuffer(); - if (db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), - retValue)) { - assert(retValue.toString().equals("value")); - } else { - assert(false); - } - // Test with column family and readOptions - retValue = new StringBuffer(); - if (db.keyMayExist(new ReadOptions(), - columnFamilyHandleList.get(0), "key".getBytes(), - retValue)) { - assert(retValue.toString().equals("value")); - } else { - assert(false); - } - // KeyMayExist in CF1 must return false - assert(db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(), - retValue) == false); - System.out.println("Passed KeyMayExistTest"); - }catch (RocksDBException e){ - e.printStackTrace(); - assert(false); - } + // KeyMayExist in CF1 must return false + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + "key".getBytes(), retValue)).isFalse(); } } diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/org/rocksdb/test/MemTableTest.java index 0b1244fc2..93146303a 100644 --- a/java/org/rocksdb/test/MemTableTest.java +++ b/java/org/rocksdb/test/MemTableTest.java @@ -9,6 +9,8 @@ import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.*; +import static org.assertj.core.api.Assertions.assertThat; + public class MemTableTest { @ClassRule @@ -16,22 +18,27 @@ public class MemTableTest { new RocksMemoryResource(); @Test - public void shouldTestMemTable() throws RocksDBException { + public void memTable() throws RocksDBException { Options options = new Options(); // Test HashSkipListMemTableConfig HashSkipListMemTableConfig memTableConfig = new HashSkipListMemTableConfig(); - assert(memTableConfig.bucketCount() == 1000000); + assertThat(memTableConfig.bucketCount()). + isEqualTo(1000000); memTableConfig.setBucketCount(2000000); - assert(memTableConfig.bucketCount() == 2000000); - assert(memTableConfig.height() == 4); + assertThat(memTableConfig.bucketCount()). + isEqualTo(2000000); + assertThat(memTableConfig.height()). + isEqualTo(4); memTableConfig.setHeight(5); - assert(memTableConfig.height() == 5); - assert(memTableConfig.branchingFactor() == 4); + assertThat(memTableConfig.height()). + isEqualTo(5); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(4); memTableConfig.setBranchingFactor(6); - assert(memTableConfig.branchingFactor() == 6); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(6); options.setMemTableConfig(memTableConfig); - memTableConfig = null; options.dispose(); System.gc(); System.runFinalization(); @@ -39,11 +46,12 @@ public class MemTableTest { options = new Options(); SkipListMemTableConfig skipMemTableConfig = new SkipListMemTableConfig(); - assert(skipMemTableConfig.lookahead() == 0); + assertThat(skipMemTableConfig.lookahead()). + isEqualTo(0); skipMemTableConfig.setLookahead(20); - assert(skipMemTableConfig.lookahead() == 20); + assertThat(skipMemTableConfig.lookahead()). 
+ isEqualTo(20); options.setMemTableConfig(skipMemTableConfig); - skipMemTableConfig = null; options.dispose(); System.gc(); System.runFinalization(); @@ -51,31 +59,38 @@ public class MemTableTest { options = new Options(); HashLinkedListMemTableConfig hashLinkedListMemTableConfig = new HashLinkedListMemTableConfig(); - assert(hashLinkedListMemTableConfig.bucketCount() == 50000); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(50000); hashLinkedListMemTableConfig.setBucketCount(100000); - assert(hashLinkedListMemTableConfig.bucketCount() == 100000); - assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 0); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(100000); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(0); hashLinkedListMemTableConfig.setHugePageTlbSize(1); - assert(hashLinkedListMemTableConfig.hugePageTlbSize() == 1); - assert(hashLinkedListMemTableConfig. - bucketEntriesLoggingThreshold() == 4096); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(1); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). + isEqualTo(4096); hashLinkedListMemTableConfig. setBucketEntriesLoggingThreshold(200); - assert(hashLinkedListMemTableConfig. - bucketEntriesLoggingThreshold() == 200); - assert(hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush()); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). + isEqualTo(200); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isTrue(); hashLinkedListMemTableConfig. setIfLogBucketDistWhenFlush(false); - assert(!hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush()); - assert(hashLinkedListMemTableConfig. - thresholdUseSkiplist() == 256); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isFalse(); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). + isEqualTo(256); hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); - assert(hashLinkedListMemTableConfig. - thresholdUseSkiplist() == 29); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). + isEqualTo(29); options.setMemTableConfig(hashLinkedListMemTableConfig); - hashLinkedListMemTableConfig = null; options.dispose(); System.gc(); System.runFinalization(); @@ -83,14 +98,14 @@ public class MemTableTest { options = new Options(); VectorMemTableConfig vectorMemTableConfig = new VectorMemTableConfig(); - assert(vectorMemTableConfig.reservedSize() == 0); + assertThat(vectorMemTableConfig.reservedSize()). + isEqualTo(0); vectorMemTableConfig.setReservedSize(123); - assert(vectorMemTableConfig.reservedSize() == 123); + assertThat(vectorMemTableConfig.reservedSize()). 
+ isEqualTo(123); options.setMemTableConfig(vectorMemTableConfig); - vectorMemTableConfig = null; options.dispose(); System.gc(); System.runFinalization(); - System.out.println("Passed MemTableTest."); } } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 31a3fe5cb..f1e2fb759 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -8,13 +8,14 @@ package org.rocksdb.test; import java.util.List; import java.util.ArrayList; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.rocksdb.*; +import static org.assertj.core.api.Assertions.assertThat; + public class MergeTest { @ClassRule @@ -24,13 +25,8 @@ public class MergeTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - @AfterClass - public static void printMergePass(){ - System.out.println("Passed MergeTest."); - } - @Test - public void shouldTestStringOption() + public void stringOption() throws InterruptedException, RocksDBException { String db_path_string = dbFolder.getRoot().getAbsolutePath(); @@ -49,11 +45,11 @@ public class MergeTest { db.close(); opt.dispose(); - assert(strValue.equals("aa,bb")); + assertThat(strValue).isEqualTo("aa,bb"); } @Test - public void shouldTestCFStringOption() + public void cFStringOption() throws InterruptedException, RocksDBException { DBOptions opt = new DBOptions(); String db_path_string = @@ -89,11 +85,11 @@ public class MergeTest { } db.close(); opt.dispose(); - assert(strValue.equals("aa,bb")); + assertThat(strValue).isEqualTo("aa,bb"); } @Test - public void shouldTestOperatorOption() + public void operatorOption() throws InterruptedException, RocksDBException { String db_path_string = dbFolder.getRoot().getAbsolutePath(); @@ -115,11 +111,11 @@ public class MergeTest { db.close(); opt.dispose(); - assert(strValue.equals("aa,bb")); + assertThat(strValue).isEqualTo("aa,bb"); } @Test - public void shouldTestCFOperatorOption() + public void cFOperatorOption() throws InterruptedException, RocksDBException { DBOptions opt = new DBOptions(); String db_path_string = @@ -165,12 +161,12 @@ public class MergeTest { columnFamilyHandle.dispose(); db.close(); opt.dispose(); - assert(strValue.equals("aa,bb")); - assert(strValueTmpCf.equals("xx,yy")); + assertThat(strValue).isEqualTo("aa,bb"); + assertThat(strValueTmpCf).isEqualTo("xx,yy"); } @Test - public void shouldTestOperatorGcBehaviour() + public void operatorGcBehaviour() throws RocksDBException { String db_path_string = dbFolder.getRoot().getAbsolutePath(); @@ -207,9 +203,5 @@ public class MergeTest { db = RocksDB.open(opt, db_path_string); db.close(); opt.dispose(); - stringAppendOperator = null; - newStringAppendOperator = null; - System.gc(); - System.runFinalization(); } } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index defdcc304..f8fbd7bcc 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -6,23 +6,296 @@ package org.rocksdb.test; import java.util.Random; -import org.rocksdb.RocksDB; -import org.rocksdb.Options; +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.*; + +import static org.assertj.core.api.Assertions.assertThat; + public class OptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new 
RocksMemoryResource(); + + @Test + public void options() throws RocksDBException { Options opt = new Options(); Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); DBOptionsTest.testDBOptions(opt); - ColumnFamilyOptionsTest.testCFOptions(opt); + + { // WriteBufferSize test + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert(opt.writeBufferSize() == longValue); + } + + { // MaxWriteBufferNumber test + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assert(opt.maxWriteBufferNumber() == intValue); + } + + { // MinWriteBufferNumberToMerge test + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assert(opt.minWriteBufferNumberToMerge() == intValue); + } + + { // NumLevels test + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assert(opt.numLevels() == intValue); + } + + { // LevelFileNumCompactionTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assert(opt.levelZeroFileNumCompactionTrigger() == intValue); + } + + { // LevelSlowdownWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assert(opt.levelZeroSlowdownWritesTrigger() == intValue); + } + + { // LevelStopWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assert(opt.levelZeroStopWritesTrigger() == intValue); + } + + { // MaxMemCompactionLevel test + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assert(opt.maxMemCompactionLevel() == intValue); + } + + { // TargetFileSizeBase test + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assert(opt.targetFileSizeBase() == longValue); + } + + { // TargetFileSizeMultiplier test + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assert(opt.targetFileSizeMultiplier() == intValue); + } + + { // MaxBytesForLevelBase test + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assert(opt.maxBytesForLevelBase() == longValue); + } + + { // MaxBytesForLevelMultiplier test + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assert(opt.maxBytesForLevelMultiplier() == intValue); + } + + { // ExpandedCompactionFactor test + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assert(opt.expandedCompactionFactor() == intValue); + } + + { // SourceCompactionFactor test + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assert(opt.sourceCompactionFactor() == intValue); + } + + { // MaxGrandparentOverlapFactor test + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assert(opt.maxGrandparentOverlapFactor() == intValue); + } + + { // SoftRateLimit test + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assert(opt.softRateLimit() == doubleValue); + } + + { // HardRateLimit test + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assert(opt.hardRateLimit() == doubleValue); + } + + { // RateLimitDelayMaxMilliseconds test + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assert(opt.rateLimitDelayMaxMilliseconds() == intValue); + } + + { // ArenaBlockSize test + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert(opt.arenaBlockSize() == longValue); + } + + { // DisableAutoCompactions test + boolean boolValue = 
rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assert(opt.disableAutoCompactions() == boolValue); + } + + { // PurgeRedundantKvsWhileFlush test + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assert(opt.purgeRedundantKvsWhileFlush() == boolValue); + } + + { // VerifyChecksumsInCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assert(opt.verifyChecksumsInCompaction() == boolValue); + } + + { // FilterDeletes test + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assert(opt.filterDeletes() == boolValue); + } + + { // MaxSequentialSkipInIterations test + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assert(opt.maxSequentialSkipInIterations() == longValue); + } + + { // InplaceUpdateSupport test + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assert(opt.inplaceUpdateSupport() == boolValue); + } + + { // InplaceUpdateNumLocks test + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert(opt.inplaceUpdateNumLocks() == longValue); + } + + { // MemtablePrefixBloomBits test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assert(opt.memtablePrefixBloomBits() == intValue); + } + + { // MemtablePrefixBloomProbes test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomProbes(intValue); + assert(opt.memtablePrefixBloomProbes() == intValue); + } + + { // BloomLocality test + int intValue = rand.nextInt(); + opt.setBloomLocality(intValue); + assert(opt.bloomLocality() == intValue); + } + + { // MaxSuccessiveMerges test + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert(opt.maxSuccessiveMerges() == longValue); + } + + { // MinPartialMergeOperands test + int intValue = rand.nextInt(); + opt.setMinPartialMergeOperands(intValue); + assert(opt.minPartialMergeOperands() == intValue); + } opt.dispose(); - System.out.println("Passed OptionsTest"); + } + + @Test + public void linkageOfPrepMethods() { + Options options = new Options(); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(4000); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(3000); + options.optimizeForPointLookup(10); + options.prepareForBulkLoad(); + } + + @Test + public void compressionTypes() { + Options options = new Options(); + for(CompressionType compressionType : + CompressionType.values()) { + options.setCompressionType(compressionType); + assertThat(options.compressionType()). + isEqualTo(compressionType); + } + options.dispose(); + } + + @Test + public void compactionStyles() { + Options options = new Options(); + for (CompactionStyle compactionStyle : + CompactionStyle.values()) { + options.setCompactionStyle(compactionStyle); + assertThat(options.compactionStyle()). 
+ isEqualTo(compactionStyle); + } + options.dispose(); + } + + @Test + public void rateLimiterConfig() { + Options options = new Options(); + RateLimiterConfig rateLimiterConfig = + new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + options.dispose(); + // Test with parameter initialization + Options anotherOptions = new Options(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + anotherOptions.dispose(); + } + + @Test + public void shouldSetTestPrefixExtractor() { + Options options = new Options(); + options.useFixedLengthPrefixExtractor(100); + options.useFixedLengthPrefixExtractor(10); + options.dispose(); + } + + @Test + public void shouldTestMemTableFactoryName() + throws RocksDBException { + Options options = new Options(); + options.setMemTableConfig(new VectorMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("VectorRepFactory"); + options.setMemTableConfig( + new HashLinkedListMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("HashLinkedListRepFactory"); + options.dispose(); + } + + @Test + public void statistics() { + Options options = new Options(); + Statistics statistics = options.createStatistics(). + statisticsPtr(); + assertThat(statistics).isNotNull(); + + Options anotherOptions = new Options(); + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); } } diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index 1891cd19a..abd2cda12 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -5,42 +5,44 @@ package org.rocksdb.test; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.EncodingType; import org.rocksdb.PlainTableConfig; +import static org.assertj.core.api.Assertions.assertThat; + public class PlainTableConfigTest { @ClassRule public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); - @AfterClass - public static void printMessage(){ - System.out.println("Passed PlainTableConfigTest."); - } - @Test - public void shouldTestPlainTableConfig() { + public void plainTableConfig() { PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setKeySize(5); - assert(plainTableConfig.keySize() == 5); + assertThat(plainTableConfig.keySize()). + isEqualTo(5); plainTableConfig.setBloomBitsPerKey(11); - assert(plainTableConfig.bloomBitsPerKey() == 11); + assertThat(plainTableConfig.bloomBitsPerKey()). + isEqualTo(11); plainTableConfig.setHashTableRatio(0.95); - assert(plainTableConfig.hashTableRatio() == 0.95); + assertThat(plainTableConfig.hashTableRatio()). + isEqualTo(0.95); plainTableConfig.setIndexSparseness(18); - assert(plainTableConfig.indexSparseness() == 18); + assertThat(plainTableConfig.indexSparseness()). + isEqualTo(18); plainTableConfig.setHugePageTlbSize(1); - assert(plainTableConfig.hugePageTlbSize() == 1); + assertThat(plainTableConfig.hugePageTlbSize()). 
+ isEqualTo(1); plainTableConfig.setEncodingType(EncodingType.kPrefix); - assert(plainTableConfig.encodingType().equals( - EncodingType.kPrefix)); + assertThat(plainTableConfig.encodingType()).isEqualTo( + EncodingType.kPrefix); plainTableConfig.setFullScanMode(true); - assert(plainTableConfig.fullScanMode()); + assertThat(plainTableConfig.fullScanMode()).isTrue(); plainTableConfig.setStoreIndexInFile(true); - assert(plainTableConfig.storeIndexInFile()); + assertThat(plainTableConfig.storeIndexInFile()). + isTrue(); } } diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java index 057d2d4b8..1151f93dc 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/org/rocksdb/test/ReadOnlyTest.java @@ -13,6 +13,8 @@ import org.rocksdb.*; import java.util.ArrayList; import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; + public class ReadOnlyTest { @ClassRule @@ -23,119 +25,204 @@ public class ReadOnlyTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestReadOnlyOpen() { - RocksDB db = null, db2 = null, db3 = null; + public void readOnlyOpen() throws RocksDBException { + RocksDB db, db2, db3; List columnFamilyHandleList = new ArrayList<>(); - List db2ColumnFamilyHandleList = + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + List readOnlyColumnFamilyHandleList2 = new ArrayList<>(); - List db3ColumnFamilyHandleList = + + Options options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath()); + assertThat("value"). + isEqualTo(new String(db2.get("key".getBytes()))); + db.close(); + db2.close(); + + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + + db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), + "value2".getBytes()); + + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + assertThat(db2.get("key2".getBytes())).isNull(); + assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())). 
+ isNull(); + + cfDescriptors.clear(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + cfDescriptors.add( + new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions())); + db3 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList2); + assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), + "key2".getBytes()))).isEqualTo("value2"); + db.close(); + db2.close(); + db3.close(); + options.dispose(); + } + + @Test(expected = RocksDBException.class) + public void failToWriteInReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = new ArrayList<>(); Options options = new Options(); options.setCreateIfMissing(true); - try { - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.put("key".getBytes(), "value".getBytes()); - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath()); - assert("value".equals(new String(db2.get("key".getBytes())))); - db.close(); - db2.close(); - - - List cfNames = - new ArrayList(); - cfNames.add(new ColumnFamilyDescriptor("default")); - - db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - "value2".getBytes()); - - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfNames, db2ColumnFamilyHandleList); - assert(db2.get("key2".getBytes())==null); - assert(db2.get(columnFamilyHandleList.get(0), "key2".getBytes())==null); - - List cfNewName = - new ArrayList<>(); - cfNewName.add(new ColumnFamilyDescriptor("default")); - cfNewName.add(new ColumnFamilyDescriptor("new_cf2")); - db3 = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), cfNewName, db3ColumnFamilyHandleList); - assert(new String(db3.get(db3ColumnFamilyHandleList.get(1), - "key2".getBytes())).equals("value2")); - }catch (RocksDBException e){ - e.printStackTrace(); - assert(false); - } + + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); // test that put fails in readonly mode - try { - db2.put("key".getBytes(), "value".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - try { - db3.put(db3ColumnFamilyHandleList.get(1), - "key".getBytes(), "value".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - // test that remove fails in readonly mode - try { - db2.remove("key".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - try { - db3.remove(db3ColumnFamilyHandleList.get(1), - "key".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - // test that write fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); + } + + @Test(expected = RocksDBException.class) + public void failToCFWriteInReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new 
ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + Options options = new Options(); + options.setCreateIfMissing(true); + + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + } + + @Test(expected = RocksDBException.class) + public void failToRemoveInReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + Options options = new Options(); + options.setCreateIfMissing(true); + + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove("key".getBytes()); + } + + @Test(expected = RocksDBException.class) + public void failToCFRemoveInReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + Options options = new Options(); + options.setCreateIfMissing(true); + + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes()); + } + + @Test(expected = RocksDBException.class) + public void failToWriteBatchReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + Options options = new Options(); + options.setCreateIfMissing(true); + + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + WriteBatch wb = new WriteBatch(); wb.put("key".getBytes(), "value".getBytes()); - try { - db2.write(new WriteOptions(), wb); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - wb.dispose(); - wb = new WriteBatch(); - wb.put(db3ColumnFamilyHandleList.get(1), - "key".getBytes(), "value".getBytes()); - try { - db3.write(new WriteOptions(), wb); - assert(false); - } catch (RocksDBException e) { - assert(true); - } - wb.dispose(); - // cleanup c++ pointers - for (ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.dispose(); - } + rDb.write(new WriteOptions(), wb); + } + + @Test(expected = RocksDBException.class) + public void failToCFWriteBatchReadOnly() throws RocksDBException { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + Options options = new Options(); + options.setCreateIfMissing(true); 
+ + RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); db.close(); - for (ColumnFamilyHandle columnFamilyHandle : - db2ColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - db2.close(); - for (ColumnFamilyHandle columnFamilyHandle : - db3ColumnFamilyHandleList) { - columnFamilyHandle.dispose(); - } - db3.close(); - System.out.println("Passed ReadOnlyTest."); + + RocksDB rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + WriteBatch wb = new WriteBatch(); + wb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + rDb.write(new WriteOptions(), wb); } } diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java index e00e143cf..5b58fe5e9 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -7,7 +7,6 @@ package org.rocksdb.test; import java.util.Random; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -25,13 +24,8 @@ public class ReadOptionsTest { @Rule public ExpectedException exception = ExpectedException.none(); - @AfterClass - public static void printMessage(){ - System.out.println("Passed ReadOptionsTest."); - } - @Test - public void shouldTestReadOptions() { + public void readOptions() { ReadOptions opt = new ReadOptions(); Random rand = new Random(); { // VerifyChecksums test @@ -60,49 +54,49 @@ public class ReadOptionsTest { } @Test - public void shouldFailVerifyChecksumUninitialized(){ + public void failVerifyChecksumUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.setVerifyChecksums(true); } @Test - public void shouldFailSetFillCacheUninitialized(){ + public void failSetFillCacheUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.setFillCache(true); } @Test - public void shouldFailFillCacheUninitialized(){ + public void failFillCacheUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.fillCache(); } @Test - public void shouldFailSetTailingUninitialized(){ + public void failSetTailingUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.setTailing(true); } @Test - public void shouldFailTailingUninitialized(){ + public void failTailingUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.tailing(); } @Test - public void shouldFailSetSnapshotUninitialized(){ + public void failSetSnapshotUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.setSnapshot(null); } @Test - public void shouldFailSnapshotUninitialized(){ + public void failSnapshotUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.snapshot(); diff --git a/java/org/rocksdb/test/RocksIteratorTest.java b/java/org/rocksdb/test/RocksIteratorTest.java index 7de27cad9..d2dae63aa 100644 --- a/java/org/rocksdb/test/RocksIteratorTest.java +++ b/java/org/rocksdb/test/RocksIteratorTest.java @@ -23,7 +23,7 @@ public class RocksIteratorTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestRocksIteratorGc() + public void rocksIteratorGc() throws RocksDBException { RocksDB db; Options options = new Options(); @@ -44,6 +44,5 @@ public class RocksIteratorTest { iter3.dispose(); System.gc(); 
System.runFinalization(); - System.out.println("Passed RocksIteratorTest."); } } diff --git a/java/org/rocksdb/test/RocksJunitRunner.java b/java/org/rocksdb/test/RocksJunitRunner.java new file mode 100644 index 000000000..61655f33c --- /dev/null +++ b/java/org/rocksdb/test/RocksJunitRunner.java @@ -0,0 +1,65 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb.test; + +import org.junit.internal.JUnitSystem; +import org.junit.internal.RealSystem; +import org.junit.internal.TextListener; +import org.junit.runner.Description; +import org.junit.runner.JUnitCore; + +import java.util.ArrayList; +import java.util.List; + +/** + * Custom Junit Runner to print also Test classes + * and executed methods to command prompt. + */ +public class RocksJunitRunner { + + /** + * Listener which overrides default functionality + * to print class and method to system out. + */ + static class RocksJunitListener extends TextListener { + + /** + * RocksJunitListener constructor + * + * @param system JUnitSystem + */ + public RocksJunitListener(JUnitSystem system) { + super(system); + } + + @Override + public void testStarted(Description description) { + System.out.format("Run: %s testing now -> %s \n", + description.getClassName(), + description.getMethodName()); + } + } + + /** + * Main method to execute tests + * + * @param args Test classes as String names + */ + public static void main(String[] args){ + JUnitCore runner = new JUnitCore(); + final JUnitSystem system = new RealSystem(); + runner.addListener(new RocksJunitListener(system)); + try { + List> classes = new ArrayList<>(); + for (String arg : args) { + classes.add(Class.forName(arg)); + } + runner.run(classes.toArray(new Class[1])); + + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + } +} diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index ad3546de3..7140a1fcb 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -14,6 +14,8 @@ import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.Snapshot; +import static org.assertj.core.api.Assertions.assertThat; + public class SnapshotTest { @ClassRule @@ -24,7 +26,7 @@ public class SnapshotTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestSnapshots() throws RocksDBException { + public void snapshots() throws RocksDBException { RocksDB db; Options options = new Options(); options.setCreateIfMissing(true); @@ -37,43 +39,43 @@ public class SnapshotTest { // set snapshot in ReadOptions readOptions.setSnapshot(snapshot); // retrieve key value pair - assert(new String(db.get("key".getBytes())) - .equals("value")); + assertThat(new String(db.get("key".getBytes()))). 
+ isEqualTo("value"); // retrieve key value pair created before // the snapshot was made - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); // add new key/value pair db.put("newkey".getBytes(), "newvalue".getBytes()); // using no snapshot the latest db entries // will be taken into account - assert(new String(db.get("newkey".getBytes())) - .equals("newvalue")); + assertThat(new String(db.get("newkey".getBytes()))). + isEqualTo("newvalue"); // snapshopot was created before newkey - assert(db.get(readOptions, "newkey".getBytes()) - == null); + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); // Retrieve snapshot from read options Snapshot sameSnapshot = readOptions.snapshot(); readOptions.setSnapshot(sameSnapshot); // results must be the same with new Snapshot // instance using the same native pointer - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); // update key value pair to newvalue db.put("key".getBytes(), "newvalue".getBytes()); // read with previously created snapshot will // read previous version of key value pair - assert(new String(db.get(readOptions, - "key".getBytes())).equals("value")); + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); // read for newkey using the snapshot must be // null - assert(db.get(readOptions, "newkey".getBytes()) - == null); + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); // setting null to snapshot in ReadOptions leads // to no Snapshot being used. readOptions.setSnapshot(null); - assert(new String(db.get(readOptions, - "newkey".getBytes())).equals("newvalue")); + assertThat(new String(db.get(readOptions, + "newkey".getBytes()))).isEqualTo("newvalue"); // release Snapshot db.releaseSnapshot(snapshot); // Close database diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java index b748c21ce..57842af10 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/org/rocksdb/test/StatisticsCollectorTest.java @@ -25,7 +25,7 @@ public class StatisticsCollectorTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void shouldTestStatisticsCollector() + public void statisticsCollector() throws InterruptedException, RocksDBException { Options opt = new Options().createStatistics().setCreateIfMissing(true); Statistics stats = opt.statisticsPtr(); @@ -49,7 +49,5 @@ public class StatisticsCollectorTest { db.close(); opt.dispose(); - - System.out.println("Stats collector test passed.!"); } } diff --git a/java/org/rocksdb/test/WriteBatchTest.java b/java/org/rocksdb/test/WriteBatchTest.java index 72e0e464e..cf855c121 100644 --- a/java/org/rocksdb/test/WriteBatchTest.java +++ b/java/org/rocksdb/test/WriteBatchTest.java @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
package org.rocksdb.test; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -36,19 +35,14 @@ public class WriteBatchTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - @AfterClass - public static void printMergePass(){ - System.out.println("Passed WriteBatchTest."); - } - @Test - public void shouldTestEmptyWriteBatch() { + public void emptyWriteBatch() { WriteBatch batch = new WriteBatch(); assertThat(batch.count()).isEqualTo(0); } @Test - public void shouldTestMultipleBatchOperations() + public void multipleBatchOperations() throws UnsupportedEncodingException { WriteBatch batch = new WriteBatch(); batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); @@ -66,7 +60,7 @@ public class WriteBatchTest { } @Test - public void shouldTestAppendOperation() + public void testAppendOperation() throws UnsupportedEncodingException { WriteBatch b1 = new WriteBatch(); WriteBatch b2 = new WriteBatch(); @@ -97,7 +91,7 @@ public class WriteBatchTest { } @Test - public void shouldTestBlobOperation() + public void blobOperation() throws UnsupportedEncodingException { WriteBatch batch = new WriteBatch(); batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); diff --git a/java/org/rocksdb/test/WriteOptionsTest.java b/java/org/rocksdb/test/WriteOptionsTest.java index 7a92bf9fa..70a68335d 100644 --- a/java/org/rocksdb/test/WriteOptionsTest.java +++ b/java/org/rocksdb/test/WriteOptionsTest.java @@ -5,7 +5,6 @@ package org.rocksdb.test; -import org.junit.AfterClass; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.WriteOptions; @@ -18,13 +17,8 @@ public class WriteOptionsTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); - @AfterClass - public static void printMessage(){ - System.out.println("Passed WriteOptionsTest."); - } - @Test - public void shouldTestWriteOptions(){ + public void writeOptions(){ WriteOptions writeOptions = new WriteOptions(); writeOptions.setDisableWAL(true); assertThat(writeOptions.disableWAL()).isTrue(); diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 2966a8a6b..552e26f8e 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -159,6 +159,7 @@ org.mockito mockito-all 1.9.5 + test From a4b28c1ae7ed6697779d68a8ed61153866228163 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 5 Nov 2014 21:16:35 +0100 Subject: [PATCH 493/829] [RocksJava] Extended Testcases --- java/org/rocksdb/test/EnvironmentTest.java | 125 +++++++++++++++++++ java/org/rocksdb/test/RocksIteratorTest.java | 38 ++++++ 2 files changed, 163 insertions(+) create mode 100644 java/org/rocksdb/test/EnvironmentTest.java diff --git a/java/org/rocksdb/test/EnvironmentTest.java b/java/org/rocksdb/test/EnvironmentTest.java new file mode 100644 index 000000000..c6542afed --- /dev/null +++ b/java/org/rocksdb/test/EnvironmentTest.java @@ -0,0 +1,125 @@ +package org.rocksdb.test; + +import org.junit.Test; +import org.rocksdb.util.Environment; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; + +import static org.assertj.core.api.Assertions.assertThat; + +public class EnvironmentTest { + + // Init static context + private static Environment environment = + new Environment(); + + @Test + public void mac32() { + setEnvironmentClassFields("mac", "32"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryName("rocksdb")). 
+ isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.dylib"); + } + + @Test + public void mac64() { + setEnvironmentClassFields("mac", "64"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.dylib"); + } + + @Test + public void nix32() { + // Linux + setEnvironmentClassFields("Linux", "32"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // UNIX + setEnvironmentClassFields("Unix", "32"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // AIX + setEnvironmentClassFields("aix", "32"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.so"); + } + + @Test + public void nix64() { + setEnvironmentClassFields("Linux", "x64"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // UNIX + setEnvironmentClassFields("Unix", "x64"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // AIX + setEnvironmentClassFields("aix", "x64"); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryName("rocksdb")). 
+ isEqualTo("librocksdbjni.so"); + } + + @Test(expected = UnsupportedOperationException.class) + public void failLinuxJniLibraryName(){ + setEnvironmentClassFields("win", "x64"); + Environment.getJniLibraryName("rocksdb"); + } + + @Test(expected = UnsupportedOperationException.class) + public void failWinSharedLibrary(){ + setEnvironmentClassFields("win", "x64"); + Environment.getSharedLibraryName("rocksdb"); + } + + private void setEnvironmentClassFields(String osName, + String osArch) { + setEnvironmentClassField("OS", osName); + setEnvironmentClassField("ARCH", osArch); + } + + private void setEnvironmentClassField(String fieldName, String value) { + final Field field; + try { + field = Environment.class.getDeclaredField(fieldName); + field.setAccessible(true); + final Field modifiersField = Field.class.getDeclaredField("modifiers"); + modifiersField.setAccessible(true); + modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); + field.set(null, value); + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } +} diff --git a/java/org/rocksdb/test/RocksIteratorTest.java b/java/org/rocksdb/test/RocksIteratorTest.java index d2dae63aa..961b7c789 100644 --- a/java/org/rocksdb/test/RocksIteratorTest.java +++ b/java/org/rocksdb/test/RocksIteratorTest.java @@ -13,6 +13,8 @@ import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.RocksIterator; +import static org.assertj.core.api.Assertions.assertThat; + public class RocksIteratorTest { @ClassRule @@ -22,6 +24,42 @@ public class RocksIteratorTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Test + public void rocksIterator() throws RocksDBException { + RocksDB db; + Options options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + RocksIterator iterator = db.newIterator(); + + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + iterator.seekToLast(); + iterator.prev(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.seekToFirst(); + iterator.seekToLast(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.status(); + } + @Test public void rocksIteratorGc() throws RocksDBException { From 628e39e97df75554720063cb43c2bddce99bfec4 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 7 Nov 2014 14:38:21 +0100 Subject: [PATCH 494/829] [RocksJava] Integrated review comments from D28209 --- java/Makefile | 11 +- java/org/rocksdb/Options.java | 1 - java/org/rocksdb/test/BackupableDBTest.java | 244 ++++---- java/org/rocksdb/test/ComparatorTest.java | 229 ++++---- java/org/rocksdb/test/FilterTest.java | 47 +- java/org/rocksdb/test/KeyMayExistTest.java | 80 ++- java/org/rocksdb/test/MemTableTest.java | 205 ++++--- 
java/org/rocksdb/test/MergeTest.java | 353 ++++++----- java/org/rocksdb/test/OptionsTest.java | 553 ++++++++++-------- java/org/rocksdb/test/ReadOnlyTest.java | 441 ++++++++------ java/org/rocksdb/test/ReadOptionsTest.java | 50 +- java/org/rocksdb/test/RocksEnvTest.java | 39 ++ java/org/rocksdb/test/RocksIteratorTest.java | 120 ++-- java/org/rocksdb/test/SnapshotTest.java | 117 ++-- .../rocksdb/test/StatisticsCollectorTest.java | 52 +- java/rocksjni/backupablejni.cc | 2 +- java/rocksjni/options.cc | 11 + 17 files changed, 1514 insertions(+), 1041 deletions(-) create mode 100644 java/org/rocksdb/test/RocksEnvTest.java diff --git a/java/Makefile b/java/Makefile index 83bf34afc..3c7b8bb1a 100644 --- a/java/Makefile +++ b/java/Makefile @@ -66,6 +66,7 @@ JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\ org.rocksdb.test.PlainTableConfigTest\ org.rocksdb.test.ReadOnlyTest\ org.rocksdb.test.ReadOptionsTest\ + org.rocksdb.test.RocksEnvTest\ org.rocksdb.test.RocksIteratorTest\ org.rocksdb.test.SnapshotTest\ org.rocksdb.test.StatisticsCollectorTest\ @@ -112,11 +113,11 @@ column_family_sample: java resolve_test_deps: mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || curl -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12-beta-2/junit-4.12-beta-2.jar - test -s "$(JAVA_HAMCR_JAR)" || curl -L -o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || curl -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.9.5/mockito-all-1.9.5.jar - test -s "$(JAVA_CGLIB_JAR)" || curl -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || curl -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar + test -s "$(JAVA_JUNIT_JAR)" || curl -k -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12-beta-2/junit-4.12-beta-2.jar + test -s "$(JAVA_HAMCR_JAR)" || curl -k -L -o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar + test -s "$(JAVA_MOCKITO_JAR)" || curl -k -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.9.5/mockito-all-1.9.5.jar + test -s "$(JAVA_CGLIB_JAR)" || curl -k -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar + test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar test: java resolve_test_deps javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 7307608af..c5ea7216e 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -1010,7 +1010,6 @@ public class Options extends RocksObject long cfOptHandle); private native void disposeInternal(long handle); private native void setEnv(long optHandle, long envHandle); - private native long getEnvHandle(long handle); private native void prepareForBulkLoad(long handle); // DB native handles diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index aa6c07c14..55a707687 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ 
b/java/org/rocksdb/test/BackupableDBTest.java @@ -29,117 +29,137 @@ public class BackupableDBTest { @Test public void backupableDb() throws RocksDBException { - - Options opt = new Options(); - opt.setCreateIfMissing(true); - - BackupableDBOptions bopt = new BackupableDBOptions( - backupFolder.getRoot().getAbsolutePath(), false, - true, false, true, 0, 0); - BackupableDB bdb; - List backupInfos; - List restoreInfos; - - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - bdb.put("abc".getBytes(), "def".getBytes()); - bdb.put("ghi".getBytes(), "jkl".getBytes()); - - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(0); - - bdb.createNewBackup(true); - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(1); - - // Retrieving backup infos twice shall not - // lead to different results - List tmpBackupInfo = bdb.getBackupInfos(); - assertThat(tmpBackupInfo.get(0).backupId()). - isEqualTo(backupInfos.get(0).backupId()); - assertThat(tmpBackupInfo.get(0).timestamp()). - isEqualTo(backupInfos.get(0).timestamp()); - assertThat(tmpBackupInfo.get(0).size()). - isEqualTo(backupInfos.get(0).size()); - assertThat(tmpBackupInfo.get(0).numberFiles()). - isEqualTo(backupInfos.get(0).numberFiles()); - - // delete record after backup - bdb.remove("abc".getBytes()); - byte[] value = bdb.get("abc".getBytes()); - assertThat(value).isNull(); - bdb.close(); - - // restore from backup - RestoreOptions ropt = new RestoreOptions(false); - RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); - - // getting backup infos from restorable db should - // lead to the same infos as from backupable db - restoreInfos = rdb.getBackupInfos(); - assertThat(restoreInfos.size()). - isEqualTo(backupInfos.size()); - assertThat(restoreInfos.get(0).backupId()). - isEqualTo(backupInfos.get(0).backupId()); - assertThat(restoreInfos.get(0).timestamp()). - isEqualTo(backupInfos.get(0).timestamp()); - assertThat(restoreInfos.get(0).size()). - isEqualTo(backupInfos.get(0).size()); - assertThat(restoreInfos.get(0).numberFiles()). - isEqualTo(backupInfos.get(0).numberFiles()); - - rdb.restoreDBFromLatestBackup( - dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - ropt); - // do nothing because there is only one backup - rdb.purgeOldBackups(1); - restoreInfos = rdb.getBackupInfos(); - assertThat(restoreInfos.size()). - isEqualTo(1); - rdb.dispose(); - ropt.dispose(); - - // verify that backed up data contains deleted record - bdb = BackupableDB.open(opt, bopt, - dbFolder.getRoot().getAbsolutePath()); - value = bdb.get("abc".getBytes()); - assertThat(new String(value)). - isEqualTo("def"); - - bdb.createNewBackup(false); - // after new backup there must be two backup infos - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(2); - // deleting the backup must be possible using the - // id provided by backup infos - bdb.deleteBackup(backupInfos.get(1).backupId()); - // after deletion there should only be one info - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(1); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(4); - // purge everything and keep two - bdb.purgeOldBackups(2); - // backup infos need to be two - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(2); - assertThat(backupInfos.get(0).backupId()). 
- isEqualTo(4); - assertThat(backupInfos.get(1).backupId()). - isEqualTo(5); - - opt.dispose(); - bopt.dispose(); - bdb.close(); + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreOptions ropt = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options(); + opt.setCreateIfMissing(true); + + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath(), false, + true, false, true, 0, 0); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + + List backupInfos; + List restoreInfos; + + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + bdb.put("abc".getBytes(), "def".getBytes()); + bdb.put("ghi".getBytes(), "jkl".getBytes()); + + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(0); + + bdb.createNewBackup(true); + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(1); + + // Retrieving backup infos twice shall not + // lead to different results + List tmpBackupInfo = bdb.getBackupInfos(); + assertThat(tmpBackupInfo.get(0).backupId()). + isEqualTo(backupInfos.get(0).backupId()); + assertThat(tmpBackupInfo.get(0).timestamp()). + isEqualTo(backupInfos.get(0).timestamp()); + assertThat(tmpBackupInfo.get(0).size()). + isEqualTo(backupInfos.get(0).size()); + assertThat(tmpBackupInfo.get(0).numberFiles()). + isEqualTo(backupInfos.get(0).numberFiles()); + + // delete record after backup + bdb.remove("abc".getBytes()); + byte[] value = bdb.get("abc".getBytes()); + assertThat(value).isNull(); + bdb.close(); + + // restore from backup + ropt = new RestoreOptions(false); + rdb = new RestoreBackupableDB(bopt); + + // getting backup infos from restorable db should + // lead to the same infos as from backupable db + restoreInfos = rdb.getBackupInfos(); + assertThat(restoreInfos.size()). + isEqualTo(backupInfos.size()); + assertThat(restoreInfos.get(0).backupId()). + isEqualTo(backupInfos.get(0).backupId()); + assertThat(restoreInfos.get(0).timestamp()). + isEqualTo(backupInfos.get(0).timestamp()); + assertThat(restoreInfos.get(0).size()). + isEqualTo(backupInfos.get(0).size()); + assertThat(restoreInfos.get(0).numberFiles()). + isEqualTo(backupInfos.get(0).numberFiles()); + + rdb.restoreDBFromLatestBackup( + dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), + ropt); + // do nothing because there is only one backup + rdb.purgeOldBackups(1); + restoreInfos = rdb.getBackupInfos(); + assertThat(restoreInfos.size()). + isEqualTo(1); + rdb.dispose(); + ropt.dispose(); + + // verify that backed up data contains deleted record + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + value = bdb.get("abc".getBytes()); + assertThat(new String(value)). + isEqualTo("def"); + + bdb.createNewBackup(false); + // after new backup there must be two backup infos + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(2); + // deleting the backup must be possible using the + // id provided by backup infos + bdb.deleteBackup(backupInfos.get(1).backupId()); + // after deletion there should only be one info + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(1); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + bdb.createNewBackup(false); + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). 
+ isEqualTo(4); + // purge everything and keep two + bdb.purgeOldBackups(2); + // backup infos need to be two + backupInfos = bdb.getBackupInfos(); + assertThat(backupInfos.size()). + isEqualTo(2); + assertThat(backupInfos.get(0).backupId()). + isEqualTo(4); + assertThat(backupInfos.get(1).backupId()). + isEqualTo(5); + } finally { + if (opt != null) { + opt.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (bdb != null) { + bdb.close(); + } + if (ropt != null) { + ropt.dispose(); + } + if (rdb != null) { + rdb.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index c9037954e..299d8f62d 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -54,113 +54,138 @@ public class ComparatorTest { @Test public void builtinForwardComparator() throws RocksDBException { - Options options = new Options(); - options.setCreateIfMissing(true); - options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); - RocksDB rocksDB = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - - rocksDB.put("abc1".getBytes(), "abc1".getBytes()); - rocksDB.put("abc2".getBytes(), "abc2".getBytes()); - rocksDB.put("abc3".getBytes(), "abc3".getBytes()); - - RocksIterator rocksIterator = rocksDB.newIterator(); - // Iterate over keys using a iterator - rocksIterator.seekToFirst(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc2".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc2".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isFalse(); - // Get last one - rocksIterator.seekToLast(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - // Seek for abc - rocksIterator.seek("abc".getBytes()); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - rocksIterator.dispose(); - rocksDB.close(); - options.dispose(); + Options options = null; + RocksDB rocksDB = null; + RocksIterator rocksIterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + 
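
All of these test conversions apply the same resource-cleanup shape: every native handle (Options, RocksDB, RocksIterator, BackupableDB, and so on) is declared as null before a try block, created inside it, and released in the finally block in reverse order of creation behind a null check, children before parents. A minimal, self-contained sketch of that shape follows; it uses only calls that already appear in these tests, while the class name and database path are purely illustrative. Later RocksJava releases make these classes AutoCloseable so the same cleanup can be written with try-with-resources, but at this point in the history dispose()/close() in finally is the available mechanism.

import org.rocksdb.*;

public class DisposePatternSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    Options options = null;
    RocksDB db = null;
    RocksIterator it = null;
    try {
      options = new Options();
      options.setCreateIfMissing(true);
      // illustrative path; the tests use a JUnit TemporaryFolder instead
      db = RocksDB.open(options, "/tmp/rocksdb-dispose-example");
      db.put("key".getBytes(), "value".getBytes());
      it = db.newIterator();
      for (it.seekToFirst(); it.isValid(); it.next()) {
        // consume it.key() / it.value() here
      }
    } finally {
      // release native handles in reverse order of creation
      if (it != null) {
        it.dispose();
      }
      if (db != null) {
        db.close();
      }
      if (options != null) {
        options.dispose();
      }
    }
  }
}
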
assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + // Seek for abc + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + } + if (rocksDB != null) { + rocksDB.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test public void builtinReverseComparator() throws RocksDBException { - Options options = new Options(); - options.setCreateIfMissing(true); - options.setComparator( - BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); - RocksDB rocksDB = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - - rocksDB.put("abc1".getBytes(), "abc1".getBytes()); - rocksDB.put("abc2".getBytes(), "abc2".getBytes()); - rocksDB.put("abc3".getBytes(), "abc3".getBytes()); - - RocksIterator rocksIterator = rocksDB.newIterator(); - // Iterate over keys using a iterator - rocksIterator.seekToFirst(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc2".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc2".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - rocksIterator.next(); - assertThat(rocksIterator.isValid()).isFalse(); - // Get last one - rocksIterator.seekToLast(); - assertThat(rocksIterator.isValid()).isTrue(); - assertThat(rocksIterator.key()).isEqualTo( - "abc1".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc1".getBytes()); - // Will be invalid because abc is after abc1 - rocksIterator.seek("abc".getBytes()); - assertThat(rocksIterator.isValid()).isFalse(); - // Will be abc3 because the next one after abc999 - // is abc3 - rocksIterator.seek("abc999".getBytes()); - assertThat(rocksIterator.key()).isEqualTo( - "abc3".getBytes()); - assertThat(rocksIterator.value()).isEqualTo( - "abc3".getBytes()); - rocksIterator.dispose(); - rocksDB.close(); - options.dispose(); + Options options = null; + RocksDB rocksDB = null; + RocksIterator rocksIterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + options.setComparator( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); + rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + 
assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + // Will be invalid because abc is after abc1 + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isFalse(); + // Will be abc3 because the next one after abc999 + // is abc3 + rocksIterator.seek("abc999".getBytes()); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + } + if (rocksDB != null) { + rocksDB.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test diff --git a/java/org/rocksdb/test/FilterTest.java b/java/org/rocksdb/test/FilterTest.java index c183f8d95..da4783fbf 100644 --- a/java/org/rocksdb/test/FilterTest.java +++ b/java/org/rocksdb/test/FilterTest.java @@ -17,25 +17,32 @@ public class FilterTest { @Test public void filter() { - Options options = new Options(); - // test table config - options.setTableFormatConfig(new BlockBasedTableConfig(). - setFilter(new BloomFilter())); - options.dispose(); - System.gc(); - System.runFinalization(); - // new Bloom filter - options = new Options(); - BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); - blockConfig.setFilter(new BloomFilter()); - options.setTableFormatConfig(blockConfig); - BloomFilter bloomFilter = new BloomFilter(10); - blockConfig.setFilter(bloomFilter); - options.setTableFormatConfig(blockConfig); - System.gc(); - System.runFinalization(); - blockConfig.setFilter(new BloomFilter(10, false)); - options.setTableFormatConfig(blockConfig); - options.dispose(); + Options options = null; + try { + options = new Options(); + // test table config + options.setTableFormatConfig(new BlockBasedTableConfig(). 
+ setFilter(new BloomFilter())); + options.dispose(); + System.gc(); + System.runFinalization(); + // new Bloom filter + options = new Options(); + BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); + blockConfig.setFilter(new BloomFilter()); + options.setTableFormatConfig(blockConfig); + BloomFilter bloomFilter = new BloomFilter(10); + blockConfig.setFilter(bloomFilter); + options.setTableFormatConfig(blockConfig); + System.gc(); + System.runFinalization(); + blockConfig.setFilter(new BloomFilter(10, false)); + options.setTableFormatConfig(blockConfig); + + } finally { + if (options != null) { + options.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index 5f6d6225a..c0613bf53 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -26,6 +26,7 @@ public class KeyMayExistTest { @Test public void keyMayExist() throws RocksDBException { +<<<<<<< HEAD RocksDB db; DBOptions options = new DBOptions(); options.setCreateIfMissing(true) @@ -49,23 +50,50 @@ public class KeyMayExistTest { assertThat(exists).isTrue(); assertThat(retValue.toString()). isEqualTo("value"); +======= + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + // open database using cf names + List cfNames = new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add("default"); + cfNames.add("new_cf"); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + assertThat(columnFamilyHandleList.size()). + isEqualTo(2); + db.put("key".getBytes(), "value".getBytes()); + // Test without column family + StringBuffer retValue = new StringBuffer(); + boolean exists = db.keyMayExist("key".getBytes(), retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); +>>>>>>> [RocksJava] Integrated review comments from D28209 - // Test without column family but with readOptions - retValue = new StringBuffer(); - exists = db.keyMayExist(new ReadOptions(), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); + // Test without column family but with readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); - // Test with column family - retValue = new StringBuffer(); - exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); + // Test with column family + retValue = new StringBuffer(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); +<<<<<<< HEAD // Test with column family and readOptions retValue = new StringBuffer(); exists = db.keyMayExist(new ReadOptions(), @@ -74,9 +102,27 @@ public class KeyMayExistTest { assertThat(exists).isTrue(); assertThat(retValue.toString()). 
isEqualTo("value"); +======= + // Test with column family and readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), + columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); +>>>>>>> [RocksJava] Integrated review comments from D28209 - // KeyMayExist in CF1 must return false - assertThat(db.keyMayExist(columnFamilyHandleList.get(1), - "key".getBytes(), retValue)).isFalse(); + // KeyMayExist in CF1 must return false + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + "key".getBytes(), retValue)).isFalse(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/org/rocksdb/test/MemTableTest.java index 93146303a..dbf6b0bef 100644 --- a/java/org/rocksdb/test/MemTableTest.java +++ b/java/org/rocksdb/test/MemTableTest.java @@ -18,94 +18,121 @@ public class MemTableTest { new RocksMemoryResource(); @Test - public void memTable() throws RocksDBException { - Options options = new Options(); - // Test HashSkipListMemTableConfig - HashSkipListMemTableConfig memTableConfig = - new HashSkipListMemTableConfig(); - assertThat(memTableConfig.bucketCount()). - isEqualTo(1000000); - memTableConfig.setBucketCount(2000000); - assertThat(memTableConfig.bucketCount()). - isEqualTo(2000000); - assertThat(memTableConfig.height()). - isEqualTo(4); - memTableConfig.setHeight(5); - assertThat(memTableConfig.height()). - isEqualTo(5); - assertThat(memTableConfig.branchingFactor()). - isEqualTo(4); - memTableConfig.setBranchingFactor(6); - assertThat(memTableConfig.branchingFactor()). - isEqualTo(6); - options.setMemTableConfig(memTableConfig); - options.dispose(); - System.gc(); - System.runFinalization(); - // Test SkipList - options = new Options(); - SkipListMemTableConfig skipMemTableConfig = - new SkipListMemTableConfig(); - assertThat(skipMemTableConfig.lookahead()). - isEqualTo(0); - skipMemTableConfig.setLookahead(20); - assertThat(skipMemTableConfig.lookahead()). - isEqualTo(20); - options.setMemTableConfig(skipMemTableConfig); - options.dispose(); - System.gc(); - System.runFinalization(); - // Test HashLinkedListMemTableConfig - options = new Options(); - HashLinkedListMemTableConfig hashLinkedListMemTableConfig = - new HashLinkedListMemTableConfig(); - assertThat(hashLinkedListMemTableConfig.bucketCount()). - isEqualTo(50000); - hashLinkedListMemTableConfig.setBucketCount(100000); - assertThat(hashLinkedListMemTableConfig.bucketCount()). - isEqualTo(100000); - assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). - isEqualTo(0); - hashLinkedListMemTableConfig.setHugePageTlbSize(1); - assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). - isEqualTo(1); - assertThat(hashLinkedListMemTableConfig. - bucketEntriesLoggingThreshold()). - isEqualTo(4096); - hashLinkedListMemTableConfig. - setBucketEntriesLoggingThreshold(200); - assertThat(hashLinkedListMemTableConfig. - bucketEntriesLoggingThreshold()). - isEqualTo(200); - assertThat(hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush()).isTrue(); - hashLinkedListMemTableConfig. - setIfLogBucketDistWhenFlush(false); - assertThat(hashLinkedListMemTableConfig. - ifLogBucketDistWhenFlush()).isFalse(); - assertThat(hashLinkedListMemTableConfig. - thresholdUseSkiplist()). - isEqualTo(256); - hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); - assertThat(hashLinkedListMemTableConfig. 
- thresholdUseSkiplist()). - isEqualTo(29); - options.setMemTableConfig(hashLinkedListMemTableConfig); - options.dispose(); - System.gc(); - System.runFinalization(); - // test VectorMemTableConfig - options = new Options(); - VectorMemTableConfig vectorMemTableConfig = - new VectorMemTableConfig(); - assertThat(vectorMemTableConfig.reservedSize()). - isEqualTo(0); - vectorMemTableConfig.setReservedSize(123); - assertThat(vectorMemTableConfig.reservedSize()). - isEqualTo(123); - options.setMemTableConfig(vectorMemTableConfig); - options.dispose(); - System.gc(); - System.runFinalization(); + public void hashSkipListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + // Test HashSkipListMemTableConfig + HashSkipListMemTableConfig memTableConfig = + new HashSkipListMemTableConfig(); + assertThat(memTableConfig.bucketCount()). + isEqualTo(1000000); + memTableConfig.setBucketCount(2000000); + assertThat(memTableConfig.bucketCount()). + isEqualTo(2000000); + assertThat(memTableConfig.height()). + isEqualTo(4); + memTableConfig.setHeight(5); + assertThat(memTableConfig.height()). + isEqualTo(5); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(4); + memTableConfig.setBranchingFactor(6); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(6); + options.setMemTableConfig(memTableConfig); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void skipListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + SkipListMemTableConfig skipMemTableConfig = + new SkipListMemTableConfig(); + assertThat(skipMemTableConfig.lookahead()). + isEqualTo(0); + skipMemTableConfig.setLookahead(20); + assertThat(skipMemTableConfig.lookahead()). + isEqualTo(20); + options.setMemTableConfig(skipMemTableConfig); + options.dispose(); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void hashLinkedListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + HashLinkedListMemTableConfig hashLinkedListMemTableConfig = + new HashLinkedListMemTableConfig(); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(50000); + hashLinkedListMemTableConfig.setBucketCount(100000); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(100000); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(0); + hashLinkedListMemTableConfig.setHugePageTlbSize(1); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(1); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). + isEqualTo(4096); + hashLinkedListMemTableConfig. + setBucketEntriesLoggingThreshold(200); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). + isEqualTo(200); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isTrue(); + hashLinkedListMemTableConfig. + setIfLogBucketDistWhenFlush(false); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isFalse(); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). + isEqualTo(256); + hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). 
+ isEqualTo(29); + options.setMemTableConfig(hashLinkedListMemTableConfig); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void vectorMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + VectorMemTableConfig vectorMemTableConfig = + new VectorMemTableConfig(); + assertThat(vectorMemTableConfig.reservedSize()). + isEqualTo(0); + vectorMemTableConfig.setReservedSize(123); + assertThat(vectorMemTableConfig.reservedSize()). + isEqualTo(123); + options.setMemTableConfig(vectorMemTableConfig); + options.dispose(); + } finally { + if (options != null) { + options.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index f1e2fb759..962674716 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -28,180 +28,227 @@ public class MergeTest { @Test public void stringOption() throws InterruptedException, RocksDBException { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - Options opt = new Options(); - opt.setCreateIfMissing(true); - opt.setMergeOperatorName("stringappend"); - - RocksDB db = RocksDB.open(opt, db_path_string); - // writing aa under key - db.put("key".getBytes(), "aa".getBytes()); - // merge bb under key - db.merge("key".getBytes(), "bb".getBytes()); - - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); - - db.close(); - opt.dispose(); - assertThat(strValue).isEqualTo("aa,bb"); + RocksDB db = null; + Options opt = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new Options(); + opt.setCreateIfMissing(true); + opt.setMergeOperatorName("stringappend"); + + db = RocksDB.open(opt, db_path_string); + // writing aa under key + db.put("key".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } @Test public void cFStringOption() throws InterruptedException, RocksDBException { - DBOptions opt = new DBOptions(); - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - - List cfDescr = - new ArrayList<>(); + RocksDB db = null; + DBOptions opt = null; List columnFamilyHandleList = - new ArrayList<>(); - cfDescr.add(new ColumnFamilyDescriptor("default", - new ColumnFamilyOptions().setMergeOperatorName( - "stringappend"))); - cfDescr.add(new ColumnFamilyDescriptor("default", - new ColumnFamilyOptions().setMergeOperatorName( - "stringappend"))); - RocksDB db = RocksDB.open(opt, db_path_string, - cfDescr, columnFamilyHandleList); - - // writing aa under key - db.put(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "aa".getBytes()); - // merge bb under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "bb".getBytes()); - - byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - String strValue = new String(value); - - for (ColumnFamilyHandle handle : columnFamilyHandleList) { - handle.dispose(); + new ArrayList<>(); + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + + List cfDescriptors = + new ArrayList<>(); + 
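
The MergeTest changes exercise the built-in string-append merge operator in two ways, by name ("stringappend") and by instance (StringAppendOperator), and the assertions check that merge() appends to the stored value with a comma separator, so writing "aa" and merging "bb" reads back as "aa,bb". A condensed sketch of that behaviour follows; the class name and path are illustrative and error handling is reduced to the throws clause.

import org.rocksdb.*;

public class StringAppendSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    Options opt = null;
    RocksDB db = null;
    try {
      opt = new Options();
      opt.setCreateIfMissing(true);
      // configure the operator by name ...
      opt.setMergeOperatorName("stringappend");
      // ... or equivalently by instance:
      // opt.setMergeOperator(new StringAppendOperator());
      db = RocksDB.open(opt, "/tmp/rocksdb-merge-example"); // illustrative path
      db.put("key".getBytes(), "aa".getBytes());
      db.merge("key".getBytes(), "bb".getBytes());
      // prints "aa,bb": merge appended with the default ',' delimiter
      System.out.println(new String(db.get("key".getBytes())));
    } finally {
      if (db != null) {
        db.close();
      }
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}
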
cfDescriptors.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperatorName( + "stringappend"))); + cfDescriptors.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperatorName( + "stringappend"))); + db = RocksDB.open(opt, db_path_string, + cfDescriptors, columnFamilyHandleList); + + // writing aa under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } } - db.close(); - opt.dispose(); - assertThat(strValue).isEqualTo("aa,bb"); } @Test public void operatorOption() throws InterruptedException, RocksDBException { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - Options opt = new Options(); - opt.setCreateIfMissing(true); - - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); - - RocksDB db = RocksDB.open(opt, db_path_string); - // Writing aa under key - db.put("key".getBytes(), "aa".getBytes()); - - // Writing bb under key - db.merge("key".getBytes(), "bb".getBytes()); - - byte[] value = db.get("key".getBytes()); - String strValue = new String(value); - - db.close(); - opt.dispose(); - assertThat(strValue).isEqualTo("aa,bb"); + RocksDB db = null; + Options opt = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new Options(); + opt.setCreateIfMissing(true); + + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + + db = RocksDB.open(opt, db_path_string); + // Writing aa under key + db.put("key".getBytes(), "aa".getBytes()); + + // Writing bb under key + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } @Test public void cFOperatorOption() throws InterruptedException, RocksDBException { - DBOptions opt = new DBOptions(); - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - - opt.setCreateIfMissing(true); - opt.setCreateMissingColumnFamilies(true); - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - - List cfDescr = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - cfDescr.add(new ColumnFamilyDescriptor("default", - new ColumnFamilyOptions().setMergeOperator( - stringAppendOperator))); - cfDescr.add(new ColumnFamilyDescriptor("new_cf", - new ColumnFamilyOptions().setMergeOperator( - stringAppendOperator))); - RocksDB db = RocksDB.open(opt, db_path_string, - cfDescr, columnFamilyHandleList); - // writing aa under key - db.put(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "aa".getBytes()); - // merge bb under key - db.merge(columnFamilyHandleList.get(1), - "cfkey".getBytes(), "bb".getBytes()); - byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - String strValue = new String(value); - - // Test also with createColumnFamily - ColumnFamilyHandle columnFamilyHandle = 
db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2", - new ColumnFamilyOptions().setMergeOperator( - new StringAppendOperator()))); - // writing xx under cfkey2 - db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); - // merge yy under cfkey2 - db.merge(columnFamilyHandle, "cfkey2".getBytes(), "yy".getBytes()); - value = db.get(columnFamilyHandle, "cfkey2".getBytes()); - String strValueTmpCf = new String(value); - - columnFamilyHandle.dispose(); - db.close(); - opt.dispose(); - assertThat(strValue).isEqualTo("aa,bb"); - assertThat(strValueTmpCf).isEqualTo("xx,yy"); + RocksDB db = null; + DBOptions opt = null; + ColumnFamilyHandle columnFamilyHandle = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor("default", + new ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf", + new ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); + db = RocksDB.open(opt, db_path_string, + cfDescriptors, columnFamilyHandleList); + + // writing aa under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + + // Test also with createColumnFamily + columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2")); + // writing xx under cfkey2 + db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); + // merge yy under cfkey2 + db.merge(columnFamilyHandle, "cfkey2".getBytes(), "yy".getBytes()); + value = db.get(columnFamilyHandle, "cfkey2".getBytes()); + String strValueTmpCf = new String(value); + + columnFamilyHandle.dispose(); + assertThat(strValue).isEqualTo("aa,bb"); + assertThat(strValueTmpCf).isEqualTo("xx,yy"); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } @Test public void operatorGcBehaviour() throws RocksDBException { - String db_path_string = - dbFolder.getRoot().getAbsolutePath(); - Options opt = new Options(); - opt.setCreateIfMissing(true); - StringAppendOperator stringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(stringAppendOperator); - RocksDB db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test reuse - opt = new Options(); - opt.setMergeOperator(stringAppendOperator); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test param init - opt = new Options(); - opt.setMergeOperator(new StringAppendOperator()); - db = RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); - System.gc(); - System.runFinalization(); - // test replace one with another merge operator instance - opt = new Options(); - opt.setMergeOperator(stringAppendOperator); - StringAppendOperator newStringAppendOperator = new StringAppendOperator(); - opt.setMergeOperator(newStringAppendOperator); - db = 
RocksDB.open(opt, db_path_string); - db.close(); - opt.dispose(); + Options opt = null; + RocksDB db = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new Options(); + opt.setCreateIfMissing(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test reuse + opt = new Options(); + opt.setMergeOperator(stringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test param init + opt = new Options(); + opt.setMergeOperator(new StringAppendOperator()); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test replace one with another merge operator instance + opt = new Options(); + opt.setMergeOperator(stringAppendOperator); + StringAppendOperator newStringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(newStringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index f8fbd7bcc..24dd5081c 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -21,281 +21,350 @@ public class OptionsTest { @Test public void options() throws RocksDBException { - Options opt = new Options(); - Random rand = PlatformRandomHelper. - getPlatformSpecificRandomFactory(); - - DBOptionsTest.testDBOptions(opt); - - { // WriteBufferSize test - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert(opt.writeBufferSize() == longValue); - } - - { // MaxWriteBufferNumber test - int intValue = rand.nextInt(); - opt.setMaxWriteBufferNumber(intValue); - assert(opt.maxWriteBufferNumber() == intValue); - } - - { // MinWriteBufferNumberToMerge test - int intValue = rand.nextInt(); - opt.setMinWriteBufferNumberToMerge(intValue); - assert(opt.minWriteBufferNumberToMerge() == intValue); - } - - { // NumLevels test - int intValue = rand.nextInt(); - opt.setNumLevels(intValue); - assert(opt.numLevels() == intValue); - } - - { // LevelFileNumCompactionTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroFileNumCompactionTrigger(intValue); - assert(opt.levelZeroFileNumCompactionTrigger() == intValue); - } - - { // LevelSlowdownWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroSlowdownWritesTrigger(intValue); - assert(opt.levelZeroSlowdownWritesTrigger() == intValue); - } - - { // LevelStopWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroStopWritesTrigger(intValue); - assert(opt.levelZeroStopWritesTrigger() == intValue); - } - - { // MaxMemCompactionLevel test - int intValue = rand.nextInt(); - opt.setMaxMemCompactionLevel(intValue); - assert(opt.maxMemCompactionLevel() == intValue); - } - - { // TargetFileSizeBase test - long longValue = rand.nextLong(); - opt.setTargetFileSizeBase(longValue); - assert(opt.targetFileSizeBase() == longValue); - } - - { // TargetFileSizeMultiplier test - int intValue = rand.nextInt(); - opt.setTargetFileSizeMultiplier(intValue); - assert(opt.targetFileSizeMultiplier() == intValue); - } - - { // MaxBytesForLevelBase test - long longValue = rand.nextLong(); - 
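
Unlike the other tests in this series, OptionsTest checks its setter/getter round-trips with the plain Java assert statement rather than AssertJ's assertThat. Plain asserts are evaluated only when the JVM is started with assertions enabled (-ea); without that flag every one of these checks passes vacuously. The sketch below contrasts the two styles using one of the round-trips from this test; the class name and the 64 MB sample value are illustrative, and the AssertJ import matches the style used by the surrounding tests.

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import static org.assertj.core.api.Assertions.assertThat;

public class AssertVsAssertThatSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    Options opt = null;
    try {
      opt = new Options();
      long longValue = 64L << 20; // arbitrary sample value (64 MB)
      opt.setWriteBufferSize(longValue);
      // Evaluated only when the JVM runs with -ea; otherwise skipped silently.
      assert (opt.writeBufferSize() == longValue);
      // Always enforced, regardless of -ea.
      assertThat(opt.writeBufferSize()).isEqualTo(longValue);
    } finally {
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}
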
opt.setMaxBytesForLevelBase(longValue); - assert(opt.maxBytesForLevelBase() == longValue); - } - - { // MaxBytesForLevelMultiplier test - int intValue = rand.nextInt(); - opt.setMaxBytesForLevelMultiplier(intValue); - assert(opt.maxBytesForLevelMultiplier() == intValue); - } - - { // ExpandedCompactionFactor test - int intValue = rand.nextInt(); - opt.setExpandedCompactionFactor(intValue); - assert(opt.expandedCompactionFactor() == intValue); - } - - { // SourceCompactionFactor test - int intValue = rand.nextInt(); - opt.setSourceCompactionFactor(intValue); - assert(opt.sourceCompactionFactor() == intValue); - } - - { // MaxGrandparentOverlapFactor test - int intValue = rand.nextInt(); - opt.setMaxGrandparentOverlapFactor(intValue); - assert(opt.maxGrandparentOverlapFactor() == intValue); - } - - { // SoftRateLimit test - double doubleValue = rand.nextDouble(); - opt.setSoftRateLimit(doubleValue); - assert(opt.softRateLimit() == doubleValue); - } - - { // HardRateLimit test - double doubleValue = rand.nextDouble(); - opt.setHardRateLimit(doubleValue); - assert(opt.hardRateLimit() == doubleValue); - } - - { // RateLimitDelayMaxMilliseconds test - int intValue = rand.nextInt(); - opt.setRateLimitDelayMaxMilliseconds(intValue); - assert(opt.rateLimitDelayMaxMilliseconds() == intValue); - } - - { // ArenaBlockSize test - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert(opt.arenaBlockSize() == longValue); - } - - { // DisableAutoCompactions test - boolean boolValue = rand.nextBoolean(); - opt.setDisableAutoCompactions(boolValue); - assert(opt.disableAutoCompactions() == boolValue); - } - - { // PurgeRedundantKvsWhileFlush test - boolean boolValue = rand.nextBoolean(); - opt.setPurgeRedundantKvsWhileFlush(boolValue); - assert(opt.purgeRedundantKvsWhileFlush() == boolValue); - } - - { // VerifyChecksumsInCompaction test - boolean boolValue = rand.nextBoolean(); - opt.setVerifyChecksumsInCompaction(boolValue); - assert(opt.verifyChecksumsInCompaction() == boolValue); - } - - { // FilterDeletes test - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - assert(opt.filterDeletes() == boolValue); - } - - { // MaxSequentialSkipInIterations test - long longValue = rand.nextLong(); - opt.setMaxSequentialSkipInIterations(longValue); - assert(opt.maxSequentialSkipInIterations() == longValue); - } - - { // InplaceUpdateSupport test - boolean boolValue = rand.nextBoolean(); - opt.setInplaceUpdateSupport(boolValue); - assert(opt.inplaceUpdateSupport() == boolValue); - } - - { // InplaceUpdateNumLocks test - long longValue = rand.nextLong(); - opt.setInplaceUpdateNumLocks(longValue); - assert(opt.inplaceUpdateNumLocks() == longValue); - } - - { // MemtablePrefixBloomBits test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assert(opt.memtablePrefixBloomBits() == intValue); - } - - { // MemtablePrefixBloomProbes test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomProbes(intValue); - assert(opt.memtablePrefixBloomProbes() == intValue); - } - - { // BloomLocality test - int intValue = rand.nextInt(); - opt.setBloomLocality(intValue); - assert(opt.bloomLocality() == intValue); - } - - { // MaxSuccessiveMerges test - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert(opt.maxSuccessiveMerges() == longValue); + Options opt = null; + try { + opt = new Options(); + Random rand = PlatformRandomHelper. 
+ getPlatformSpecificRandomFactory(); + DBOptionsTest.testDBOptions(opt); + + { // WriteBufferSize test + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assert (opt.writeBufferSize() == longValue); + } + + { // MaxWriteBufferNumber test + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assert (opt.maxWriteBufferNumber() == intValue); + } + + { // MinWriteBufferNumberToMerge test + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assert (opt.minWriteBufferNumberToMerge() == intValue); + } + + { // NumLevels test + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assert (opt.numLevels() == intValue); + } + + { // LevelFileNumCompactionTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assert (opt.levelZeroFileNumCompactionTrigger() == intValue); + } + + { // LevelSlowdownWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assert (opt.levelZeroSlowdownWritesTrigger() == intValue); + } + + { // LevelStopWritesTrigger test + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assert (opt.levelZeroStopWritesTrigger() == intValue); + } + + { // MaxMemCompactionLevel test + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assert (opt.maxMemCompactionLevel() == intValue); + } + + { // TargetFileSizeBase test + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assert (opt.targetFileSizeBase() == longValue); + } + + { // TargetFileSizeMultiplier test + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assert (opt.targetFileSizeMultiplier() == intValue); + } + + { // MaxBytesForLevelBase test + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assert (opt.maxBytesForLevelBase() == longValue); + } + + { // MaxBytesForLevelMultiplier test + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assert (opt.maxBytesForLevelMultiplier() == intValue); + } + + { // ExpandedCompactionFactor test + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assert (opt.expandedCompactionFactor() == intValue); + } + + { // SourceCompactionFactor test + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assert (opt.sourceCompactionFactor() == intValue); + } + + { // MaxGrandparentOverlapFactor test + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assert (opt.maxGrandparentOverlapFactor() == intValue); + } + + { // SoftRateLimit test + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assert (opt.softRateLimit() == doubleValue); + } + + { // HardRateLimit test + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assert (opt.hardRateLimit() == doubleValue); + } + + { // RateLimitDelayMaxMilliseconds test + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assert (opt.rateLimitDelayMaxMilliseconds() == intValue); + } + + { // ArenaBlockSize test + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assert (opt.arenaBlockSize() == longValue); + } + + { // DisableAutoCompactions test + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assert (opt.disableAutoCompactions() == boolValue); + } + + { // PurgeRedundantKvsWhileFlush test + boolean 
boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assert (opt.purgeRedundantKvsWhileFlush() == boolValue); + } + + { // VerifyChecksumsInCompaction test + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assert (opt.verifyChecksumsInCompaction() == boolValue); + } + + { // FilterDeletes test + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assert (opt.filterDeletes() == boolValue); + } + + { // MaxSequentialSkipInIterations test + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assert (opt.maxSequentialSkipInIterations() == longValue); + } + + { // InplaceUpdateSupport test + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assert (opt.inplaceUpdateSupport() == boolValue); + } + + { // InplaceUpdateNumLocks test + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assert (opt.inplaceUpdateNumLocks() == longValue); + } + + { // MemtablePrefixBloomBits test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assert (opt.memtablePrefixBloomBits() == intValue); + } + + { // MemtablePrefixBloomProbes test + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomProbes(intValue); + assert (opt.memtablePrefixBloomProbes() == intValue); + } + + { // BloomLocality test + int intValue = rand.nextInt(); + opt.setBloomLocality(intValue); + assert (opt.bloomLocality() == intValue); + } + + { // MaxSuccessiveMerges test + long longValue = rand.nextLong(); + opt.setMaxSuccessiveMerges(longValue); + assert (opt.maxSuccessiveMerges() == longValue); + } + + { // MinPartialMergeOperands test + int intValue = rand.nextInt(); + opt.setMinPartialMergeOperands(intValue); + assert (opt.minPartialMergeOperands() == intValue); + } + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MinPartialMergeOperands test - int intValue = rand.nextInt(); - opt.setMinPartialMergeOperands(intValue); - assert(opt.minPartialMergeOperands() == intValue); + @Test + public void rocksEnv() { + Options options = null; + try { + options = new Options(); + RocksEnv rocksEnv = RocksEnv.getDefault(); + options.setEnv(rocksEnv); + assertThat(options.getEnv()).isSameAs(rocksEnv); + } finally { + if (options != null) { + options.dispose(); + } } - - opt.dispose(); } @Test public void linkageOfPrepMethods() { - Options options = new Options(); - options.optimizeUniversalStyleCompaction(); - options.optimizeUniversalStyleCompaction(4000); - options.optimizeLevelStyleCompaction(); - options.optimizeLevelStyleCompaction(3000); - options.optimizeForPointLookup(10); - options.prepareForBulkLoad(); + Options options = null; + try { + options = new Options(); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(4000); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(3000); + options.optimizeForPointLookup(10); + options.prepareForBulkLoad(); + } finally { + if (options != null) { + options.dispose(); + } + } } @Test public void compressionTypes() { - Options options = new Options(); - for(CompressionType compressionType : - CompressionType.values()) { - options.setCompressionType(compressionType); - assertThat(options.compressionType()). 
- isEqualTo(compressionType); + Options options = null; + try { + options = new Options(); + for (CompressionType compressionType : + CompressionType.values()) { + options.setCompressionType(compressionType); + assertThat(options.compressionType()). + isEqualTo(compressionType); + } + } finally { + if (options != null) { + options.dispose(); + } } - options.dispose(); } @Test public void compactionStyles() { - Options options = new Options(); - for (CompactionStyle compactionStyle : - CompactionStyle.values()) { - options.setCompactionStyle(compactionStyle); - assertThat(options.compactionStyle()). - isEqualTo(compactionStyle); + Options options = null; + try { + options = new Options(); + for (CompactionStyle compactionStyle : + CompactionStyle.values()) { + options.setCompactionStyle(compactionStyle); + assertThat(options.compactionStyle()). + isEqualTo(compactionStyle); + } + } finally { + if (options != null) { + options.dispose(); + } } - options.dispose(); } @Test public void rateLimiterConfig() { - Options options = new Options(); - RateLimiterConfig rateLimiterConfig = - new GenericRateLimiterConfig(1000, 0, 1); - options.setRateLimiterConfig(rateLimiterConfig); - options.dispose(); - // Test with parameter initialization - Options anotherOptions = new Options(); - anotherOptions.setRateLimiterConfig( - new GenericRateLimiterConfig(1000)); - anotherOptions.dispose(); + Options options = null; + Options anotherOptions = null; + RateLimiterConfig rateLimiterConfig; + try { + options = new Options(); + rateLimiterConfig = new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + // Test with parameter initialization + anotherOptions = new Options(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } } @Test public void shouldSetTestPrefixExtractor() { - Options options = new Options(); - options.useFixedLengthPrefixExtractor(100); - options.useFixedLengthPrefixExtractor(10); - options.dispose(); + Options options = null; + try { + options = new Options(); + options.useFixedLengthPrefixExtractor(100); + options.useFixedLengthPrefixExtractor(10); + } finally { + if (options != null) { + options.dispose(); + } + } } @Test public void shouldTestMemTableFactoryName() throws RocksDBException { - Options options = new Options(); - options.setMemTableConfig(new VectorMemTableConfig()); - assertThat(options.memTableFactoryName()). - isEqualTo("VectorRepFactory"); - options.setMemTableConfig( - new HashLinkedListMemTableConfig()); - assertThat(options.memTableFactoryName()). - isEqualTo("HashLinkedListRepFactory"); - options.dispose(); + Options options = null; + try { + options = new Options(); + options.setMemTableConfig(new VectorMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("VectorRepFactory"); + options.setMemTableConfig( + new HashLinkedListMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("HashLinkedListRepFactory"); + } finally { + if (options != null) { + options.dispose(); + } + } } @Test public void statistics() { - Options options = new Options(); - Statistics statistics = options.createStatistics(). 
- statisticsPtr(); - assertThat(statistics).isNotNull(); - - Options anotherOptions = new Options(); - statistics = anotherOptions.statisticsPtr(); - assertThat(statistics).isNotNull(); + Options options = null; + Options anotherOptions = null; + try { + options = new Options(); + Statistics statistics = options.createStatistics(). + statisticsPtr(); + assertThat(statistics).isNotNull(); + anotherOptions = new Options(); + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java index 1151f93dc..bf6bb5eb5 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/org/rocksdb/test/ReadOnlyTest.java @@ -26,203 +26,308 @@ public class ReadOnlyTest { @Test public void readOnlyOpen() throws RocksDBException { - RocksDB db, db2, db3; + RocksDB db = null; + RocksDB db2 = null; + RocksDB db3 = null; + Options options = null; List columnFamilyHandleList = new ArrayList<>(); List readOnlyColumnFamilyHandleList = new ArrayList<>(); List readOnlyColumnFamilyHandleList2 = new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.put("key".getBytes(), "value".getBytes()); - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath()); - assertThat("value"). - isEqualTo(new String(db2.get("key".getBytes()))); - db.close(); - db2.close(); - - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - - db = RocksDB.open( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - "value2".getBytes()); - - db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())). - isNull(); - - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - cfDescriptors.add( - new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions())); - db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList2); - assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - db.close(); - db2.close(); - db3.close(); - options.dispose(); + try { + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath()); + assertThat("value"). 
+ isEqualTo(new String(db2.get("key".getBytes()))); + db.close(); + db2.close(); + + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + + db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), + "value2".getBytes()); + + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + assertThat(db2.get("key2".getBytes())).isNull(); + assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())). + isNull(); + cfDescriptors.clear(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + cfDescriptors.add( + new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions())); + db3 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList2); + assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), + "key2".getBytes()))).isEqualTo("value2"); + } finally { + if (db != null) { + db.close(); + } + if (db2 != null) { + db2.close(); + } + if (db3 != null) { + db3.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToCFWriteInReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - 
dbFolder.getRoot().getAbsolutePath()); - db.close(); - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), "value".getBytes()); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToRemoveInReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - rDb.remove("key".getBytes()); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove("key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToCFRemoveInReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - rDb.remove(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes()); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + 
dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToWriteBatchReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - WriteBatch wb = new WriteBatch(); - wb.put("key".getBytes(), "value".getBytes()); - rDb.write(new WriteOptions(), wb); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + WriteBatch wb = new WriteBatch(); + wb.put("key".getBytes(), "value".getBytes()); + rDb.write(new WriteOptions(), wb); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } } @Test(expected = RocksDBException.class) public void failToCFWriteBatchReadOnly() throws RocksDBException { - List cfDescriptors = new ArrayList<>(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - new ColumnFamilyOptions())); - List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - - Options options = new Options(); - options.setCreateIfMissing(true); - - RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.close(); - - RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList); - - WriteBatch wb = new WriteBatch(); - wb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), "value".getBytes()); - rDb.write(new WriteOptions(), wb); + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + WriteBatch wb = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + wb = new WriteBatch(); + wb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + rDb.write(new WriteOptions(), wb); + } finally { + if (db != null) { + db.close(); + } + 
if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + if (wb != null) { + wb.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java index 5b58fe5e9..80ea765c5 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -25,32 +25,66 @@ public class ReadOptionsTest { public ExpectedException exception = ExpectedException.none(); @Test - public void readOptions() { - ReadOptions opt = new ReadOptions(); - Random rand = new Random(); - { // VerifyChecksums test + public void verifyChecksum(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksums(boolValue); assertThat(opt.verifyChecksums()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // FillCache test + @Test + public void fillCache(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); boolean boolValue = rand.nextBoolean(); opt.setFillCache(boolValue); assertThat(opt.fillCache()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // Tailing test + @Test + public void tailing(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); boolean boolValue = rand.nextBoolean(); opt.setTailing(boolValue); assertThat(opt.tailing()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // Snapshot null test + @Test + public void snapshot(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); opt.setSnapshot(null); assertThat(opt.snapshot()).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } } - opt.dispose(); } @Test diff --git a/java/org/rocksdb/test/RocksEnvTest.java b/java/org/rocksdb/test/RocksEnvTest.java new file mode 100644 index 000000000..f55e9042e --- /dev/null +++ b/java/org/rocksdb/test/RocksEnvTest.java @@ -0,0 +1,39 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.RocksEnv; + +import static org.assertj.core.api.Assertions.assertThat; + +public class RocksEnvTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void rocksEnv(){ + RocksEnv rocksEnv = RocksEnv.getDefault(); + rocksEnv.setBackgroundThreads(5); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL); + // default rocksenv will always return zero for compaction pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)). 
+ isEqualTo(0); + } +} diff --git a/java/org/rocksdb/test/RocksIteratorTest.java b/java/org/rocksdb/test/RocksIteratorTest.java index 961b7c789..448e8f397 100644 --- a/java/org/rocksdb/test/RocksIteratorTest.java +++ b/java/org/rocksdb/test/RocksIteratorTest.java @@ -26,61 +26,83 @@ public class RocksIteratorTest { @Test public void rocksIterator() throws RocksDBException { - RocksDB db; - Options options = new Options(); - options.setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.put("key1".getBytes(), "value1".getBytes()); - db.put("key2".getBytes(), "value2".getBytes()); + RocksDB db = null; + Options options = null; + RocksIterator iterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); - RocksIterator iterator = db.newIterator(); + iterator = db.newIterator(); - iterator.seekToFirst(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key1".getBytes()); - assertThat(iterator.value()).isEqualTo("value1".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - assertThat(iterator.value()).isEqualTo("value2".getBytes()); - iterator.next(); - assertThat(iterator.isValid()).isFalse(); - iterator.seekToLast(); - iterator.prev(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key1".getBytes()); - assertThat(iterator.value()).isEqualTo("value1".getBytes()); - iterator.seekToFirst(); - iterator.seekToLast(); - assertThat(iterator.isValid()).isTrue(); - assertThat(iterator.key()).isEqualTo("key2".getBytes()); - assertThat(iterator.value()).isEqualTo("value2".getBytes()); - iterator.status(); + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + iterator.seekToLast(); + iterator.prev(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.seekToFirst(); + iterator.seekToLast(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.status(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (iterator != null) { + iterator.dispose(); + } + } } @Test public void rocksIteratorGc() throws RocksDBException { - RocksDB db; - Options options = new Options(); - options.setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - db.put("key".getBytes(), "value".getBytes()); - RocksIterator iter = db.newIterator(); - RocksIterator iter2 = db.newIterator(); - RocksIterator iter3 = db.newIterator(); - iter = null; - db.close(); - db = null; - iter2 = null; - System.gc(); - 
System.runFinalization(); - iter3.dispose(); - System.gc(); - System.runFinalization(); + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + db.newIterator(); + db.newIterator(); + RocksIterator iter3 = db.newIterator(); + db.close(); + db = null; + System.gc(); + System.runFinalization(); + iter3.dispose(); + System.gc(); + System.runFinalization(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index 7140a1fcb..1b45c517e 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -27,58 +27,71 @@ public class SnapshotTest { @Test public void snapshots() throws RocksDBException { - RocksDB db; - Options options = new Options(); - options.setCreateIfMissing(true); + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + try { - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - db.put("key".getBytes(), "value".getBytes()); - // Get new Snapshot of database - Snapshot snapshot = db.getSnapshot(); - ReadOptions readOptions = new ReadOptions(); - // set snapshot in ReadOptions - readOptions.setSnapshot(snapshot); - // retrieve key value pair - assertThat(new String(db.get("key".getBytes()))). - isEqualTo("value"); - // retrieve key value pair created before - // the snapshot was made - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // add new key/value pair - db.put("newkey".getBytes(), "newvalue".getBytes()); - // using no snapshot the latest db entries - // will be taken into account - assertThat(new String(db.get("newkey".getBytes()))). - isEqualTo("newvalue"); - // snapshopot was created before newkey - assertThat(db.get(readOptions, "newkey".getBytes())). - isNull(); - // Retrieve snapshot from read options - Snapshot sameSnapshot = readOptions.snapshot(); - readOptions.setSnapshot(sameSnapshot); - // results must be the same with new Snapshot - // instance using the same native pointer - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // update key value pair to newvalue - db.put("key".getBytes(), "newvalue".getBytes()); - // read with previously created snapshot will - // read previous version of key value pair - assertThat(new String(db.get(readOptions, - "key".getBytes()))).isEqualTo("value"); - // read for newkey using the snapshot must be - // null - assertThat(db.get(readOptions, "newkey".getBytes())). - isNull(); - // setting null to snapshot in ReadOptions leads - // to no Snapshot being used. - readOptions.setSnapshot(null); - assertThat(new String(db.get(readOptions, - "newkey".getBytes()))).isEqualTo("newvalue"); - // release Snapshot - db.releaseSnapshot(snapshot); - // Close database - db.close(); + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + // retrieve key value pair + assertThat(new String(db.get("key".getBytes()))). 
+ isEqualTo("value"); + // retrieve key value pair created before + // the snapshot was made + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // add new key/value pair + db.put("newkey".getBytes(), "newvalue".getBytes()); + // using no snapshot the latest db entries + // will be taken into account + assertThat(new String(db.get("newkey".getBytes()))). + isEqualTo("newvalue"); + // snapshopot was created before newkey + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); + // Retrieve snapshot from read options + Snapshot sameSnapshot = readOptions.snapshot(); + readOptions.setSnapshot(sameSnapshot); + // results must be the same with new Snapshot + // instance using the same native pointer + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // update key value pair to newvalue + db.put("key".getBytes(), "newvalue".getBytes()); + // read with previously created snapshot will + // read previous version of key value pair + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // read for newkey using the snapshot must be + // null + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); + // setting null to snapshot in ReadOptions leads + // to no Snapshot being used. + readOptions.setSnapshot(null); + assertThat(new String(db.get(readOptions, + "newkey".getBytes()))).isEqualTo("newvalue"); + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + readOptions.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java index 57842af10..ba84857ba 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/org/rocksdb/test/StatisticsCollectorTest.java @@ -27,27 +27,35 @@ public class StatisticsCollectorTest { @Test public void statisticsCollector() throws InterruptedException, RocksDBException { - Options opt = new Options().createStatistics().setCreateIfMissing(true); - Statistics stats = opt.statisticsPtr(); - - RocksDB db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()); - - StatsCallbackMock callback = new StatsCallbackMock(); - StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); - - StatisticsCollector statsCollector = new StatisticsCollector( - Collections.singletonList(statsInput), 100); - statsCollector.start(); - - Thread.sleep(1000); - - assertThat(callback.tickerCallbackCount).isGreaterThan(0); - assertThat(callback.histCallbackCount).isGreaterThan(0); - - statsCollector.shutDown(1000); - - db.close(); - opt.dispose(); + Options opt = null; + RocksDB db = null; + try { + opt = new Options().createStatistics().setCreateIfMissing(true); + Statistics stats = opt.statisticsPtr(); + + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + + StatsCallbackMock callback = new StatsCallbackMock(); + StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); + + StatisticsCollector statsCollector = new StatisticsCollector( + Collections.singletonList(statsInput), 100); + statsCollector.start(); + + Thread.sleep(1000); + + assertThat(callback.tickerCallbackCount).isGreaterThan(0); + assertThat(callback.histCallbackCount).isGreaterThan(0); + + statsCollector.shutDown(1000); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } } diff --git 
a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc
index 41390c5bc..609cbd73e 100644
--- a/java/rocksjni/backupablejni.cc
+++ b/java/rocksjni/backupablejni.cc
@@ -124,7 +124,7 @@ void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions(
* Signature: (J)Ljava/lang/String;
*/
jstring Java_org_rocksdb_BackupableDBOptions_backupDir(
- JNIEnv* env, jobject jopt, jlong jhandle, jstring jpath) {
+ JNIEnv* env, jobject jopt, jlong jhandle) {
auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
return env->NewStringUTF(bopt->backup_dir.c_str());
}
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index 109930cdc..d725cc305 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -274,6 +274,17 @@ void Java_org_rocksdb_Options_setParanoidChecks(
static_cast<bool>(paranoid_checks);
}
+/*
+ * Class: org_rocksdb_Options
+ * Method: setEnv
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setEnv(
+ JNIEnv* env, jobject jobj, jlong jhandle, jlong jenv) {
+ reinterpret_cast<rocksdb::Options*>(jhandle)->env =
+ reinterpret_cast<rocksdb::Env*>(jenv);
+}
+
/*
* Class: org_rocksdb_Options
* Method: setMaxTotalWalSize
From 74057d6d2d9fd288c0bb454cad786055714c3b32 Mon Sep 17 00:00:00 2001
From: fyrz
Date: Fri, 7 Nov 2014 23:31:38 +0100
Subject: [PATCH 495/829] [RocksJava] Improved tests within RocksJava

---
 java/Makefile | 1 +
 java/org/rocksdb/RocksDB.java | 1 +
 java/org/rocksdb/RocksEnv.java | 3 +-
 .../test/BlockBasedTableConfigTest.java | 108 +++-
 java/org/rocksdb/test/DBOptionsTest.java | 379 ++++++++++++--
 java/org/rocksdb/test/EnvironmentTest.java | 20 +-
 java/org/rocksdb/test/OptionsTest.java | 488 +++++++++++++++++-
 .../rocksdb/test/PlainTableConfigTest.java | 54 +-
 java/org/rocksdb/test/ReadOptionsTest.java | 10 +-
 java/org/rocksdb/test/RocksDBTest.java | 282 ++++++++++
 java/org/rocksdb/test/SizeUnitTest.java | 28 +
 11 files changed, 1302 insertions(+), 72 deletions(-)
 create mode 100644 java/org/rocksdb/test/RocksDBTest.java
 create mode 100644 java/org/rocksdb/test/SizeUnitTest.java

diff --git a/java/Makefile b/java/Makefile
index 3c7b8bb1a..99664c6ef 100644
--- a/java/Makefile
+++ b/java/Makefile
@@ -66,6 +66,7 @@ JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\
org.rocksdb.test.PlainTableConfigTest\
org.rocksdb.test.ReadOnlyTest\
org.rocksdb.test.ReadOptionsTest\
+ org.rocksdb.test.RocksDBTest\
org.rocksdb.test.RocksEnvTest\
org.rocksdb.test.RocksIteratorTest\
org.rocksdb.test.SnapshotTest\
diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java
index 730c1940d..5fcfd2ff4 100644
--- a/java/org/rocksdb/RocksDB.java
+++ b/java/org/rocksdb/RocksDB.java
@@ -103,6 +103,7 @@ public class RocksDB extends RocksObject {
// This allows to use the rocksjni default Options instead of
// the c++ one.
Options options = new Options();
+ options.setCreateIfMissing(true);
return open(options, path);
}
diff --git a/java/org/rocksdb/RocksEnv.java b/java/org/rocksdb/RocksEnv.java
index 5bbf4fb3d..bb19eb732 100644
--- a/java/org/rocksdb/RocksEnv.java
+++ b/java/org/rocksdb/RocksEnv.java
@@ -18,6 +18,7 @@ public class RocksEnv extends RocksObject {
static {
default_env_ = new RocksEnv(getDefaultEnvInternal());
+
}
private static native long getDefaultEnvInternal();
@@ -101,7 +102,7 @@ public class RocksEnv extends RocksObject {
* {@link RocksObject} must implement to release their associated C++
* resource.
*/ - protected void disposeInternal() { + @Override protected void disposeInternal() { disposeInternal(nativeHandle_); } private native void disposeInternal(long handle); diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 143a3fa14..241429542 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -18,65 +18,145 @@ public class BlockBasedTableConfigTest { new RocksMemoryResource(); @Test - public void blockBasedTableConfig() { - BlockBasedTableConfig blockBasedTableConfig = - new BlockBasedTableConfig(); + public void noBlockCache() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setNoBlockCache(true); assertThat(blockBasedTableConfig.noBlockCache()).isTrue(); + } + + @Test + public void blockCacheSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockCacheSize(8 * 1024); assertThat(blockBasedTableConfig.blockCacheSize()). isEqualTo(8 * 1024); + } + + @Test + public void blockSizeDeviation() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockSizeDeviation(12); assertThat(blockBasedTableConfig.blockSizeDeviation()). isEqualTo(12); + } + + @Test + public void blockRestartInterval() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockRestartInterval(15); assertThat(blockBasedTableConfig.blockRestartInterval()). isEqualTo(15); + } + + @Test + public void wholeKeyFiltering() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setWholeKeyFiltering(false); assertThat(blockBasedTableConfig.wholeKeyFiltering()). isFalse(); + } + + @Test + public void cacheIndexAndFilterBlocks() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()). isTrue(); + + } + + @Test + public void hashIndexAllowCollision() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setHashIndexAllowCollision(false); assertThat(blockBasedTableConfig.hashIndexAllowCollision()). isFalse(); + } + + @Test + public void blockCacheCompressedSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockCacheCompressedSize(40); assertThat(blockBasedTableConfig.blockCacheCompressedSize()). 
isEqualTo(40); + } + + @Test + public void checksumType() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); assertThat(blockBasedTableConfig.checksumType().equals( ChecksumType.kxxHash)); + } + + @Test + public void indexType() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(IndexType.values().length).isEqualTo(2); blockBasedTableConfig.setIndexType(IndexType.kHashSearch); assertThat(blockBasedTableConfig.indexType().equals( IndexType.kHashSearch)); + assertThat(IndexType.valueOf("kBinarySearch")).isNotNull(); + blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch")); + assertThat(blockBasedTableConfig.indexType().equals( + IndexType.kBinarySearch)); + } + + @Test + public void blockCacheCompressedNumShardBits() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). isEqualTo(4); + } + + @Test + public void cacheNumShardBits() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setCacheNumShardBits(5); assertThat(blockBasedTableConfig.cacheNumShardBits()). isEqualTo(5); + } + + @Test + public void blockSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockSize(10); assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10); } + @Test public void blockBasedTableWithFilter() { - Options options = new Options(); - options.setTableFormatConfig( - new BlockBasedTableConfig().setFilter( - new BloomFilter(10))); - assertThat(options.tableFactoryName()). - isEqualTo("BlockBasedTable"); + Options options = null; + try { + options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter( + new BloomFilter(10))); + assertThat(options.tableFactoryName()). + isEqualTo("BlockBasedTable"); + } finally { + if (options != null) { + options.dispose(); + } + } } @Test public void blockBasedTableWithoutFilter() { - Options options = new Options(); - options.setTableFormatConfig( - new BlockBasedTableConfig().setFilter(null)); - assertThat(options.tableFactoryName()). - isEqualTo("BlockBasedTable"); + Options options = null; + try { + options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter(null)); + assertThat(options.tableFactoryName()). + isEqualTo("BlockBasedTable"); + } finally { + if (options != null) { + options.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/org/rocksdb/test/DBOptionsTest.java index 529a9b09b..9a15658e7 100644 --- a/java/org/rocksdb/test/DBOptionsTest.java +++ b/java/org/rocksdb/test/DBOptionsTest.java @@ -19,227 +19,508 @@ public class DBOptionsTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); - @Test - public void dbOptions() throws RocksDBException { - testDBOptions(new DBOptions()); - } + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); - static void testDBOptions(DBOptionsInterface opt) throws RocksDBException { - Random rand = PlatformRandomHelper. 
- getPlatformSpecificRandomFactory(); - { // CreateIfMissing test + @Test + public void createIfMissing() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setCreateIfMissing(boolValue); assertThat(opt.createIfMissing()). isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // CreateMissingColumnFamilies test + @Test + public void createMissingColumnFamilies() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setCreateMissingColumnFamilies(boolValue); assertThat(opt.createMissingColumnFamilies()). isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // ErrorIfExists test + @Test + public void errorIfExists() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setErrorIfExists(boolValue); assertThat(opt.errorIfExists()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // ParanoidChecks test + @Test + public void paranoidChecks() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setParanoidChecks(boolValue); assertThat(opt.paranoidChecks()). isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { - // MaxTotalWalSize test + @Test + public void maxTotalWalSize() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setMaxTotalWalSize(longValue); assertThat(opt.maxTotalWalSize()). isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxOpenFiles test + @Test + public void maxOpenFiles() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setMaxOpenFiles(intValue); assertThat(opt.maxOpenFiles()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // DisableDataSync test + @Test + public void disableDataSync() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setDisableDataSync(boolValue); assertThat(opt.disableDataSync()). isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // UseFsync test + @Test + public void useFsync() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setUseFsync(boolValue); assertThat(opt.useFsync()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // DbLogDir test + @Test + public void dbLogDir() { + DBOptions opt = null; + try { + opt = new DBOptions(); String str = "path/to/DbLogDir"; opt.setDbLogDir(str); assertThat(opt.dbLogDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // WalDir test + @Test + public void walDir() { + DBOptions opt = null; + try { + opt = new DBOptions(); String str = "path/to/WalDir"; opt.setWalDir(str); assertThat(opt.walDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // DeleteObsoleteFilesPeriodMicros test + @Test + public void deleteObsoleteFilesPeriodMicros() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setDeleteObsoleteFilesPeriodMicros(longValue); assertThat(opt.deleteObsoleteFilesPeriodMicros()). 
isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxBackgroundCompactions test + @Test + public void maxBackgroundCompactions() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setMaxBackgroundCompactions(intValue); assertThat(opt.maxBackgroundCompactions()). isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxBackgroundFlushes test + @Test + public void maxBackgroundFlushes() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setMaxBackgroundFlushes(intValue); assertThat(opt.maxBackgroundFlushes()). isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxLogFileSize test + @Test + public void maxLogFileSize() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setMaxLogFileSize(longValue); assertThat(opt.maxLogFileSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // LogFileTimeToRoll test + @Test + public void logFileTimeToRoll() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setLogFileTimeToRoll(longValue); assertThat(opt.logFileTimeToRoll()). isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // KeepLogFileNum test + @Test + public void keepLogFileNum() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setKeepLogFileNum(longValue); assertThat(opt.keepLogFileNum()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxManifestFileSize test + @Test + public void maxManifestFileSize() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setMaxManifestFileSize(longValue); assertThat(opt.maxManifestFileSize()). isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // TableCacheNumshardbits test + @Test + public void tableCacheNumshardbits() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setTableCacheNumshardbits(intValue); assertThat(opt.tableCacheNumshardbits()). isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // TableCacheRemoveScanCountLimit test + @Test + public void tableCacheRemoveScanCountLimit() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setTableCacheRemoveScanCountLimit(intValue); assertThat(opt.tableCacheRemoveScanCountLimit()). 
isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // WalSizeLimitMB test + @Test + public void walSizeLimitMB() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setWalSizeLimitMB(longValue); assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // WalTtlSeconds test + @Test + public void walTtlSeconds() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setWalTtlSeconds(longValue); assertThat(opt.walTtlSeconds()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // ManifestPreallocationSize test + @Test + public void manifestPreallocationSize() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setManifestPreallocationSize(longValue); assertThat(opt.manifestPreallocationSize()). isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // AllowOsBuffer test + @Test + public void allowOsBuffer() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setAllowOsBuffer(boolValue); assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // AllowMmapReads test + @Test + public void allowMmapReads() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setAllowMmapReads(boolValue); assertThat(opt.allowMmapReads()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // AllowMmapWrites test + @Test + public void allowMmapWrites() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setAllowMmapWrites(boolValue); assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // IsFdCloseOnExec test + @Test + public void isFdCloseOnExec() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setIsFdCloseOnExec(boolValue); assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // SkipLogErrorOnRecovery test + @Test + public void skipLogErrorOnRecovery() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setSkipLogErrorOnRecovery(boolValue); assertThat(opt.skipLogErrorOnRecovery()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // StatsDumpPeriodSec test + @Test + public void statsDumpPeriodSec() { + DBOptions opt = null; + try { + opt = new DBOptions(); int intValue = rand.nextInt(); opt.setStatsDumpPeriodSec(intValue); assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // AdviseRandomOnOpen test + @Test + public void adviseRandomOnOpen() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setAdviseRandomOnOpen(boolValue); assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // UseAdaptiveMutex test + @Test + public void useAdaptiveMutex() { + DBOptions opt = null; + try { + opt = new DBOptions(); boolean boolValue = rand.nextBoolean(); opt.setUseAdaptiveMutex(boolValue); 
assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // BytesPerSync test + @Test + public void bytesPerSync() { + DBOptions opt = null; + try { + opt = new DBOptions(); long longValue = rand.nextLong(); opt.setBytesPerSync(longValue); assertThat(opt.bytesPerSync()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } } @Test public void rateLimiterConfig() { - DBOptions options = new DBOptions(); - RateLimiterConfig rateLimiterConfig = - new GenericRateLimiterConfig(1000, 0, 1); - options.setRateLimiterConfig(rateLimiterConfig); - options.dispose(); - // Test with parameter initialization - DBOptions anotherOptions = new DBOptions(); - anotherOptions.setRateLimiterConfig( - new GenericRateLimiterConfig(1000)); - anotherOptions.dispose(); + DBOptions options = null; + DBOptions anotherOptions = null; + try { + options = new DBOptions(); + RateLimiterConfig rateLimiterConfig = + new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + // Test with parameter initialization + anotherOptions = new DBOptions(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } } @Test diff --git a/java/org/rocksdb/test/EnvironmentTest.java b/java/org/rocksdb/test/EnvironmentTest.java index c6542afed..b5af069da 100644 --- a/java/org/rocksdb/test/EnvironmentTest.java +++ b/java/org/rocksdb/test/EnvironmentTest.java @@ -1,3 +1,7 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb.test; import org.junit.Test; @@ -17,6 +21,7 @@ public class EnvironmentTest { @Test public void mac32() { setEnvironmentClassFields("mac", "32"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -28,6 +33,7 @@ public class EnvironmentTest { @Test public void mac64() { setEnvironmentClassFields("mac", "64"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -40,6 +46,7 @@ public class EnvironmentTest { public void nix32() { // Linux setEnvironmentClassFields("Linux", "32"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -48,6 +55,7 @@ public class EnvironmentTest { isEqualTo("librocksdbjni.so"); // UNIX setEnvironmentClassFields("Unix", "32"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -56,6 +64,7 @@ public class EnvironmentTest { isEqualTo("librocksdbjni.so"); // AIX setEnvironmentClassFields("aix", "32"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). 
@@ -67,6 +76,7 @@ public class EnvironmentTest { @Test public void nix64() { setEnvironmentClassFields("Linux", "x64"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -75,6 +85,7 @@ public class EnvironmentTest { isEqualTo("librocksdbjni.so"); // UNIX setEnvironmentClassFields("Unix", "x64"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -83,6 +94,7 @@ public class EnvironmentTest { isEqualTo("librocksdbjni.so"); // AIX setEnvironmentClassFields("aix", "x64"); + assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); assertThat(Environment.getJniLibraryName("rocksdb")). @@ -91,8 +103,14 @@ public class EnvironmentTest { isEqualTo("librocksdbjni.so"); } + @Test + public void detectWindows(){ + setEnvironmentClassFields("win", "x64"); + assertThat(Environment.isWindows()).isTrue(); + } + @Test(expected = UnsupportedOperationException.class) - public void failLinuxJniLibraryName(){ + public void failWinJniLibraryName(){ setEnvironmentClassFields("win", "x64"); Environment.getJniLibraryName("rocksdb"); } diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index 24dd5081c..a7241e822 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -19,14 +19,14 @@ public class OptionsTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + @Test public void options() throws RocksDBException { Options opt = null; try { opt = new Options(); - Random rand = PlatformRandomHelper. - getPlatformSpecificRandomFactory(); - DBOptionsTest.testDBOptions(opt); { // WriteBufferSize test long longValue = rand.nextLong(); @@ -220,6 +220,484 @@ public class OptionsTest { } } + @Test + public void createIfMissing() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateIfMissing(boolValue); + assertThat(opt.createIfMissing()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void createMissingColumnFamilies() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateMissingColumnFamilies(boolValue); + assertThat(opt.createMissingColumnFamilies()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void errorIfExists() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setErrorIfExists(boolValue); + assertThat(opt.errorIfExists()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void paranoidChecks() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setParanoidChecks(boolValue); + assertThat(opt.paranoidChecks()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxTotalWalSize() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxTotalWalSize(longValue); + assertThat(opt.maxTotalWalSize()). 
+ isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxOpenFiles() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxOpenFiles(intValue); + assertThat(opt.maxOpenFiles()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void disableDataSync() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setDisableDataSync(boolValue); + assertThat(opt.disableDataSync()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useFsync() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setUseFsync(boolValue); + assertThat(opt.useFsync()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void dbLogDir() { + Options opt = null; + try { + opt = new Options(); + String str = "path/to/DbLogDir"; + opt.setDbLogDir(str); + assertThat(opt.dbLogDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walDir() { + Options opt = null; + try { + opt = new Options(); + String str = "path/to/WalDir"; + opt.setWalDir(str); + assertThat(opt.walDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void deleteObsoleteFilesPeriodMicros() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setDeleteObsoleteFilesPeriodMicros(longValue); + assertThat(opt.deleteObsoleteFilesPeriodMicros()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundCompactions() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundCompactions(intValue); + assertThat(opt.maxBackgroundCompactions()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundFlushes() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundFlushes(intValue); + assertThat(opt.maxBackgroundFlushes()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxLogFileSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assertThat(opt.maxLogFileSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void logFileTimeToRoll() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assertThat(opt.logFileTimeToRoll()). 
+ isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void keepLogFileNum() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assertThat(opt.keepLogFileNum()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxManifestFileSize() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxManifestFileSize(longValue); + assertThat(opt.maxManifestFileSize()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheNumshardbits() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTableCacheNumshardbits(intValue); + assertThat(opt.tableCacheNumshardbits()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheRemoveScanCountLimit() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTableCacheRemoveScanCountLimit(intValue); + assertThat(opt.tableCacheRemoveScanCountLimit()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walSizeLimitMB() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setWalSizeLimitMB(longValue); + assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walTtlSeconds() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setWalTtlSeconds(longValue); + assertThat(opt.walTtlSeconds()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void manifestPreallocationSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assertThat(opt.manifestPreallocationSize()). 
+ isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowOsBuffer() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowOsBuffer(boolValue); + assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapReads() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapReads(boolValue); + assertThat(opt.allowMmapReads()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapWrites() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapWrites(boolValue); + assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void isFdCloseOnExec() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setIsFdCloseOnExec(boolValue); + assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void skipLogErrorOnRecovery() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setSkipLogErrorOnRecovery(boolValue); + assertThat(opt.skipLogErrorOnRecovery()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void statsDumpPeriodSec() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setStatsDumpPeriodSec(intValue); + assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void adviseRandomOnOpen() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAdviseRandomOnOpen(boolValue); + assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useAdaptiveMutex() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setUseAdaptiveMutex(boolValue); + assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void bytesPerSync() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setBytesPerSync(longValue); + assertThat(opt.bytesPerSync()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + @Test public void rocksEnv() { Options options = null; @@ -263,6 +741,8 @@ public class OptionsTest { options.setCompressionType(compressionType); assertThat(options.compressionType()). isEqualTo(compressionType); + assertThat(CompressionType.valueOf("NO_COMPRESSION")). + isEqualTo(CompressionType.NO_COMPRESSION); } } finally { if (options != null) { @@ -281,6 +761,8 @@ public class OptionsTest { options.setCompactionStyle(compactionStyle); assertThat(options.compactionStyle()). isEqualTo(compactionStyle); + assertThat(CompactionStyle.valueOf("FIFO")). 
+ isEqualTo(CompactionStyle.FIFO); } } finally { if (options != null) { diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index abd2cda12..72347e7d4 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -8,6 +8,7 @@ package org.rocksdb.test; import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.EncodingType; +import org.rocksdb.Options; import org.rocksdb.PlainTableConfig; import static org.assertj.core.api.Assertions.assertThat; @@ -19,30 +20,79 @@ public class PlainTableConfigTest { new RocksMemoryResource(); @Test - public void plainTableConfig() { + public void keySize() { PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setKeySize(5); assertThat(plainTableConfig.keySize()). isEqualTo(5); + } + + @Test + public void bloomBitsPerKey() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setBloomBitsPerKey(11); assertThat(plainTableConfig.bloomBitsPerKey()). isEqualTo(11); + } + + @Test + public void hashTableRatio() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setHashTableRatio(0.95); assertThat(plainTableConfig.hashTableRatio()). isEqualTo(0.95); + } + + @Test + public void indexSparseness() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setIndexSparseness(18); assertThat(plainTableConfig.indexSparseness()). isEqualTo(18); + } + + @Test + public void hugePageTlbSize() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setHugePageTlbSize(1); assertThat(plainTableConfig.hugePageTlbSize()). isEqualTo(1); + } + + @Test + public void encodingType() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setEncodingType(EncodingType.kPrefix); assertThat(plainTableConfig.encodingType()).isEqualTo( EncodingType.kPrefix); + } + + @Test + public void fullScanMode() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setFullScanMode(true); - assertThat(plainTableConfig.fullScanMode()).isTrue(); + assertThat(plainTableConfig.fullScanMode()).isTrue(); } + + @Test + public void storeIndexInFile() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setStoreIndexInFile(true); assertThat(plainTableConfig.storeIndexInFile()). 
isTrue(); } + + @Test + public void plainTableConfig() { + Options opt = null; + try { + opt = new Options(); + PlainTableConfig plainTableConfig = new PlainTableConfig(); + opt.setTableFormatConfig(plainTableConfig); + assertThat(opt.tableFactoryName()).isEqualTo("PlainTable"); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } } diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java index 80ea765c5..2cf1584a1 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/org/rocksdb/test/ReadOptionsTest.java @@ -77,7 +77,6 @@ public class ReadOptionsTest { ReadOptions opt = null; try { opt = new ReadOptions(); - Random rand = new Random(); opt.setSnapshot(null); assertThat(opt.snapshot()).isNull(); } finally { @@ -88,12 +87,19 @@ public class ReadOptionsTest { } @Test - public void failVerifyChecksumUninitialized(){ + public void failSetVerifyChecksumUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( exception); readOptions.setVerifyChecksums(true); } + @Test + public void failVerifyChecksumUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.verifyChecksums(); + } + @Test public void failSetFillCacheUninitialized(){ ReadOptions readOptions = setupUninitializedReadOptions( diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java new file mode 100644 index 000000000..4f51e8b97 --- /dev/null +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -0,0 +1,282 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; + +public class RocksDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void open() throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.close(); + opt = new Options(); + opt.setCreateIfMissing(true); + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void put() throws RocksDBException { + RocksDB db = null; + WriteOptions opt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + opt = new WriteOptions(); + db.put(opt, "key2".getBytes(), "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "12345678".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void write() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteBatch wb1 = null; + WriteBatch wb2 = null; + WriteOptions opts = null; + try { + options = new Options(). 
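// A minimal sketch of wiring a PlainTableConfig into Options, as exercised by the
// new plainTableConfig() test above: after setTableFormatConfig() the Options
// object reports "PlainTable" as its table factory name. Class name and main()
// wrapper are illustrative only.
import org.rocksdb.*;

public class PlainTableSketch {
  public static void main(String[] args) {
    RocksDB.loadLibrary();
    Options opt = null;
    try {
      opt = new Options();
      PlainTableConfig config = new PlainTableConfig();
      config.setKeySize(5);  // fixed key length, as in the keySize() test
      opt.setTableFormatConfig(config);
      System.out.println(opt.tableFactoryName());  // expected: "PlainTable"
    } finally {
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}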
+ setMergeOperator(new StringAppendOperator()). + setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + opts = new WriteOptions(); + wb1 = new WriteBatch(); + wb1.put("key1".getBytes(), "aa".getBytes()); + wb1.merge("key1".getBytes(), "bb".getBytes()); + wb2 = new WriteBatch(); + wb2.put("key2".getBytes(), "xx".getBytes()); + wb2.merge("key2".getBytes(), "yy".getBytes()); + db.write(opts, wb1); + db.write(opts, wb2); + assertThat(db.get("key1".getBytes())).isEqualTo( + "aa,bb".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "xx,yy".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (wb1 != null) { + wb1.dispose(); + } + if (wb2 != null) { + wb2.dispose(); + } + if (options != null) { + options.dispose(); + } + if (opts != null) { + opts.dispose(); + } + } + } + + @Test + public void getWithOutValue() throws RocksDBException { + RocksDB db = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get("key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get("key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void getWithOutValueReadOptions() throws RocksDBException { + RocksDB db = null; + ReadOptions rOpt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + rOpt = new ReadOptions(); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get(rOpt, "keyNotFound".getBytes(), + outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(rOpt, "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get(rOpt, "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rOpt != null) { + rOpt.dispose(); + } + } + } + + @Test + public void multiGet() throws RocksDBException { + RocksDB db = null; + ReadOptions rOpt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + rOpt = new ReadOptions(); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + List lookupKeys = new ArrayList() {{ + add("key1".getBytes()); + add("key2".getBytes()); + }}; + Map results = db.multiGet(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). 
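// A minimal sketch of the get-with-output-buffer contract checked by
// getWithOutValue() above: RocksDB.NOT_FOUND is returned for a missing key, and a
// value longer than the supplied buffer is copied only partially. The database
// path below is a placeholder; the calls mirror the test.
import org.rocksdb.*;

public class GetWithBufferSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    RocksDB db = null;
    Options opt = null;
    try {
      opt = new Options().setCreateIfMissing(true);
      db = RocksDB.open(opt, "/tmp/rocksdbjni_get_sketch");  // placeholder path
      db.put("key2".getBytes(), "12345678".getBytes());
      byte[] outValue = new byte[5];
      int status = db.get("keyNotFound".getBytes(), outValue);
      System.out.println(status == RocksDB.NOT_FOUND);   // true: key absent
      status = db.get("key2".getBytes(), outValue);
      System.out.println(status != RocksDB.NOT_FOUND);   // true: key present
      System.out.println(new String(outValue));          // "12345" (truncated copy)
    } finally {
      if (db != null) {
        db.close();
      }
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}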
+ contains("value".getBytes(), "12345678".getBytes()); + // test same method with ReadOptions + results = db.multiGet(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes(), "12345678".getBytes()); + + // remove existing key + lookupKeys.remove("key2".getBytes()); + // add non existing key + lookupKeys.add("key3".getBytes()); + results = db.multiGet(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes()); + // test same call with readOptions + results = db.multiGet(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rOpt != null) { + rOpt.dispose(); + } + } + } + + @Test + public void merge() throws RocksDBException { + RocksDB db = null; + Options opt = null; + WriteOptions wOpt; + try { + opt = new Options(). + setCreateIfMissing(true). + setMergeOperator(new StringAppendOperator()); + wOpt = new WriteOptions(); + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + // merge key1 with another value portion + db.merge("key1".getBytes(), "value2".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value,value2".getBytes()); + // merge key1 with another value portion + db.merge(wOpt, "key1".getBytes(), "value3".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value,value2,value3".getBytes()); + // merge on non existent key shall insert the value + db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "xxxx".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void remove() throws RocksDBException { + RocksDB db = null; + WriteOptions wOpt; + try { + wOpt = new WriteOptions(); + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "12345678".getBytes()); + db.remove("key1".getBytes()); + db.remove(wOpt, "key2".getBytes()); + assertThat(db.get("key1".getBytes())).isNull(); + assertThat(db.get("key2".getBytes())).isNull(); + } finally { + if (db != null) { + db.close(); + } + } + } +} diff --git a/java/org/rocksdb/test/SizeUnitTest.java b/java/org/rocksdb/test/SizeUnitTest.java new file mode 100644 index 000000000..16f636267 --- /dev/null +++ b/java/org/rocksdb/test/SizeUnitTest.java @@ -0,0 +1,28 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
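// A minimal sketch of the merge() behaviour asserted above: with a
// StringAppendOperator configured, successive merge() calls on one key produce a
// comma-joined value, and merging into a non-existent key behaves like a put.
// The database path is a placeholder.
import org.rocksdb.*;

public class MergeSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    RocksDB db = null;
    Options opt = null;
    try {
      opt = new Options().
          setCreateIfMissing(true).
          setMergeOperator(new StringAppendOperator());
      db = RocksDB.open(opt, "/tmp/rocksdbjni_merge_sketch");  // placeholder path
      db.put("key1".getBytes(), "value".getBytes());
      db.merge("key1".getBytes(), "value2".getBytes());
      System.out.println(new String(db.get("key1".getBytes())));  // "value,value2"
      db.merge("key2".getBytes(), "xxxx".getBytes());              // insert via merge
      System.out.println(new String(db.get("key2".getBytes())));  // "xxxx"
    } finally {
      if (db != null) {
        db.close();
      }
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}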
+package org.rocksdb.test; + +import org.junit.Test; +import org.rocksdb.util.SizeUnit; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SizeUnitTest { + + public static final long COMPUTATION_UNIT = 1024L; + + @Test + public void sizeUnit() { + assertThat(SizeUnit.KB).isEqualTo(COMPUTATION_UNIT); + assertThat(SizeUnit.MB).isEqualTo( + SizeUnit.KB * COMPUTATION_UNIT); + assertThat(SizeUnit.GB).isEqualTo( + SizeUnit.MB * COMPUTATION_UNIT); + assertThat(SizeUnit.TB).isEqualTo( + SizeUnit.GB * COMPUTATION_UNIT); + assertThat(SizeUnit.PB).isEqualTo( + SizeUnit.TB * COMPUTATION_UNIT); + } +} From b6abab8b77608aa0dc0016ed02cfd09c7ccea55f Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 11 Nov 2014 08:49:00 +0100 Subject: [PATCH 496/829] [RocksJava] Merged & rebased to HEAD --- java/org/rocksdb/test/KeyMayExistTest.java | 50 +++---------------- .../rocksdb/test/WriteBatchHandlerTest.java | 15 ++++-- 2 files changed, 18 insertions(+), 47 deletions(-) diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index c0613bf53..64f15e68d 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -26,46 +26,22 @@ public class KeyMayExistTest { @Test public void keyMayExist() throws RocksDBException { -<<<<<<< HEAD - RocksDB db; - DBOptions options = new DBOptions(); - options.setCreateIfMissing(true) - .setCreateMissingColumnFamilies(true); - // open database using cf names - List cfDescriptors = - new ArrayList(); - List columnFamilyHandleList = - new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor("default")); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); - db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList); - assertThat(columnFamilyHandleList.size()). - isEqualTo(2); - db.put("key".getBytes(), "value".getBytes()); - // Test without column family - StringBuffer retValue = new StringBuffer(); - boolean exists = db.keyMayExist("key".getBytes(), retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); -======= RocksDB db = null; - Options options = null; + DBOptions options = null; try { - options = new Options(); + options = new DBOptions(); options.setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); // open database using cf names - List cfNames = new ArrayList<>(); + List cfDescriptors = + new ArrayList(); List columnFamilyHandleList = new ArrayList<>(); - cfNames.add("default"); - cfNames.add("new_cf"); + cfDescriptors.add(new ColumnFamilyDescriptor("default")); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), - cfNames, columnFamilyHandleList); + cfDescriptors, columnFamilyHandleList); assertThat(columnFamilyHandleList.size()). isEqualTo(2); db.put("key".getBytes(), "value".getBytes()); @@ -75,7 +51,6 @@ public class KeyMayExistTest { assertThat(exists).isTrue(); assertThat(retValue.toString()). isEqualTo("value"); ->>>>>>> [RocksJava] Integrated review comments from D28209 // Test without column family but with readOptions retValue = new StringBuffer(); @@ -93,16 +68,6 @@ public class KeyMayExistTest { assertThat(retValue.toString()). 
isEqualTo("value"); -<<<<<<< HEAD - // Test with column family and readOptions - retValue = new StringBuffer(); - exists = db.keyMayExist(new ReadOptions(), - columnFamilyHandleList.get(0), "key".getBytes(), - retValue); - assertThat(exists).isTrue(); - assertThat(retValue.toString()). - isEqualTo("value"); -======= // Test with column family and readOptions retValue = new StringBuffer(); exists = db.keyMayExist(new ReadOptions(), @@ -111,7 +76,6 @@ public class KeyMayExistTest { assertThat(exists).isTrue(); assertThat(retValue.toString()). isEqualTo("value"); ->>>>>>> [RocksJava] Integrated review comments from D28209 // KeyMayExist in CF1 must return false assertThat(db.keyMayExist(columnFamilyHandleList.get(1), diff --git a/java/org/rocksdb/test/WriteBatchHandlerTest.java b/java/org/rocksdb/test/WriteBatchHandlerTest.java index ccf9b164a..1debc2bda 100644 --- a/java/org/rocksdb/test/WriteBatchHandlerTest.java +++ b/java/org/rocksdb/test/WriteBatchHandlerTest.java @@ -13,13 +13,20 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.WriteOptions; + +import static org.assertj.core.api.Assertions.assertThat; + public class WriteBatchHandlerTest { - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); - public static void main(final String[] args) throws IOException, RocksDBException { + @Test + public void writeBatchHandler() throws IOException, RocksDBException { // setup test data final List>> testEvents = new ArrayList<>(); From cd82beb0cbee9d62820f2e5a12034dd5540b7702 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 11 Nov 2014 18:59:04 +0100 Subject: [PATCH 497/829] [RocksJava] Merged in latest changes. --- java/Makefile | 6 +++--- java/org/rocksdb/test/FlushTest.java | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/java/Makefile b/java/Makefile index 99664c6ef..0f4d42244 100644 --- a/java/Makefile +++ b/java/Makefile @@ -34,7 +34,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.WriteBatch.Handler\ org.rocksdb.test.WriteBatchInternal\ org.rocksdb.test.WriteBatchTest\ - org.rocksdb.WriteOptions\ + org.rocksdb.WriteOptions\ ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -92,8 +92,9 @@ clean: javadocs: mkdir -p javadoc; javadoc -d javadoc -sourcepath . 
-subpackages org -exclude org.rocksdb.test -java: javadocs +java: javadocs resolve_test_deps javac org/rocksdb/util/*.java org/rocksdb/*.java + javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java @cp ../HISTORY.md ./HISTORY-CPP.md @rm -f ./HISTORY-CPP.md javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) @@ -121,7 +122,6 @@ resolve_test_deps: test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar test: java resolve_test_deps - javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java java -ea -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java diff --git a/java/org/rocksdb/test/FlushTest.java b/java/org/rocksdb/test/FlushTest.java index 1742be67f..3e47668b7 100644 --- a/java/org/rocksdb/test/FlushTest.java +++ b/java/org/rocksdb/test/FlushTest.java @@ -4,16 +4,23 @@ // of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.rocksdb.*; public class FlushTest { - static final String db_path = "/tmp/rocksdbjni_flush_test"; - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); - public static void main(String[] args) { + @Test + public void flush() { RocksDB db = null; Options options = new Options(); WriteOptions wOpt = new WriteOptions(); @@ -26,7 +33,7 @@ public class FlushTest { options.setMinWriteBufferNumberToMerge(10); flushOptions.setWaitForFlush(true); wOpt.setDisableWAL(true); - db = RocksDB.open(options, db_path); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); db.put(wOpt, "key1".getBytes(), "value1".getBytes()); db.put(wOpt, "key2".getBytes(), "value2".getBytes()); From e46450da6d8fcff4121ec31bd03a11277b9b6d23 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 13 Nov 2014 22:27:58 +0100 Subject: [PATCH 498/829] [RocksJava] Rebased + integrated CF tests --- java/Makefile | 5 +- .../rocksdb/test/AbstractComparatorTest.java | 5 +- .../rocksdb/test/ColumnFamilyOptionsTest.java | 545 +++++++++++++--- java/org/rocksdb/test/ColumnFamilyTest.java | 610 +++++++++++++----- java/org/rocksdb/test/FlushTest.java | 119 ++-- java/org/rocksdb/test/KeyMayExistTest.java | 2 +- java/org/rocksdb/test/MergeTest.java | 5 +- java/org/rocksdb/test/MixedOptionsTest.java | 23 +- java/org/rocksdb/test/OptionsTest.java | 521 +++++++++++---- .../rocksdb/test/WriteBatchHandlerTest.java | 4 +- 10 files changed, 1375 insertions(+), 464 deletions(-) diff --git a/java/Makefile b/java/Makefile index 0f4d42244..b87fea16d 100644 --- a/java/Makefile +++ b/java/Makefile @@ -47,8 +47,7 @@ ifeq ($(PLATFORM), OS_MACOSX) ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar endif -JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\ - org.rocksdb.test.BackupableDBTest\ +JAVA_TESTS = org.rocksdb.test.BackupableDBTest\ org.rocksdb.test.BlockBasedTableConfigTest\ org.rocksdb.test.ColumnFamilyOptionsTest\ org.rocksdb.test.ColumnFamilyTest\ @@ -71,7 +70,7 @@ JAVA_TESTS = org.rocksdb.test.AbstractComparatorTest\ org.rocksdb.test.RocksIteratorTest\ org.rocksdb.test.SnapshotTest\ org.rocksdb.test.StatisticsCollectorTest\ - org.rocksdb.test.WirteBatchHandlerTest\ + 
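// A minimal sketch of the test skeleton these patches migrate towards: a JUnit
// @Test method instead of a main() entry point, a shared RocksMemoryResource
// @ClassRule to set up the native library, and a TemporaryFolder @Rule so each
// test gets its own database directory. It assumes the class lives in
// org.rocksdb.test next to RocksMemoryResource; names are illustrative.
package org.rocksdb.test;

import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rocksdb.*;

public class ExampleJUnitSketch {

  @ClassRule
  public static final RocksMemoryResource rocksMemoryResource =
      new RocksMemoryResource();

  @Rule
  public TemporaryFolder dbFolder = new TemporaryFolder();

  @Test
  public void openAndClose() throws RocksDBException {
    RocksDB db = null;
    Options opt = null;
    try {
      opt = new Options().setCreateIfMissing(true);
      db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath());
    } finally {
      if (db != null) {
        db.close();
      }
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}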
org.rocksdb.test.WriteBatchHandlerTest\ org.rocksdb.test.WriteBatchTest\ org.rocksdb.test.WriteOptionsTest\ diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index 339615b45..e3e2f8849 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -12,6 +12,7 @@ import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.Random; +import static org.assertj.core.api.Assertions.assertThat; import static org.rocksdb.test.Types.byteToInt; import static org.rocksdb.test.Types.intToByte; @@ -75,13 +76,13 @@ public abstract class AbstractComparatorTest { int count = 0; for (it.seekToFirst(); it.isValid(); it.next()) { final int thisKey = byteToInt(it.key()); - assert(thisKey > lastKey); + assertThat(thisKey).isGreaterThan(lastKey); lastKey = thisKey; count++; } db.close(); - assert(count == ITERATIONS); + assertThat(count).isEqualTo(ITERATIONS); } catch (final RocksDBException e) { System.err.format("[ERROR]: %s%n", e); diff --git a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java index 95289a301..7fcfee14c 100644 --- a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java +++ b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java @@ -5,225 +5,584 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.*; import java.util.Random; +import static org.assertj.core.api.Assertions.assertThat; + public class ColumnFamilyOptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void testCFOptions(ColumnFamilyOptionsInterface opt) { - Random rand = PlatformRandomHelper. - getPlatformSpecificRandomFactory(); - { // WriteBufferSize test - try { - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert(opt.writeBufferSize() == longValue); - } catch (RocksDBException e) { - assert(false); + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + public static final Random rand = PlatformRandomHelper. 
+ getPlatformSpecificRandomFactory(); + + @Test + public void writeBufferSize() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assertThat(opt.writeBufferSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } } + } - { // MaxWriteBufferNumber test + @Test + public void maxWriteBufferNumber() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMaxWriteBufferNumber(intValue); - assert(opt.maxWriteBufferNumber() == intValue); + assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MinWriteBufferNumberToMerge test + @Test + public void minWriteBufferNumberToMerge() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMinWriteBufferNumberToMerge(intValue); - assert(opt.minWriteBufferNumberToMerge() == intValue); + assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // NumLevels test + @Test + public void numLevels() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setNumLevels(intValue); - assert(opt.numLevels() == intValue); + assertThat(opt.numLevels()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // LevelFileNumCompactionTrigger test + @Test + public void levelZeroFileNumCompactionTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setLevelZeroFileNumCompactionTrigger(intValue); - assert(opt.levelZeroFileNumCompactionTrigger() == intValue); + assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // LevelSlowdownWritesTrigger test + @Test + public void levelZeroSlowdownWritesTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setLevelZeroSlowdownWritesTrigger(intValue); - assert(opt.levelZeroSlowdownWritesTrigger() == intValue); + assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // LevelStopWritesTrigger test + @Test + public void levelZeroStopWritesTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setLevelZeroStopWritesTrigger(intValue); - assert(opt.levelZeroStopWritesTrigger() == intValue); + assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxMemCompactionLevel test + @Test + public void maxMemCompactionLevel() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMaxMemCompactionLevel(intValue); - assert(opt.maxMemCompactionLevel() == intValue); + assertThat(opt.maxMemCompactionLevel()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // TargetFileSizeBase test + @Test + public void targetFileSizeBase() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); long longValue = rand.nextLong(); opt.setTargetFileSizeBase(longValue); - 
assert(opt.targetFileSizeBase() == longValue); + assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // TargetFileSizeMultiplier test + @Test + public void targetFileSizeMultiplier() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setTargetFileSizeMultiplier(intValue); - assert(opt.targetFileSizeMultiplier() == intValue); + assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxBytesForLevelBase test + @Test + public void maxBytesForLevelBase() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); long longValue = rand.nextLong(); opt.setMaxBytesForLevelBase(longValue); - assert(opt.maxBytesForLevelBase() == longValue); + assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxBytesForLevelMultiplier test + @Test + public void maxBytesForLevelMultiplier() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMaxBytesForLevelMultiplier(intValue); - assert(opt.maxBytesForLevelMultiplier() == intValue); + assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // ExpandedCompactionFactor test + @Test + public void expandedCompactionFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setExpandedCompactionFactor(intValue); - assert(opt.expandedCompactionFactor() == intValue); + assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // SourceCompactionFactor test + @Test + public void sourceCompactionFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setSourceCompactionFactor(intValue); - assert(opt.sourceCompactionFactor() == intValue); + assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxGrandparentOverlapFactor test + @Test + public void maxGrandparentOverlapFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMaxGrandparentOverlapFactor(intValue); - assert(opt.maxGrandparentOverlapFactor() == intValue); + assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // SoftRateLimit test + @Test + public void softRateLimit() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); double doubleValue = rand.nextDouble(); opt.setSoftRateLimit(doubleValue); - assert(opt.softRateLimit() == doubleValue); + assertThat(opt.softRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // HardRateLimit test + @Test + public void hardRateLimit() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); double doubleValue = rand.nextDouble(); opt.setHardRateLimit(doubleValue); - assert(opt.hardRateLimit() == doubleValue); + assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // RateLimitDelayMaxMilliseconds test + @Test + public void 
rateLimitDelayMaxMilliseconds() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setRateLimitDelayMaxMilliseconds(intValue); - assert(opt.rateLimitDelayMaxMilliseconds() == intValue); + assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // ArenaBlockSize test - try { - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert(opt.arenaBlockSize() == longValue); - } catch (RocksDBException e) { - assert(false); + @Test + public void arenaBlockSize() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assertThat(opt.arenaBlockSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } } + } - { // DisableAutoCompactions test + @Test + public void disableAutoCompactions() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); boolean boolValue = rand.nextBoolean(); opt.setDisableAutoCompactions(boolValue); - assert(opt.disableAutoCompactions() == boolValue); + assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // PurgeRedundantKvsWhileFlush test + @Test + public void purgeRedundantKvsWhileFlush() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); boolean boolValue = rand.nextBoolean(); opt.setPurgeRedundantKvsWhileFlush(boolValue); - assert(opt.purgeRedundantKvsWhileFlush() == boolValue); + assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // VerifyChecksumsInCompaction test + @Test + public void verifyChecksumsInCompaction() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); boolean boolValue = rand.nextBoolean(); opt.setVerifyChecksumsInCompaction(boolValue); - assert(opt.verifyChecksumsInCompaction() == boolValue); + assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // FilterDeletes test + @Test + public void filterDeletes() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); boolean boolValue = rand.nextBoolean(); opt.setFilterDeletes(boolValue); - assert(opt.filterDeletes() == boolValue); + assertThat(opt.filterDeletes()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxSequentialSkipInIterations test + @Test + public void maxSequentialSkipInIterations() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); long longValue = rand.nextLong(); opt.setMaxSequentialSkipInIterations(longValue); - assert(opt.maxSequentialSkipInIterations() == longValue); + assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // InplaceUpdateSupport test + @Test + public void inplaceUpdateSupport() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); boolean boolValue = rand.nextBoolean(); opt.setInplaceUpdateSupport(boolValue); - assert(opt.inplaceUpdateSupport() == boolValue); + assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // InplaceUpdateNumLocks test - try { - long longValue = rand.nextLong(); 
- opt.setInplaceUpdateNumLocks(longValue); - assert(opt.inplaceUpdateNumLocks() == longValue); - } catch (RocksDBException e) { - assert(false); + @Test + public void inplaceUpdateNumLocks() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } } + } - { // MemtablePrefixBloomBits test + @Test + public void memtablePrefixBloomBits() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); int intValue = rand.nextInt(); opt.setMemtablePrefixBloomBits(intValue); - assert(opt.memtablePrefixBloomBits() == intValue); + assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MemtablePrefixBloomProbes test + @Test + public void memtablePrefixBloomProbes() { + ColumnFamilyOptions opt = null; + try { int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); opt.setMemtablePrefixBloomProbes(intValue); - assert(opt.memtablePrefixBloomProbes() == intValue); + assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // BloomLocality test + @Test + public void bloomLocality() { + ColumnFamilyOptions opt = null; + try { int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); opt.setBloomLocality(intValue); - assert(opt.bloomLocality() == intValue); + assertThat(opt.bloomLocality()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } } + } - { // MaxSuccessiveMerges test - try { - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert(opt.maxSuccessiveMerges() == longValue); - } catch (RocksDBException e){ - assert(false); + @Test + public void maxSuccessiveMerges() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + long longValue = rand.nextLong(); + opt = new ColumnFamilyOptions(); + opt.setMaxSuccessiveMerges(longValue); + assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } } + } - { // MinPartialMergeOperands test + @Test + public void minPartialMergeOperands() { + ColumnFamilyOptions opt = null; + try { int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); opt.setMinPartialMergeOperands(intValue); - assert(opt.minPartialMergeOperands() == intValue); + assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memTable() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setMemTableConfig(new HashLinkedListMemTableConfig()); + assertThat(opt.memTableFactoryName()). 
+ isEqualTo("HashLinkedListRepFactory"); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void comparator() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void linkageOfPrepMethods() { + ColumnFamilyOptions options = null; + try { + options = new ColumnFamilyOptions(); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(4000); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(3000); + options.optimizeForPointLookup(10); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void shouldSetTestPrefixExtractor() { + ColumnFamilyOptions options = null; + try { + options = new ColumnFamilyOptions(); + options.useFixedLengthPrefixExtractor(100); + options.useFixedLengthPrefixExtractor(10); + } finally { + if (options != null) { + options.dispose(); + } } } - public static void main(String[] args) { - ColumnFamilyOptions opt = new ColumnFamilyOptions(); - testCFOptions(opt); - opt.dispose(); - System.out.println("Passed DBOptionsTest"); + @Test + public void compressionTypes() { + ColumnFamilyOptions ColumnFamilyOptions = null; + try { + ColumnFamilyOptions = new ColumnFamilyOptions(); + for (CompressionType compressionType : + CompressionType.values()) { + ColumnFamilyOptions.setCompressionType(compressionType); + assertThat(ColumnFamilyOptions.compressionType()). + isEqualTo(compressionType); + assertThat(CompressionType.valueOf("NO_COMPRESSION")). + isEqualTo(CompressionType.NO_COMPRESSION); + } + } finally { + if (ColumnFamilyOptions != null) { + ColumnFamilyOptions.dispose(); + } + } + } + + @Test + public void compactionStyles() { + ColumnFamilyOptions ColumnFamilyOptions = null; + try { + ColumnFamilyOptions = new ColumnFamilyOptions(); + for (CompactionStyle compactionStyle : + CompactionStyle.values()) { + ColumnFamilyOptions.setCompactionStyle(compactionStyle); + assertThat(ColumnFamilyOptions.compactionStyle()). + isEqualTo(compactionStyle); + assertThat(CompactionStyle.valueOf("FIFO")). 
+ isEqualTo(CompactionStyle.FIFO); + } + } finally { + if (ColumnFamilyOptions != null) { + ColumnFamilyOptions.dispose(); + } + } } } diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index fc5b4ba6e..4c2ac8536 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -16,6 +16,8 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.rocksdb.*; +import static org.assertj.core.api.Assertions.assertThat; + public class ColumnFamilyTest { @ClassRule @@ -26,71 +28,85 @@ public class ColumnFamilyTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void columnFamilies() { - String db_path = dbFolder.getRoot().getAbsolutePath(); + public void listColumnFamilies() throws RocksDBException { RocksDB db = null; - Options options = new Options(); - options.setCreateIfMissing(true); + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true); - DBOptions dbOptions = new DBOptions(); - dbOptions.setCreateIfMissing(true); + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); - try { - db = RocksDB.open(options, db_path); - } catch (RocksDBException e) { - assert(false); - } - // Test listColumnFamilies - List columnFamilyNames; - try { - columnFamilyNames = RocksDB.listColumnFamilies(options, db_path); - if (columnFamilyNames != null && columnFamilyNames.size() > 0) { - assert(columnFamilyNames.size() == 1); - assert(new String(columnFamilyNames.get(0)).equals("default")); - } else { - assert(false); - } - } catch (RocksDBException e) { - assert(false); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + // Test listColumnFamilies + List columnFamilyNames; + columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(columnFamilyNames).isNotNull(); + assertThat(columnFamilyNames.size()).isGreaterThan(0); + assertThat(columnFamilyNames.size()).isEqualTo(1); + assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } + } - // Test createColumnFamily + @Test + public void createColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; try { + options = new Options(); + options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); db.createColumnFamily(new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions())); - } catch (RocksDBException e) { - assert(false); - } - - if (db != null) { db.close(); + List columnFamilyNames; + columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(columnFamilyNames).isNotNull(); + assertThat(columnFamilyNames.size()).isGreaterThan(0); + assertThat(columnFamilyNames.size()).isEqualTo(2); + assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); + assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } + } - // Test listColumnFamilies after create "new_cf" + @Test + public void openWithColumnFamilies() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; try { - columnFamilyNames = RocksDB.listColumnFamilies(options, db_path); - 
if (columnFamilyNames != null && columnFamilyNames.size() > 0) { - assert(columnFamilyNames.size() == 2); - assert(new String(columnFamilyNames.get(0)).equals("default")); - assert(new String(columnFamilyNames.get(1)).equals("new_cf")); - } else { - assert(false); - } - } catch (RocksDBException e) { - assert(false); - } - - // Test open database with column family names - List cfNames = - new ArrayList<>(); - List columnFamilyHandleList = - new ArrayList<>(); - cfNames.add(new ColumnFamilyDescriptor("default")); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + // Test open database with column family names + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); - try { - db = RocksDB.open(dbOptions, db_path, cfNames, columnFamilyHandleList); - assert(columnFamilyHandleList.size() == 2); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + assertThat(columnFamilyHandleList.size()).isEqualTo(2); db.put("dfkey1".getBytes(), "dfvalue".getBytes()); db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); @@ -99,57 +115,118 @@ public class ColumnFamilyTest { String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); - assert(retVal.equals("newcfvalue")); - assert( (db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes())) == null); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), + "dfkey1".getBytes()))).isNull(); db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assert( (db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())) == null); - db.remove("dfkey2".getBytes()); - assert( (db.get(columnFamilyHandleList.get(0), - "dfkey2".getBytes())) == null); - } catch (RocksDBException e) { - assert(false); + assertThat((db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes()))).isNull(); + db.remove(columnFamilyHandleList.get(0), new WriteOptions(), + "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "dfkey2".getBytes())).isNull(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void getWithOutValueAndCf() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + // Test open database with column family names + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + 
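// A minimal sketch of the column-family open pattern used throughout
// ColumnFamilyTest above: pass a list of ColumnFamilyDescriptor objects and
// receive one ColumnFamilyHandle per descriptor in the output list, then use the
// handles for per-family reads and writes. The database path is a placeholder.
import java.util.ArrayList;
import java.util.List;
import org.rocksdb.*;

public class ColumnFamilyOpenSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    RocksDB db = null;
    DBOptions options = null;
    try {
      options = new DBOptions();
      options.setCreateIfMissing(true)
          .setCreateMissingColumnFamilies(true);
      List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
      List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
      cfDescriptors.add(new ColumnFamilyDescriptor("default"));
      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf"));
      db = RocksDB.open(options, "/tmp/rocksdbjni_cf_sketch",  // placeholder path
          cfDescriptors, cfHandles);
      // write to and read from the second ("new_cf") column family
      db.put(cfHandles.get(1), "cfkey".getBytes(), "cfvalue".getBytes());
      System.out.println(new String(db.get(cfHandles.get(1), "cfkey".getBytes())));
    } finally {
      if (db != null) {
        db.close();
      }
      if (options != null) {
        options.dispose();
      }
    }
  }
}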
assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } } + } - // Test create write to and drop ColumnFamily + @Test + public void createWriteDropColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; ColumnFamilyHandle tmpColumnFamilyHandle = null; try { + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); tmpColumnFamilyHandle = db.createColumnFamily( new ColumnFamilyDescriptor("tmpCF", new ColumnFamilyOptions())); db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); db.dropColumnFamily(tmpColumnFamilyHandle); tmpColumnFamilyHandle.dispose(); - } catch (Exception e) { - assert(false); - } - - // Put to disposed column family tmpColumnFamilyHandle must fail - try { - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); + } finally { + if (tmpColumnFamilyHandle != null) { + tmpColumnFamilyHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } } + } - // Remove to disposed column family tmpColumnFamilyHandle must fail + @Test + public void writeBatch() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; try { - db.remove(tmpColumnFamilyHandle, "key".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); - // Get on a disposed column family tmpColumnFamilyHandle must fail - try { - db.get(tmpColumnFamilyHandle, "key".getBytes()); - assert(false); - } catch (RocksDBException e) { - assert(true); - } + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); - // Test WriteBatch - try { WriteBatch writeBatch = new WriteBatch(); WriteOptions writeOpt = new WriteOptions(); writeBatch.put("key".getBytes(), "value".getBytes()); @@ -161,135 +238,324 @@ public class ColumnFamilyTest { writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes()); db.write(writeOpt, writeBatch); writeBatch.dispose(); - assert(db.get(columnFamilyHandleList.get(1), + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); - assert(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes())).equals("value")); - assert(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes())).equals("value2")); - assert(new String(db.get("key".getBytes())).equals("value")); - } catch (Exception e) { - e.printStackTrace(); - assert(false); + assertThat(new String(db.get(columnFamilyHandleList.get(1), 
+ "newcfkey".getBytes()))).isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey2".getBytes()))).isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } } + } - // Test iterator on column family + @Test + public void iteratorOnColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + RocksIterator rocksIterator = null; try { - RocksIterator rocksIterator = db.newIterator( + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + rocksIterator = db.newIterator( columnFamilyHandleList.get(1)); rocksIterator.seekToFirst(); - Map refMap = new HashMap(); + Map refMap = new HashMap<>(); refMap.put("newcfkey", "value"); refMap.put("newcfkey2", "value2"); int i = 0; - while(rocksIterator.isValid()) { + while (rocksIterator.isValid()) { i++; - refMap.get(new String(rocksIterator.key())).equals( - new String(rocksIterator.value())); + assertThat(refMap.get(new String(rocksIterator.key()))). + isEqualTo(new String(rocksIterator.value())); rocksIterator.next(); } - assert(i == 2); + assertThat(i).isEqualTo(2); rocksIterator.dispose(); - } catch(Exception e) { - assert(false); + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } + } - // Test property handling on column families + @Test + public void multiGet() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; try { - assert(db.getProperty("rocksdb.estimate-num-keys") != null); - assert(db.getProperty("rocksdb.stats") != null); - assert(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables") != null); - assert(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys") != null); - assert(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats") != null); - assert(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables") != null); - } catch(Exception e) { - assert(false); - } + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); - // MultiGet test - List cfCustomList = new ArrayList(); - try { - List keys = new ArrayList(); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + List keys = new ArrayList<>(); keys.add("key".getBytes()); keys.add("newcfkey".getBytes()); - Map retValues = 
db.multiGet(columnFamilyHandleList,keys); - assert(retValues.size() == 2); - assert(new String(retValues.get(keys.get(0))) - .equals("value")); - assert(new String(retValues.get(keys.get(1))) - .equals("value")); - - cfCustomList.add(columnFamilyHandleList.get(0)); - cfCustomList.add(columnFamilyHandleList.get(0)); - retValues = db.multiGet(cfCustomList, keys); - assert(retValues.size() == 1); - assert(new String(retValues.get(keys.get(0))) - .equals("value")); - } catch (RocksDBException e) { - assert(false); + Map retValues = db.multiGet(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + retValues = db.multiGet(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } + } + - // Test multiget without correct number of column - // families + @Test + public void properties() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; try { - List keys = new ArrayList(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - cfCustomList.remove(1); - db.multiGet(cfCustomList, keys); - assert(false); - } catch (RocksDBException e) { - assert(false); - } catch (IllegalArgumentException e) { - assert(true); + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + assertThat(db.getProperty("rocksdb.estimate-num-keys")). 
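// A minimal sketch of the getProperty() calls asserted above: properties such as
// "rocksdb.stats" and "rocksdb.estimate-num-keys" can be queried for the database
// as a whole, or per column family via the ColumnFamilyHandle overload shown in
// the test. Path and class name are placeholders.
import org.rocksdb.*;

public class PropertiesSketch {
  public static void main(String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    RocksDB db = null;
    Options opt = null;
    try {
      opt = new Options().setCreateIfMissing(true);
      db = RocksDB.open(opt, "/tmp/rocksdbjni_props_sketch");  // placeholder path
      System.out.println(db.getProperty("rocksdb.estimate-num-keys"));
      System.out.println(db.getProperty("rocksdb.stats"));
    } finally {
      if (db != null) {
        db.close();
      }
      if (opt != null) {
        opt.dispose();
      }
    }
  }
}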
+ isNotNull(); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), + "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.sstables")).isNotNull(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } + } + + @Test + public void iterators() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; try { - // iterate over default key/value pairs + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); List iterators = db.newIterators(columnFamilyHandleList); - assert(iterators.size() == 2); + assertThat(iterators.size()).isEqualTo(2); RocksIterator iter = iterators.get(0); iter.seekToFirst(); - Map defRefMap = new HashMap(); + Map defRefMap = new HashMap<>(); defRefMap.put("dfkey1", "dfvalue"); defRefMap.put("key", "value"); while (iter.isValid()) { - defRefMap.get(new String(iter.key())).equals( - new String(iter.value())); + assertThat(defRefMap.get(new String(iter.key()))). + isEqualTo(new String(iter.value())); iter.next(); } // iterate over new_cf key/value pairs - Map cfRefMap = new HashMap(); + Map cfRefMap = new HashMap<>(); cfRefMap.put("newcfkey", "value"); cfRefMap.put("newcfkey2", "value2"); iter = iterators.get(1); iter.seekToFirst(); while (iter.isValid()) { - cfRefMap.get(new String(iter.key())).equals( - new String(iter.value())); + assertThat(cfRefMap.get(new String(iter.key()))). 
+ isEqualTo(new String(iter.value())); iter.next(); } - // free iterators - for (RocksIterator iterator : iterators) { - iterator.dispose(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failPutDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failRemoveDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.remove(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failGetDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); } - assert(true); - } catch (RocksDBException e) { - assert(false); } + } + + @Test(expected = RocksDBException.class) + public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf")); - // free cf handles before database close - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.dispose(); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + List keys = new ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + List cfCustomList = new ArrayList<>(); + db.multiGet(cfCustomList, keys); + + } 
finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } } - // close database - db.close(); - // be sure to dispose c++ pointers - options.dispose(); } + } diff --git a/java/org/rocksdb/test/FlushTest.java b/java/org/rocksdb/test/FlushTest.java index 3e47668b7..9dea7e753 100644 --- a/java/org/rocksdb/test/FlushTest.java +++ b/java/org/rocksdb/test/FlushTest.java @@ -1,54 +1,65 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; - -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; - -public class FlushTest { - - @ClassRule - public static final RocksMemoryResource rocksMemoryResource = - new RocksMemoryResource(); - - @Rule - public TemporaryFolder dbFolder = new TemporaryFolder(); - - @Test - public void flush() { - RocksDB db = null; - Options options = new Options(); - WriteOptions wOpt = new WriteOptions(); - FlushOptions flushOptions = new FlushOptions(); - - try { - // Setup options - options.setCreateIfMissing(true); - options.setMaxWriteBufferNumber(10); - options.setMinWriteBufferNumberToMerge(10); - flushOptions.setWaitForFlush(true); - wOpt.setDisableWAL(true); - db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - - db.put(wOpt, "key1".getBytes(), "value1".getBytes()); - db.put(wOpt, "key2".getBytes(), "value2".getBytes()); - db.put(wOpt, "key3".getBytes(), "value3".getBytes()); - db.put(wOpt, "key4".getBytes(), "value4".getBytes()); - assert(db.getProperty("rocksdb.num-entries-active-mem-table").equals("4")); - db.flush(flushOptions); - assert(db.getProperty("rocksdb.num-entries-active-mem-table").equals("0")); - } catch (RocksDBException e) { - assert(false); - } - - db.close(); - options.dispose(); - wOpt.dispose(); - flushOptions.dispose(); - } -} +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FlushTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void flush() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteOptions wOpt = null; + FlushOptions flushOptions = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + options.setMaxWriteBufferNumber(10); + options.setMinWriteBufferNumberToMerge(10); + wOpt = new WriteOptions(); + flushOptions = new FlushOptions(); + flushOptions.setWaitForFlush(true); + wOpt.setDisableWAL(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")).isEqualTo("4"); + db.flush(flushOptions); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")). + isEqualTo("0"); + } finally { + if (flushOptions != null) { + flushOptions.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (wOpt != null) { + wOpt.dispose(); + } + + } + } +} diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index 64f15e68d..4fe45e4c0 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -34,7 +34,7 @@ public class KeyMayExistTest { .setCreateMissingColumnFamilies(true); // open database using cf names List cfDescriptors = - new ArrayList(); + new ArrayList<>(); List columnFamilyHandleList = new ArrayList<>(); cfDescriptors.add(new ColumnFamilyDescriptor("default")); diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 962674716..3ebd55975 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -177,11 +177,12 @@ public class MergeTest { // Test also with createColumnFamily columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2")); + new ColumnFamilyDescriptor("new_cf2", + new ColumnFamilyOptions().setMergeOperator(stringAppendOperator))); // writing xx under cfkey2 db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); // merge yy under cfkey2 - db.merge(columnFamilyHandle, "cfkey2".getBytes(), "yy".getBytes()); + db.merge(columnFamilyHandle, new WriteOptions(), "cfkey2".getBytes(), "yy".getBytes()); value = db.get(columnFamilyHandle, "cfkey2".getBytes()); String strValueTmpCf = new String(value); diff --git a/java/org/rocksdb/test/MixedOptionsTest.java b/java/org/rocksdb/test/MixedOptionsTest.java index edaa2c318..0f15e668c 100644 --- a/java/org/rocksdb/test/MixedOptionsTest.java +++ b/java/org/rocksdb/test/MixedOptionsTest.java @@ -5,26 +5,33 @@ package org.rocksdb.test; +import org.junit.ClassRule; +import org.junit.Test; import org.rocksdb.*; +import static org.assertj.core.api.Assertions.assertThat; + public class MixedOptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { + + 
@ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void mixedOptionsTest(){ // Set a table factory and check the names ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); cfOptions.setTableFormatConfig(new BlockBasedTableConfig(). setFilter(new BloomFilter())); - assert(cfOptions.tableFactoryName().equals( - "BlockBasedTable")); + assertThat(cfOptions.tableFactoryName()).isEqualTo( + "BlockBasedTable"); cfOptions.setTableFormatConfig(new PlainTableConfig()); - assert(cfOptions.tableFactoryName().equals("PlainTable")); + assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable"); // Initialize a dbOptions object from cf options and // db options DBOptions dbOptions = new DBOptions(); Options options = new Options(dbOptions, cfOptions); - assert(options.tableFactoryName().equals("PlainTable")); + assertThat(options.tableFactoryName()).isEqualTo("PlainTable"); // Free instances options.dispose(); options = null; diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index a7241e822..3425502d8 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/org/rocksdb/test/OptionsTest.java @@ -23,196 +23,463 @@ public class OptionsTest { getPlatformSpecificRandomFactory(); @Test - public void options() throws RocksDBException { + public void writeBufferSize() throws RocksDBException { Options opt = null; try { opt = new Options(); - - { // WriteBufferSize test - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert (opt.writeBufferSize() == longValue); + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assertThat(opt.writeBufferSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxWriteBufferNumber test - int intValue = rand.nextInt(); - opt.setMaxWriteBufferNumber(intValue); - assert (opt.maxWriteBufferNumber() == intValue); + @Test + public void maxWriteBufferNumber() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MinWriteBufferNumberToMerge test - int intValue = rand.nextInt(); - opt.setMinWriteBufferNumberToMerge(intValue); - assert (opt.minWriteBufferNumberToMerge() == intValue); + @Test + public void minWriteBufferNumberToMerge() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // NumLevels test - int intValue = rand.nextInt(); - opt.setNumLevels(intValue); - assert (opt.numLevels() == intValue); + @Test + public void numLevels() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assertThat(opt.numLevels()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // LevelFileNumCompactionTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroFileNumCompactionTrigger(intValue); - assert (opt.levelZeroFileNumCompactionTrigger() == intValue); + @Test + public void levelZeroFileNumCompactionTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + 
assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // LevelSlowdownWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroSlowdownWritesTrigger(intValue); - assert (opt.levelZeroSlowdownWritesTrigger() == intValue); + @Test + public void levelZeroSlowdownWritesTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // LevelStopWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroStopWritesTrigger(intValue); - assert (opt.levelZeroStopWritesTrigger() == intValue); + @Test + public void levelZeroStopWritesTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxMemCompactionLevel test - int intValue = rand.nextInt(); - opt.setMaxMemCompactionLevel(intValue); - assert (opt.maxMemCompactionLevel() == intValue); + @Test + public void maxMemCompactionLevel() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assertThat(opt.maxMemCompactionLevel()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // TargetFileSizeBase test - long longValue = rand.nextLong(); - opt.setTargetFileSizeBase(longValue); - assert (opt.targetFileSizeBase() == longValue); + @Test + public void targetFileSizeBase() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // TargetFileSizeMultiplier test - int intValue = rand.nextInt(); - opt.setTargetFileSizeMultiplier(intValue); - assert (opt.targetFileSizeMultiplier() == intValue); + @Test + public void targetFileSizeMultiplier() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxBytesForLevelBase test - long longValue = rand.nextLong(); - opt.setMaxBytesForLevelBase(longValue); - assert (opt.maxBytesForLevelBase() == longValue); + @Test + public void maxBytesForLevelBase() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxBytesForLevelMultiplier test - int intValue = rand.nextInt(); - opt.setMaxBytesForLevelMultiplier(intValue); - assert (opt.maxBytesForLevelMultiplier() == intValue); + @Test + public void maxBytesForLevelMultiplier() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // ExpandedCompactionFactor test - int intValue = 
rand.nextInt(); - opt.setExpandedCompactionFactor(intValue); - assert (opt.expandedCompactionFactor() == intValue); + @Test + public void expandedCompactionFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // SourceCompactionFactor test - int intValue = rand.nextInt(); - opt.setSourceCompactionFactor(intValue); - assert (opt.sourceCompactionFactor() == intValue); + @Test + public void sourceCompactionFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxGrandparentOverlapFactor test - int intValue = rand.nextInt(); - opt.setMaxGrandparentOverlapFactor(intValue); - assert (opt.maxGrandparentOverlapFactor() == intValue); + @Test + public void maxGrandparentOverlapFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // SoftRateLimit test - double doubleValue = rand.nextDouble(); - opt.setSoftRateLimit(doubleValue); - assert (opt.softRateLimit() == doubleValue); + @Test + public void softRateLimit() { + Options opt = null; + try { + opt = new Options(); + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assertThat(opt.softRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // HardRateLimit test - double doubleValue = rand.nextDouble(); - opt.setHardRateLimit(doubleValue); - assert (opt.hardRateLimit() == doubleValue); + @Test + public void hardRateLimit() { + Options opt = null; + try { + opt = new Options(); + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // RateLimitDelayMaxMilliseconds test - int intValue = rand.nextInt(); - opt.setRateLimitDelayMaxMilliseconds(intValue); - assert (opt.rateLimitDelayMaxMilliseconds() == intValue); + @Test + public void rateLimitDelayMaxMilliseconds() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // ArenaBlockSize test - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert (opt.arenaBlockSize() == longValue); + @Test + public void arenaBlockSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assertThat(opt.arenaBlockSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // DisableAutoCompactions test - boolean boolValue = rand.nextBoolean(); - opt.setDisableAutoCompactions(boolValue); - assert (opt.disableAutoCompactions() == boolValue); + @Test + public void disableAutoCompactions() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = 
rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // PurgeRedundantKvsWhileFlush test - boolean boolValue = rand.nextBoolean(); - opt.setPurgeRedundantKvsWhileFlush(boolValue); - assert (opt.purgeRedundantKvsWhileFlush() == boolValue); + @Test + public void purgeRedundantKvsWhileFlush() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // VerifyChecksumsInCompaction test - boolean boolValue = rand.nextBoolean(); - opt.setVerifyChecksumsInCompaction(boolValue); - assert (opt.verifyChecksumsInCompaction() == boolValue); + @Test + public void verifyChecksumsInCompaction() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // FilterDeletes test - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - assert (opt.filterDeletes() == boolValue); + @Test + public void filterDeletes() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assertThat(opt.filterDeletes()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxSequentialSkipInIterations test - long longValue = rand.nextLong(); - opt.setMaxSequentialSkipInIterations(longValue); - assert (opt.maxSequentialSkipInIterations() == longValue); + @Test + public void maxSequentialSkipInIterations() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // InplaceUpdateSupport test - boolean boolValue = rand.nextBoolean(); - opt.setInplaceUpdateSupport(boolValue); - assert (opt.inplaceUpdateSupport() == boolValue); + @Test + public void inplaceUpdateSupport() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // InplaceUpdateNumLocks test - long longValue = rand.nextLong(); - opt.setInplaceUpdateNumLocks(longValue); - assert (opt.inplaceUpdateNumLocks() == longValue); + @Test + public void inplaceUpdateNumLocks() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MemtablePrefixBloomBits test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assert (opt.memtablePrefixBloomBits() == intValue); + @Test + public void memtablePrefixBloomBits() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + 
assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MemtablePrefixBloomProbes test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomProbes(intValue); - assert (opt.memtablePrefixBloomProbes() == intValue); + @Test + public void memtablePrefixBloomProbes() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setMemtablePrefixBloomProbes(intValue); + assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // BloomLocality test - int intValue = rand.nextInt(); - opt.setBloomLocality(intValue); - assert (opt.bloomLocality() == intValue); + @Test + public void bloomLocality() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setBloomLocality(intValue); + assertThat(opt.bloomLocality()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MaxSuccessiveMerges test - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert (opt.maxSuccessiveMerges() == longValue); + @Test + public void maxSuccessiveMerges() throws RocksDBException { + Options opt = null; + try { + long longValue = rand.nextLong(); + opt = new Options(); + opt.setMaxSuccessiveMerges(longValue); + assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); } + } + } - { // MinPartialMergeOperands test - int intValue = rand.nextInt(); - opt.setMinPartialMergeOperands(intValue); - assert (opt.minPartialMergeOperands() == intValue); - } + @Test + public void minPartialMergeOperands() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setMinPartialMergeOperands(intValue); + assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); } finally { if (opt != null) { opt.dispose(); diff --git a/java/org/rocksdb/test/WriteBatchHandlerTest.java b/java/org/rocksdb/test/WriteBatchHandlerTest.java index 1debc2bda..5a330e409 100644 --- a/java/org/rocksdb/test/WriteBatchHandlerTest.java +++ b/java/org/rocksdb/test/WriteBatchHandlerTest.java @@ -75,10 +75,10 @@ public class WriteBatchHandlerTest { // compare the results to the test data final List>> actualEvents = handler.getEvents(); - assert(testEvents.size() == actualEvents.size()); + assertThat(testEvents.size()).isSameAs(actualEvents.size()); for(int i = 0; i < testEvents.size(); i++) { - assert(equals(testEvents.get(i), actualEvents.get(i))); + assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); } System.out.println("Passed WriteBatchHandler Test"); From 3f9c95a51998cd1bd75fdfb78e5d6074fe03b6e0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 14 Nov 2014 21:34:24 +0100 Subject: [PATCH 499/829] [RocksJava] Minor lint correction --- java/org/rocksdb/test/ColumnFamilyTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 4c2ac8536..0a77240ac 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -151,7 +151,8 @@ public class ColumnFamilyTest { cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); - db.put(columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), 
"value".getBytes()); + db.put(columnFamilyHandleList.get(0), new WriteOptions(), + "key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); byte[] outValue = new byte[5]; // not found value From 26dc5da96c05c37156a02ae905f4887a10993be2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 13:42:13 -0800 Subject: [PATCH 500/829] Fix compaction_job_test --- db/compaction_job_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index e0fffcf2e..75132fe00 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -24,6 +24,7 @@ class CompactionJobTest { CompactionJobTest() : env_(Env::Default()), dbname_(test::TmpDir() + "/compaction_job_test"), + mutable_cf_options_(Options(), ImmutableCFOptions(Options())), table_cache_(NewLRUCache(50000, 16, 8)), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_controller_)), @@ -37,7 +38,6 @@ class CompactionJobTest { cf_options_.table_factory = mock_table_factory_; column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); - mutable_cf_options_.RefreshDerivedOptions(ImmutableCFOptions(Options())); ASSERT_OK(versions_->Recover(column_families, false)); } @@ -57,7 +57,7 @@ class CompactionJobTest { SequenceNumber sequence_number = 0; for (int i = 0; i < 2; ++i) { mock::MockFileContents contents; - SequenceNumber smallest_seqno, largest_seqno; + SequenceNumber smallest_seqno = 0, largest_seqno = 0; InternalKey smallest, largest; for (int k = 0; k < kKeysPerFile; ++k) { auto key = std::to_string(i * (kKeysPerFile / 2) + k); From 07cd3c42a270f14a7116a17a155c1e2a222209b8 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 14 Nov 2014 23:40:20 +0100 Subject: [PATCH 501/829] [RocksJava] LogLevel support in Options It's now possible to set a LogLevel in Options and DBOptions to control LOG verbosity. 
--- java/Makefile | 1 + java/org/rocksdb/DBOptions.java | 17 ++++ java/org/rocksdb/DBOptionsInterface.java | 14 +++ java/org/rocksdb/InfoLogLevel.java | 44 +++++++++ java/org/rocksdb/Options.java | 18 ++++ java/org/rocksdb/test/InfoLogLevelTest.java | 98 +++++++++++++++++++++ java/rocksjni/options.cc | 44 +++++++++ 7 files changed, 236 insertions(+) create mode 100644 java/org/rocksdb/InfoLogLevel.java create mode 100644 java/org/rocksdb/test/InfoLogLevelTest.java diff --git a/java/Makefile b/java/Makefile index b87fea16d..f0ab4c12e 100644 --- a/java/Makefile +++ b/java/Makefile @@ -57,6 +57,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBTest\ org.rocksdb.test.DirectComparatorTest\ org.rocksdb.test.FilterTest\ org.rocksdb.test.FlushTest\ + org.rocksdb.test.InfoLogLevelTest\ org.rocksdb.test.KeyMayExistTest\ org.rocksdb.test.MemTableTest\ org.rocksdb.test.MergeTest\ diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java index 6ab276755..e19ee9a0a 100644 --- a/java/org/rocksdb/DBOptions.java +++ b/java/org/rocksdb/DBOptions.java @@ -83,11 +83,26 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { @Override public DBOptions setRateLimiterConfig(RateLimiterConfig config) { + assert(isInitialized()); rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; } + @Override + public DBOptions setInfoLogLevel(InfoLogLevel infoLogLevel) { + assert(isInitialized()); + setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); + return this; + } + + @Override + public InfoLogLevel infoLogLevel() { + assert(isInitialized()); + return InfoLogLevel.getInfoLogLevel( + infoLogLevel(nativeHandle_)); + } + @Override public DBOptions setMaxOpenFiles(int maxOpenFiles) { assert(isInitialized()); @@ -487,6 +502,8 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { private native boolean paranoidChecks(long handle); private native void setRateLimiter(long handle, long rateLimiterHandle); + private native void setInfoLogLevel(long handle, byte logLevel); + private native byte infoLogLevel(long handle); private native void setMaxOpenFiles(long handle, int maxOpenFiles); private native int maxOpenFiles(long handle); private native void setMaxTotalWalSize(long handle, diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java index d3df483cb..19ffe375d 100644 --- a/java/org/rocksdb/DBOptionsInterface.java +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -113,6 +113,20 @@ public interface DBOptionsInterface { */ Object setRateLimiterConfig(RateLimiterConfig config); + /** + *
Sets the RocksDB log level. Default level is INFO
+ * + * @param infoLogLevel log level to set. + * @return the instance of the current Object. + */ + Object setInfoLogLevel(InfoLogLevel infoLogLevel); + + /** + * Returns currently set log level.
            + * @return {@link org.rocksdb.InfoLogLevel} instance. + */ + InfoLogLevel infoLogLevel(); + /** * Number of open files that can be used by the DB. You may need to * increase this if your database has a large working set. Value -1 means diff --git a/java/org/rocksdb/InfoLogLevel.java b/java/org/rocksdb/InfoLogLevel.java new file mode 100644 index 000000000..0a4a0e6ea --- /dev/null +++ b/java/org/rocksdb/InfoLogLevel.java @@ -0,0 +1,44 @@ +package org.rocksdb; + +/** + * RocksDB log levels. + */ +public enum InfoLogLevel { + DEBUG_LEVEL((byte)0), + INFO_LEVEL((byte)1), + WARN_LEVEL((byte)2), + ERROR_LEVEL((byte)3), + FATAL_LEVEL((byte)4), + NUM_INFO_LOG_LEVELS((byte)5); + + private final byte value_; + + private InfoLogLevel(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + /** + * Get InfoLogLevel by byte value. + * + * @param value byte representation of InfoLogLevel. + * + * @return {@link org.rocksdb.InfoLogLevel} instance or null. + */ + public static InfoLogLevel getInfoLogLevel(byte value) { + for (InfoLogLevel infoLogLevel : InfoLogLevel.values()) { + if (infoLogLevel.getValue() == value){ + return infoLogLevel; + } + } + return null; + } +} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index c5ea7216e..0d2a79698 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -604,6 +604,7 @@ public class Options extends RocksObject @Override public Options setMemTableConfig(MemTableConfig config) throws RocksDBException { + assert(isInitialized()); memTableConfig_ = config; setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; @@ -611,11 +612,26 @@ public class Options extends RocksObject @Override public Options setRateLimiterConfig(RateLimiterConfig config) { + assert(isInitialized()); rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; } + @Override + public Options setInfoLogLevel(InfoLogLevel infoLogLevel) { + assert(isInitialized()); + setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); + return this; + } + + @Override + public InfoLogLevel infoLogLevel() { + assert(isInitialized()); + return InfoLogLevel.getInfoLogLevel( + infoLogLevel(nativeHandle_)); + } + @Override public String memTableFactoryName() { assert(isInitialized()); @@ -1025,6 +1041,8 @@ public class Options extends RocksObject private native boolean paranoidChecks(long handle); private native void setRateLimiter(long handle, long rateLimiterHandle); + private native void setInfoLogLevel(long handle, byte logLevel); + private native byte infoLogLevel(long handle); private native void setMaxOpenFiles(long handle, int maxOpenFiles); private native int maxOpenFiles(long handle); private native void setMaxTotalWalSize(long handle, diff --git a/java/org/rocksdb/test/InfoLogLevelTest.java b/java/org/rocksdb/test/InfoLogLevelTest.java new file mode 100644 index 000000000..c2da83979 --- /dev/null +++ b/java/org/rocksdb/test/InfoLogLevelTest.java @@ -0,0 +1,98 @@ +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import java.io.IOException; + +import static java.nio.file.Files.readAllBytes; +import static java.nio.file.Paths.get; +import static org.assertj.core.api.Assertions.assertThat; + +public class InfoLogLevelTest 
{ + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void testInfoLogLevel() throws RocksDBException, + IOException { + RocksDB db = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + assertThat(getLogContents()).isNotEmpty(); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void testFatalLogLevel() throws RocksDBException, + IOException { + RocksDB db = null; + Options options = null; + try { + options = new Options(). + setCreateIfMissing(true). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + assertThat(options.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + assertThat(getLogContents()).isEmpty(); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void testFatalLogLevelWithDBOptions() + throws RocksDBException, IOException { + RocksDB db = null; + Options options = null; + DBOptions dbOptions = null; + try { + dbOptions = new DBOptions(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + options = new Options(dbOptions, + new ColumnFamilyOptions()). + setCreateIfMissing(true); + assertThat(dbOptions.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + assertThat(options.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + assertThat(getLogContents()).isEmpty(); + } finally { + if (db != null) { + db.close(); + } + } + } + + /** + * Read LOG file contents into String. + * + * @return LOG file contents as String. + * @throws IOException if file is not found. 
+ */ + private String getLogContents() throws IOException { + return new String(readAllBytes(get( + dbFolder.getRoot().getAbsolutePath()+ "/LOG"))); + } +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index d725cc305..50bab7a1b 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -625,6 +625,28 @@ void Java_org_rocksdb_Options_setRateLimiter( reinterpret_cast(jrate_limiter_handle)); } +/* + * Class: org_rocksdb_Options + * Method: setInfoLogLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_Options_setInfoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + reinterpret_cast(jhandle)->info_log_level = + static_cast(jlog_level); +} + +/* + * Class: org_rocksdb_Options + * Method: infoLogLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_Options_infoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return static_cast( + reinterpret_cast(jhandle)->info_log_level); +} + /* * Class: org_rocksdb_Options * Method: tableCacheNumshardbits @@ -2835,6 +2857,28 @@ void Java_org_rocksdb_DBOptions_setRateLimiter( reinterpret_cast(jrate_limiter_handle)); } +/* + * Class: org_rocksdb_DBOptions + * Method: setInfoLogLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_DBOptions_setInfoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + reinterpret_cast(jhandle)->info_log_level = + static_cast(jlog_level); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: infoLogLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_DBOptions_infoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return static_cast( + reinterpret_cast(jhandle)->info_log_level); +} + /* * Class: org_rocksdb_DBOptions * Method: setMaxTotalWalSize From 4947a0674f67cc32d5262deb2c398acf6e3eb2ff Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 00:14:36 +0100 Subject: [PATCH 502/829] [RocksJava] Incorporated review comments D28947 --- java/org/rocksdb/InfoLogLevel.java | 5 ++++- java/org/rocksdb/Options.java | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/java/org/rocksdb/InfoLogLevel.java b/java/org/rocksdb/InfoLogLevel.java index 0a4a0e6ea..e67063c68 100644 --- a/java/org/rocksdb/InfoLogLevel.java +++ b/java/org/rocksdb/InfoLogLevel.java @@ -32,6 +32,8 @@ public enum InfoLogLevel { * @param value byte representation of InfoLogLevel. * * @return {@link org.rocksdb.InfoLogLevel} instance or null. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. 
*/ public static InfoLogLevel getInfoLogLevel(byte value) { for (InfoLogLevel infoLogLevel : InfoLogLevel.values()) { @@ -39,6 +41,7 @@ public enum InfoLogLevel { return infoLogLevel; } } - return null; + throw new IllegalArgumentException( + "Illegal value provided for InfoLogLevel."); } } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 0d2a79698..55f3defd2 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -604,7 +604,6 @@ public class Options extends RocksObject @Override public Options setMemTableConfig(MemTableConfig config) throws RocksDBException { - assert(isInitialized()); memTableConfig_ = config; setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); return this; @@ -612,7 +611,6 @@ public class Options extends RocksObject @Override public Options setRateLimiterConfig(RateLimiterConfig config) { - assert(isInitialized()); rateLimiterConfig_ = config; setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); return this; From 5c04acda08c16853c9aa7790d8f45fdd0821c682 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 15:43:10 -0800 Subject: [PATCH 503/829] Explicitly clean JobContext Summary: This way we can gurantee that old MemTables get destructed before DBImpl gets destructed, which might be useful if we want to make them depend on state from DBImpl. Test Plan: make check with asserts in JobContext's destructor Reviewers: ljin, sdong, yhchiang, rven, jonahcohen Reviewed By: jonahcohen Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28959 --- db/db_filesnapshot.cc | 1 + db/db_impl.cc | 8 ++++++++ db/db_impl.h | 2 +- db/db_test.cc | 1 + db/job_context.h | 15 +++++++++++++-- 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 64e5e437c..c35d8e796 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -19,6 +19,7 @@ #include #include "db/db_impl.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/env.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index a4ea5af12..db0d64f4e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -32,6 +32,7 @@ #include "db/db_iter.h" #include "db/dbformat.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" @@ -287,6 +288,7 @@ DBImpl::~DBImpl() { if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); } // versions need to be destroyed before table_cache since it can hold @@ -666,6 +668,7 @@ void DBImpl::DeleteObsoleteFiles() { if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); } Status DBImpl::Recover( @@ -1343,6 +1346,7 @@ Status DBImpl::CompactFilesImpl( if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); mutex_.Lock(); } @@ -1808,6 +1812,7 @@ void DBImpl::BackgroundCallFlush() { if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); mutex_.Lock(); } @@ -1884,6 +1889,7 @@ void DBImpl::BackgroundCallCompaction() { if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); mutex_.Lock(); } @@ -2190,6 +2196,7 @@ static void CleanupIteratorState(void* arg1, void* arg2) { if (job_context.HaveSomethingToDelete()) { state->db->PurgeObsoleteFiles(job_context); } + 
job_context.Clean(); } delete state; @@ -3306,6 +3313,7 @@ Status DBImpl::DeleteFile(std::string name) { if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); { MutexLock l(&mutex_); // schedule flush if file deletion means we freed the space for flushes to diff --git a/db/db_impl.h b/db/db_impl.h index 1106a281d..c3c7c72a1 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -38,7 +38,6 @@ #include "db/write_controller.h" #include "db/flush_scheduler.h" #include "db/write_thread.h" -#include "db/job_context.h" namespace rocksdb { @@ -49,6 +48,7 @@ class VersionEdit; class VersionSet; class CompactionFilterV2; class Arena; +struct JobContext; class DBImpl : public DB { public: diff --git a/db/db_test.cc b/db/db_test.cc index a3ad82c51..dbf3506b9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -19,6 +19,7 @@ #include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "rocksdb/cache.h" diff --git a/db/job_context.h b/db/job_context.h index d73e817a6..9b14d5995 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -21,7 +21,8 @@ class MemTable; struct JobContext { inline bool HaveSomethingToDelete() const { return candidate_files.size() || sst_delete_files.size() || - log_delete_files.size(); + log_delete_files.size() || new_superversion != nullptr || + superversions_to_free.size() > 0 || memtables_to_free.size() > 0; } // Structure to store information for candidate files to delete. @@ -73,7 +74,7 @@ struct JobContext { new_superversion = create_superversion ? new SuperVersion() : nullptr; } - ~JobContext() { + void Clean() { // free pending memtables for (auto m : memtables_to_free) { delete m; @@ -85,6 +86,16 @@ struct JobContext { // if new_superversion was not used, it will be non-nullptr and needs // to be freed here delete new_superversion; + + memtables_to_free.clear(); + superversions_to_free.clear(); + new_superversion = nullptr; + } + + ~JobContext() { + assert(memtables_to_free.size() == 0); + assert(superversions_to_free.size() == 0); + assert(new_superversion == nullptr); } }; From 8a1bcc39c5b53d0fc4e16f539564e0ff0bccdddf Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 01:09:54 +0100 Subject: [PATCH 504/829] [RocksJava] Bump version to 3.8 in rocksjni.pom --- java/rocksjni.pom | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 552e26f8e..c0ee88c22 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -8,7 +8,7 @@ http://rocksdb.org/ org.rocksdb rocksdbjni - 3.6.0 + 3.8.0 RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files for Mac OSX. From 84af2ff8d3b90d820e75b7e91574fdd6b8092540 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 16:20:24 -0800 Subject: [PATCH 505/829] Clean job context in DeleteFile --- db/db_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index db0d64f4e..fb851b3d4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3268,6 +3268,7 @@ Status DBImpl::DeleteFile(std::string name) { if (!status.ok()) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s failed. 
File not found\n", name.c_str()); + job_context.Clean(); return Status::InvalidArgument("File not found"); } assert(level < cfd->NumberLevels()); @@ -3276,6 +3277,7 @@ Status DBImpl::DeleteFile(std::string name) { if (metadata->being_compacted) { Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); + job_context.Clean(); return Status::OK(); } @@ -3287,6 +3289,7 @@ Status DBImpl::DeleteFile(std::string name) { if (vstoreage->NumLevelFiles(i) != 0) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); + job_context.Clean(); return Status::InvalidArgument("File not in last level"); } } @@ -3296,6 +3299,7 @@ Status DBImpl::DeleteFile(std::string name) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s failed ---" " target file in level 0 must be the oldest.", name.c_str()); + job_context.Clean(); return Status::InvalidArgument("File in level 0, but not oldest"); } edit.SetColumnFamily(cfd->GetID()); From e7960c03ac5aede6d7d43ce1d24de84d78eb535d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 16:23:56 -0800 Subject: [PATCH 506/829] Don't parallelize the build in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 70e213e02..ad2129d27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. -script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check -j8 +script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check notifications: email: false From 0ce38fe983c5827b8424719c95f3d0aa8ef99fd4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 16:33:41 -0800 Subject: [PATCH 507/829] Fix signed/unsigned compile --- util/options_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/options_builder.cc b/util/options_builder.cc index 12130db52..a92a5e86e 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -158,7 +158,7 @@ void OptimizeForLevel(int read_amplification_threshold, // Now always set level multiplier to be 10 options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier; - const int kMinFileSize = 2 * kBytesForOneMb; + const uint64_t kMinFileSize = 2 * kBytesForOneMb; // Allow at least 3-way parallelism for compaction between level 1 and 2. 
uint64_t max_file_size = max_bytes_for_level_base / 3; if (max_file_size < kMinFileSize) { From 23295b74b665d94114b20409ec0636e550f581b9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 14 Nov 2014 16:57:17 -0800 Subject: [PATCH 508/829] Clean job_context --- db/db_filesnapshot.cc | 1 + db/flush_job_test.cc | 2 ++ 2 files changed, 3 insertions(+) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index c35d8e796..a442c68b2 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -69,6 +69,7 @@ Status DBImpl::EnableFileDeletions(bool force) { if (should_purge_files) { PurgeObsoleteFiles(job_context); } + job_context.Clean(); LogFlush(db_options_.info_log); return Status::OK(); } diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 0fa5b4e57..33d1abe86 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -85,6 +85,7 @@ TEST(FlushJobTest, Empty) { SequenceNumber(), &job_context, nullptr, nullptr, kNoCompression, nullptr); ASSERT_OK(flush_job.Run()); + job_context.Clean(); } TEST(FlushJobTest, NonEmpty) { @@ -113,6 +114,7 @@ TEST(FlushJobTest, NonEmpty) { ASSERT_OK(flush_job.Run()); mutex_.Unlock(); mock_table_factory_->AssertSingleFile(inserted_keys); + job_context.Clean(); } } // namespace rocksdb From b8d5e3f08e075fb37af73fdd9dceb7d369f38dd7 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 20:12:04 +0100 Subject: [PATCH 509/829] [RocksJava] MVN Build reads version from version.h --- java/rocksjni.pom | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/java/rocksjni.pom b/java/rocksjni.pom index c0ee88c22..e18a7734d 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -8,7 +8,8 @@ http://rocksdb.org/ org.rocksdb rocksdbjni - 3.8.0 + + - RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files for Mac OSX. 
@@ -139,6 +140,38 @@ + + org.codehaus.gmaven + groovy-maven-plugin + 2.0 + + + process-classes + + execute + + + + Xenu + + + String fileContents = new File("${project.basedir}/../include/rocksdb/version.h").getText('UTF-8') + matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) + String major_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) + String minor_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) + String patch_version = matcher.getAt(0).getAt(1) + String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) + // Set version to be used in pom.properties + project.version = version + // Set version to be set as jar name + project.build.finalName = project.artifactId + "-" + version + + + + + From c3915abbae3b99686b24a900fa51a2816e8648ba Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 15 Nov 2014 19:54:44 +0000 Subject: [PATCH 510/829] Minor tidyup and use Java 7 for file copying --- java/org/rocksdb/NativeLibraryLoader.java | 47 ++++++++++------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index bf0196e77..73170ba68 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -1,6 +1,9 @@ package org.rocksdb; import java.io.*; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; + import org.rocksdb.util.Environment; /** @@ -8,42 +11,32 @@ import org.rocksdb.util.Environment; * The shared library is extracted to a temp folder and loaded from there. */ public class NativeLibraryLoader { - private static String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); - private static String tempFileSuffix = "." + Environment.getJniLibraryExtension(); + private static final String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String tempFilePrefix = "librocksdbjni"; + private static final String tempFileSuffix = "." 
+ Environment.getJniLibraryExtension(); - public static void loadLibraryFromJar(String tmpDir) + public static void loadLibraryFromJar(final String tmpDir) throws IOException { - File temp; - String tempFilePrefix = "librocksdbjni"; - if(tmpDir == null || tmpDir.equals("")) + final File temp; + if(tmpDir == null || tmpDir.equals("")) { temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - else - temp = new File(tmpDir + "/" + sharedLibraryName); - - temp.deleteOnExit(); + } else { + temp = new File(tmpDir, sharedLibraryName); + } if (!temp.exists()) { throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); } - byte[] buffer = new byte[102400]; - int readBytes; - - InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName); - if (is == null) { - throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); - } - - OutputStream os = null; - try { - os = new FileOutputStream(temp); - while ((readBytes = is.read(buffer)) != -1) { - os.write(buffer, 0, readBytes); + // attempt to copy the library from the JAR to the temp destination + try(final InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName)) { + if (is == null) { + throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); } - } finally { - if(os != null) - os.close(); - is.close(); } System.load(temp.getAbsolutePath()); From 585c759cf382945102d0f0f7fb70dc33612d9970 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 15 Nov 2014 23:41:01 +0000 Subject: [PATCH 511/829] Make sure to use the correct Java classloader for loading the RocksDB Native Library --- java/org/rocksdb/NativeLibraryLoader.java | 30 ++++++++++++++++++++--- java/org/rocksdb/RocksDB.java | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 73170ba68..32836f670 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -11,11 +11,35 @@ import org.rocksdb.util.Environment; * The shared library is extracted to a temp folder and loaded from there. */ public class NativeLibraryLoader { + //singleton + private static final NativeLibraryLoader instance = new NativeLibraryLoader(); + private static final String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = "." + Environment.getJniLibraryExtension(); - public static void loadLibraryFromJar(final String tmpDir) + /** + * Get a reference to the NativeLibraryLoader + * + * @return The NativeLibraryLoader + */ + public static NativeLibraryLoader getInstance() { + return instance; + } + + /** + * Attempts to extract the native RocksDB library + * from the classpath and load it + * + * @param tmpDir A temporary directory to use + * to copy the native library to. If null, + * or the empty string, we rely on Java's + * {@see java.io.File#createTempFile(String, String) } + * function to provide a temporary location. + * The temporary file will be registered for deletion + * on exit. 
+ */ + public void loadLibraryFromJar(final String tmpDir) throws IOException { final File temp; if(tmpDir == null || tmpDir.equals("")) { @@ -30,8 +54,8 @@ public class NativeLibraryLoader { temp.deleteOnExit(); } - // attempt to copy the library from the JAR to the temp destination - try(final InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream(sharedLibraryName)) { + // attempt to copy the library from the Jar file to the temp destination + try(final InputStream is = getClass().getClassLoader().getResourceAsStream(sharedLibraryName)) { if (is == null) { throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); } else { diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 5fcfd2ff4..79b00f3c0 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -44,7 +44,7 @@ public class RocksDB extends RocksObject { } try { - NativeLibraryLoader.loadLibraryFromJar(tmpDir); + NativeLibraryLoader.getInstance().loadLibraryFromJar(tmpDir); } catch (IOException e) { From d3c4a0f4aea2a097680b7663b34508030db5bda3 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Sat, 15 Nov 2014 17:05:52 -0800 Subject: [PATCH 512/829] Improve the comment in InfoLogLevelTest.java Summary: Improve the comment in InfoLogLevelTest.java Test Plan: make rocksdbjava --- java/org/rocksdb/test/InfoLogLevelTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/java/org/rocksdb/test/InfoLogLevelTest.java b/java/org/rocksdb/test/InfoLogLevelTest.java index c2da83979..f96ca92b9 100644 --- a/java/org/rocksdb/test/InfoLogLevelTest.java +++ b/java/org/rocksdb/test/InfoLogLevelTest.java @@ -50,6 +50,8 @@ public class InfoLogLevelTest { db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); db.put("key".getBytes(), "value".getBytes()); + // As InfoLogLevel is set to FATAL_LEVEL, here we expect the log + // content to be empty. 
assertThat(getLogContents()).isEmpty(); } finally { if (db != null) { From 98e59f9813f3cb6c6573c977d67eca77fd7708e2 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Sun, 16 Nov 2014 21:52:23 -0800 Subject: [PATCH 513/829] Fixed a bug which could hide non-ok status in CompactionJob::Run() Summary: Fixed a bug which could hide non-ok status in CompactionJob::Run() Test Plan: make Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28995 --- db/compaction_job.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index bc514a2e8..d836ccd30 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -385,17 +385,22 @@ Status CompactionJob::Run() { compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); + if (!status.ok()) { + break; + } compact_->CleanupBatchBuffer(); compact_->CleanupMergedBuffer(); } } // done processing all prefix batches // finish the last batch - if (compact_->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compaction_filter_v2); + if (status.ok()) { + if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); } - compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); } // checking for compaction filter v2 if (status.ok() && From e97f014b913069d685359bce13afbbb951c73e13 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 17 Nov 2014 19:27:52 +0100 Subject: [PATCH 514/829] [RocksJava] JavaDoc corrections - Java8 This commit solves build problems in Java8 due to wrong JavaDoc. --- java/org/rocksdb/NativeLibraryLoader.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 32836f670..1aa9a8b16 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -34,10 +34,12 @@ public class NativeLibraryLoader { * @param tmpDir A temporary directory to use * to copy the native library to. If null, * or the empty string, we rely on Java's - * {@see java.io.File#createTempFile(String, String) } + * {@link java.io.File#createTempFile(String, String)} * function to provide a temporary location. * The temporary file will be registered for deletion * on exit. + * + * @throws java.io.IOException if a filesystem operation fails. */ public void loadLibraryFromJar(final String tmpDir) throws IOException { From 5529c1ad1b50a4e84e4ca75ab194e35eede9b3a3 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 10 Nov 2014 12:55:58 +0100 Subject: [PATCH 515/829] [RocksJava] GetIntProperty in RocksDB Expose GetIntProperty methods to RocksJava. As the integer(64-Bit) value is no integer in Java the method is aligned with the return type which is long. 
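For illustration only (not part of the patch): a short sketch of the new numeric property accessor, assuming the getLongProperty() overloads added below; the path and property name are examples taken from the accompanying tests.

// Illustrative sketch (not part of this patch) of the new numeric
// property accessor. The property name is one used by the tests in
// this series; the database path is a hypothetical example.
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class LongPropertyExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    final Options options = new Options().setCreateIfMissing(true);
    RocksDB db = null;
    try {
      db = RocksDB.open(options, "/tmp/rocksdb-longproperty-example");
      db.put("key1".getBytes(), "value1".getBytes());
      // getLongProperty() returns the value as a Java long ...
      final long entries =
          db.getLongProperty("rocksdb.num-entries-active-mem-table");
      // ... whereas getProperty() returns the same information as a String.
      final String asString =
          db.getProperty("rocksdb.num-entries-active-mem-table");
      System.out.println(entries + " / " + asString);
    } finally {
      if (db != null) {
        db.close();
      }
      options.dispose();
    }
  }
}

The column-family overload works the same way but takes a ColumnFamilyHandle as its first argument.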
--- java/org/rocksdb/RocksDB.java | 36 ++++++++++++++++ java/org/rocksdb/test/ColumnFamilyTest.java | 3 +- java/org/rocksdb/test/RocksDBTest.java | 33 ++++++++++++++ java/rocksjni/rocksjni.cc | 48 ++++++++++++++++++++- 4 files changed, 118 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 79b00f3c0..fc45e3611 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1032,6 +1032,38 @@ public class RocksDB extends RocksObject { return getProperty0(nativeHandle_, property, property.length()); } + /** + *

Similar to GetProperty(), but only works for a subset of properties whose + * return value is a numerical value. Return the value as long. + * + * @param property to be fetched. + * + * @return property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty(String property) throws RocksDBException { + return getLongProperty(nativeHandle_, property, property.length()); + } + + /** + *
Similar to GetProperty(), but only works for a subset of properties whose + * return value is a numerical value. Return the value as long.

            + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param property to be fetched. + * + * @return property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty(ColumnFamilyHandle columnFamilyHandle, String property) + throws RocksDBException { + return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, property, + property.length()); + } + /** * Return a heap-allocated iterator over the contents of the database. * The result of newIterator() is initially invalid (caller must @@ -1297,6 +1329,10 @@ public class RocksDB extends RocksObject { String property, int propertyLength) throws RocksDBException; protected native String getProperty0(long nativeHandle, long cfHandle, String property, int propertyLength) throws RocksDBException; + protected native long getLongProperty(long nativeHandle, + String property, int propertyLength) throws RocksDBException; + protected native long getLongProperty(long nativeHandle, long cfHandle, + String property, int propertyLength) throws RocksDBException; protected native long iterator0(long handle); protected native long iterator0(long handle, long cfHandle); protected native long[] iterators(long handle, diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 0a77240ac..92f977ce3 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -351,7 +351,6 @@ public class ColumnFamilyTest { } } - @Test public void properties() throws RocksDBException { RocksDB db = null; @@ -371,6 +370,8 @@ public class ColumnFamilyTest { cfNames, columnFamilyHandleList); assertThat(db.getProperty("rocksdb.estimate-num-keys")). 
isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), + "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); assertThat(db.getProperty("rocksdb.stats")).isNotNull(); assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull(); diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java index 4f51e8b97..5a8613aa1 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -279,4 +279,37 @@ public class RocksDBTest { } } } + + @Test + public void getIntProperty() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteOptions wOpt = null; + try { + options = new Options(); + wOpt = new WriteOptions(); + // Setup options + options.setCreateIfMissing(true); + options.setMaxWriteBufferNumber(10); + options.setMinWriteBufferNumberToMerge(10); + wOpt.setDisableWAL(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table")).isGreaterThan(0); + assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table")).isGreaterThan(0); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (wOpt != null) { + wOpt.dispose(); + } + } + } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 4fa1a544c..5af3c6b68 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1289,6 +1289,53 @@ jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I( return env->NewStringUTF(property_value.data()); } +/* + * Class: org_rocksdb_RocksDB + * Method: getLongProperty + * Signature: (JLjava/lang/String;I)L; + */ +jlong Java_org_rocksdb_RocksDB_getLongProperty__JLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, + jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + uint64_t property_value = 0; + bool retCode = db->GetIntProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + return property_value; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getLongProperty + * Signature: (JJLjava/lang/String;I)L; + */ +jlong Java_org_rocksdb_RocksDB_getLongProperty__JJLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + uint64_t property_value; + bool retCode = db->GetIntProperty(cf_handle, property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + return property_value; +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Flush @@ -1332,4 +1379,3 @@ void Java_org_rocksdb_RocksDB_flush__JJJ( auto cf_handle = 
reinterpret_cast(jcf_handle); rocksdb_flush_helper(env, db, *flush_options, cf_handle); } - From 8efd4bb42494143cc9ed160ece0cfb9eaadd2334 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 17 Nov 2014 21:29:05 +0100 Subject: [PATCH 516/829] [RocksJava] Improved comments in RocksDB class Improved comments in RocksDB getLongProperty methods, to describe the behavior more detailed. --- java/org/rocksdb/RocksDB.java | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index fc45e3611..bb88710ed 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1036,9 +1036,21 @@ public class RocksDB extends RocksObject { *

Similar to GetProperty(), but only works for a subset of properties whose * return value is a numerical value. Return the value as long. * + * Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values. + * + * Java 7: To mitigate the problem of the non + * existent unsigned long type, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values. + * + * Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.

            + * * @param property to be fetched. * - * @return property value + * @return numerical property value. * * @throws RocksDBException if an error happens in the underlying native code. */ @@ -1050,11 +1062,23 @@ public class RocksDB extends RocksObject { *

Similar to GetProperty(), but only works for a subset of properties whose * return value is a numerical value. Return the value as long. * + * Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values. + * + * Java 7: To mitigate the problem of the non + * existent unsigned long type, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values. + * + * Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.

            + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param property to be fetched. * - * @return property value + * @return numerical property value * * @throws RocksDBException if an error happens in the underlying native code. */ From 517c28994d35a26fe0c5c9303bb6644a6c582827 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Mon, 17 Nov 2014 13:47:51 -0800 Subject: [PATCH 517/829] Options helper supports k, m, g, and t unit suffixes Summary: Add unit support in options helper so we can specify, e.g., 10m for 10 megabytes. Test Plan: Updated options_test Reviewers: sdong, igor, ljin Reviewed By: ljin Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28977 --- util/options_helper.cc | 48 ++++++++++++++++++++++++++++++++++++------ util/options_test.cc | 22 +++++++++++++++++++ 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/util/options_helper.cc b/util/options_helper.cc index bffcc1f5c..111215282 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -43,20 +43,56 @@ bool ParseBoolean(const std::string& type, const std::string& value) { throw type; } } -int ParseInt(const std::string& value) { return std::stoi(value); } - -uint32_t ParseUint32(const std::string& value) { - return static_cast(std::stoul(value)); -} uint64_t ParseUint64(const std::string& value) { - return std::stoull(value); + size_t endchar; + uint64_t num = std::stoull(value.c_str(), &endchar); + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; } size_t ParseSizeT(const std::string& value) { return static_cast(ParseUint64(value)); } +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +int ParseInt(const std::string& value) { + size_t endchar; + int num = std::stoi(value.c_str(), &endchar); + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + double ParseDouble(const std::string& value) { return std::stod(value); } diff --git a/util/options_test.cc b/util/options_test.cc index 6bf2f0b0f..b2087608f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -298,6 +298,28 @@ TEST(OptionsTest, GetOptionsFromStringTest) { // Missing option name ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=13; =100;", &new_cf_opt)); + // Units (k) + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14*1024); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024); + // Units (m) + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024*1024); + // Units (g) + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024LL*1024LL*1024LL); + 
ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024LL*1024LL*1024LL); + // Units (t) + ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024LL*1024LL*1024LL*1024LL); + ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024LL*1024LL*1024LL*1024LL); } } // namespace rocksdb From 1e4a45aac88533fd9aefac643f0360a44afe6c61 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 18 Nov 2014 10:19:48 -0800 Subject: [PATCH 518/829] remove cfd->options() in DBImpl::NotifyOnFlushCompleted Summary: We should not reference cfd->options() directly! Test Plan: make release Reviewers: sdong, rven, igor, yhchiang Reviewed By: igor, yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29061 --- db/column_family.h | 1 + db/db_impl.cc | 9 +++++---- db/db_impl.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/db/column_family.h b/db/column_family.h index b421e44c6..f24105fbe 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -173,6 +173,7 @@ class ColumnFamilyData { uint64_t GetLogNumber() const { return log_number_; } // thread-safe + // To be deprecated! Please don't not use this function anymore! const Options* options() const { return &options_; } const EnvOptions* soptions() const; const ImmutableCFOptions* ioptions() const { return &ioptions_; } diff --git a/db/db_impl.cc b/db/db_impl.cc index fb851b3d4..587ca2068 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1095,24 +1095,25 @@ Status DBImpl::FlushMemTableToOutputFile( #ifndef ROCKSDB_LITE if (s.ok()) { // may temporarily unlock and lock the mutex. - NotifyOnFlushCompleted(cfd, file_number); + NotifyOnFlushCompleted(cfd, file_number, mutable_cf_options); } #endif // ROCKSDB_LITE return s; } void DBImpl::NotifyOnFlushCompleted( - ColumnFamilyData* cfd, uint64_t file_number) { + ColumnFamilyData* cfd, uint64_t file_number, + const MutableCFOptions& mutable_cf_options) { mutex_.AssertHeld(); if (shutting_down_.load(std::memory_order_acquire)) { return; } bool triggered_flush_slowdown = (cfd->current()->storage_info()->NumLevelFiles(0) >= - cfd->options()->level0_slowdown_writes_trigger); + mutable_cf_options.level0_slowdown_writes_trigger); bool triggered_flush_stop = (cfd->current()->storage_info()->NumLevelFiles(0) >= - cfd->options()->level0_stop_writes_trigger); + mutable_cf_options.level0_stop_writes_trigger); notifying_events_++; // release lock while notifying events mutex_.Unlock(); diff --git a/db/db_impl.h b/db/db_impl.h index c3c7c72a1..cce238284 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -261,7 +261,8 @@ class DBImpl : public DB { Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, Arena* arena); - void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number); + void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number, + const MutableCFOptions& mutable_cf_options); private: friend class DB; From 8d3f8f9696664378aff7f53f739ee6904ef0bbba Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 18 Nov 2014 10:20:10 -0800 Subject: [PATCH 519/829] remove all remaining references to cfd->options() Summary: The very last reference happens in DBImpl::GetOptions() I built with both DBImpl::GetOptions() and ColumnFamilyData::options() commented out Test Plan: make all check Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29073 
--- db/column_family.cc | 5 ----- db/column_family.h | 9 +++------ db/db_impl_debug.cc | 4 ++-- db/forward_iterator.cc | 2 +- db/internal_stats.cc | 4 ++-- db/repair.cc | 2 +- db/version_set.cc | 2 +- db/write_batch_internal.h | 11 ++--------- db/write_batch_test.cc | 2 +- include/rocksdb/options.h | 3 +++ java/rocksjni/write_batch.cc | 2 +- table/table_test.cc | 2 +- util/mutable_cf_options.cc | 2 ++ util/mutable_cf_options.h | 3 +++ util/options_helper.cc | 5 ++--- 15 files changed, 25 insertions(+), 33 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 5261acc8c..8456ed9ca 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -752,11 +752,6 @@ MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const { return current_->mem(); } -const Options* ColumnFamilyMemTablesImpl::GetOptions() const { - assert(current_ != nullptr); - return current_->options(); -} - ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { assert(current_ != nullptr); return &handle_; diff --git a/db/column_family.h b/db/column_family.h index f24105fbe..c6d49e71b 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -172,9 +172,10 @@ class ColumnFamilyData { void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - // thread-safe - // To be deprecated! Please don't not use this function anymore! + // !!! To be deprecated! Please don't not use this function anymore! const Options* options() const { return &options_; } + + // thread-safe const EnvOptions* soptions() const; const ImmutableCFOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held @@ -444,10 +445,6 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // REQUIRES: Seek() called first virtual MemTable* GetMemTable() const override; - // Returns options for selected column family - // REQUIRES: Seek() called first - virtual const Options* GetOptions() const override; - // Returns column family handle for the selected column family virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index ea8f5e13b..65eaff6b3 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -81,8 +81,8 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin, cfd = cfh->cfd(); } int output_level = - (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) + (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) ? 
level : level + 1; return RunManualCompaction(cfd, level, output_level, 0, begin, end); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 7fd625a00..93af3c2d4 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -119,7 +119,7 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, : db_(db), read_options_(read_options), cfd_(cfd), - prefix_extractor_(cfd->options()->prefix_extractor.get()), + prefix_extractor_(cfd->ioptions()->prefix_extractor), user_comparator_(cfd->user_comparator()), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), sv_(current_sv), diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 1afe31520..33842fed8 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -361,8 +361,8 @@ void InternalStats::DumpCFStats(std::string* value) { const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); int num_levels_to_check = - (cfd_->options()->compaction_style != kCompactionStyleUniversal && - cfd_->options()->compaction_style != kCompactionStyleFIFO) + (cfd_->ioptions()->compaction_style != kCompactionStyleUniversal && + cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) ? vstorage->num_levels() - 1 : 1; diff --git a/db/repair.cc b/db/repair.cc index f23e757b0..8fa312638 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -222,7 +222,7 @@ class Repairer { WriteBatch batch; MemTable* mem = new MemTable(icmp_, ioptions_, MutableCFOptions(options_, ioptions_)); - auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); + auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { diff --git a/db/version_set.cc b/db/version_set.cc index 97215ce0c..db8808687 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2639,7 +2639,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { auto cfd = c->column_family_data(); ReadOptions read_options; read_options.verify_checksums = - cfd->options()->verify_checksums_in_compaction; + c->mutable_cf_options()->verify_checksums_in_compaction; read_options.fill_cache = false; // Level-0 files have to be merged together. 
For other levels, diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 568cd70d8..793c0d40f 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -26,15 +26,14 @@ class ColumnFamilyMemTables { // been processed) virtual uint64_t GetLogNumber() const = 0; virtual MemTable* GetMemTable() const = 0; - virtual const Options* GetOptions() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; virtual void CheckMemtableFull() = 0; }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { public: - ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options) - : ok_(false), mem_(mem), options_(options) {} + explicit ColumnFamilyMemTablesDefault(MemTable* mem) + : ok_(false), mem_(mem) {} bool Seek(uint32_t column_family_id) override { ok_ = (column_family_id == 0); @@ -48,11 +47,6 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { return mem_; } - const Options* GetOptions() const override { - assert(ok_); - return options_; - } - ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } void CheckMemtableFull() override {} @@ -60,7 +54,6 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { private: bool ok_; MemTable* mem_; - const Options* const options_; }; // WriteBatchInternal provides static methods for manipulating a diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index d24b2e068..7f180d9e6 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -32,7 +32,7 @@ static std::string PrintContents(WriteBatch* b) { MutableCFOptions(options, ioptions)); mem->Ref(); std::string state; - ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + ColumnFamilyMemTablesDefault cf_mems_default(mem); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; Arena arena; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 102143301..dd05aa9de 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -421,7 +421,10 @@ struct ColumnFamilyOptions { // If true, compaction will verify checksum on every read that happens // as part of compaction + // // Default: true + // + // Dynamically changeable through SetOptions() API bool verify_checksums_in_compaction; // The options needed to support Universal Style compactions diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index d243c87a0..8adcfdc0f 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -344,7 +344,7 @@ jbyteArray Java_org_rocksdb_test_WriteBatchTest_getContents( rocksdb::ImmutableCFOptions(options))); mem->Ref(); std::string state; - rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; diff --git a/table/table_test.cc b/table/table_test.cc index facf0926e..a02846ccc 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1879,7 +1879,7 @@ TEST(MemTableTest, Simple) { batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); + ColumnFamilyMemTablesDefault cf_mems_default(memtable); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); Arena arena; diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 
c5f4c60b3..4ec2a4138 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -128,6 +128,8 @@ void MutableCFOptions::Dump(Logger* log) const { Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str()); Log(log, " max_mem_compaction_level: %d", max_mem_compaction_level); + Log(log, " verify_checksums_in_compaction: %d", + verify_checksums_in_compaction); Log(log, " max_sequential_skip_in_iterations: %" PRIu64, max_sequential_skip_in_iterations); } diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 40938655b..9f876ace0 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -40,6 +40,7 @@ struct MutableCFOptions { max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), max_mem_compaction_level(options.max_mem_compaction_level), + verify_checksums_in_compaction(options.verify_checksums_in_compaction), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations) { @@ -69,6 +70,7 @@ struct MutableCFOptions { max_bytes_for_level_base(0), max_bytes_for_level_multiplier(0), max_mem_compaction_level(0), + verify_checksums_in_compaction(false), max_sequential_skip_in_iterations(0) {} @@ -114,6 +116,7 @@ struct MutableCFOptions { int max_bytes_for_level_multiplier; std::vector max_bytes_for_level_multiplier_additional; int max_mem_compaction_level; + bool verify_checksums_in_compaction; // Misc options uint64_t max_sequential_skip_in_iterations; diff --git a/util/options_helper.cc b/util/options_helper.cc index 111215282..bea7f1a9d 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -185,6 +185,8 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, } } else if (name == "max_mem_compaction_level") { new_options->max_mem_compaction_level = ParseInt(value); + } else if (name == "verify_checksums_in_compaction") { + new_options->verify_checksums_in_compaction = ParseBoolean(name, value); } else { return false; } @@ -330,9 +332,6 @@ bool GetColumnFamilyOptionsFromMap( ParseBoolean(o.first, o.second); } else if (o.first == "compaction_style") { new_options->compaction_style = ParseCompactionStyle(o.second); - } else if (o.first == "verify_checksums_in_compaction") { - new_options->verify_checksums_in_compaction = - ParseBoolean(o.first, o.second); } else if (o.first == "compaction_options_universal") { // TODO(ljin): add support throw o.first; From 5249d0db50a1a629299404206ad9c50cc24eb3a6 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 16 Nov 2014 22:35:54 +0100 Subject: [PATCH 520/829] [RocksJava] Convenience methods for Options RocksDB introduced in 3.7.0 convenience methods for getting ColumnFamilyOptions and DBOptions instances from predefined configuration structures. There is now also a method in RocksJava to load DBOptions as well as ColumnFamilyOptions from a predefined Properties based configuration. 
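Both new RocksJava methods delegate to the string-based helpers declared in rocksdb/utilities/convenience.h (see the JNI code below). A minimal C++ sketch of those helpers follows; the option keys and values are illustrative examples drawn from the tests in this series.

    #include <string>
    #include "rocksdb/options.h"
    #include "rocksdb/utilities/convenience.h"

    int main() {
      rocksdb::ColumnFamilyOptions base_cf_opt, new_cf_opt;
      // Keys mirror the C++ member names; unit suffixes such as "64m" are
      // accepted after the options_helper change earlier in this series.
      bool ok = rocksdb::GetColumnFamilyOptionsFromString(
          base_cf_opt, "write_buffer_size=64m;max_write_buffer_number=4", &new_cf_opt);

      rocksdb::DBOptions base_db_opt, new_db_opt;
      ok = ok && rocksdb::GetDBOptionsFromString(
          base_db_opt, "allow_mmap_reads=true;bytes_per_sync=1048576", &new_db_opt);
      return ok ? 0 : 1;
    }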
--- java/org/rocksdb/ColumnFamilyOptions.java | 57 +++++++++++++++++++ java/org/rocksdb/DBOptions.java | 57 +++++++++++++++++++ .../rocksdb/test/ColumnFamilyOptionsTest.java | 52 +++++++++++++++++ java/org/rocksdb/test/DBOptionsTest.java | 51 +++++++++++++++++ java/rocksjni/options.cc | 51 +++++++++++++++++ 5 files changed, 268 insertions(+) diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/org/rocksdb/ColumnFamilyOptions.java index 9ce1e9a98..7fa4545b5 100644 --- a/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/org/rocksdb/ColumnFamilyOptions.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Properties; + /** * ColumnFamilyOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). @@ -29,6 +31,58 @@ public class ColumnFamilyOptions extends RocksObject newColumnFamilyOptions(); } + /** + *

Private constructor to be used by + * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)} + * + * @param handle native handle to ColumnFamilyOptions instance. + */ + private ColumnFamilyOptions(long handle) { + super(); + nativeHandle_ = handle; + } + + /** + *
Method to get an options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value. + * + * Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code writeBufferSize()} has a property key: + * {@code write_buffer_size}.

            + * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.ColumnFamilyOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link Properties} instance is passed to the method call. + */ + public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( + Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException( + "Properties value must contain at least one value."); + } + ColumnFamilyOptions columnFamilyOptions = null; + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()){ + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + long handle = getColumnFamilyOptionsFromProps( + stringBuilder.toString()); + if (handle != 0){ + columnFamilyOptions = new ColumnFamilyOptions(handle); + } + return columnFamilyOptions; + } + @Override public ColumnFamilyOptions optimizeForPointLookup( long blockCacheSizeMb) { @@ -522,6 +576,9 @@ public class ColumnFamilyOptions extends RocksObject disposeInternal(nativeHandle_); } + private static native long getColumnFamilyOptionsFromProps( + String optString); + private native void newColumnFamilyOptions(); private native void disposeInternal(long handle); diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java index e19ee9a0a..45113c0f2 100644 --- a/java/org/rocksdb/DBOptions.java +++ b/java/org/rocksdb/DBOptions.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Properties; + /** * DBOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). @@ -29,6 +31,58 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { newDBOptions(); } + /** + *

Private constructor to be used by + * {@link #getDBOptionsFromProps(java.util.Properties)} + * + * @param handle native handle to DBOptions instance. + */ + private DBOptions(long handle) { + super(); + nativeHandle_ = handle; + } + + /** + *
Method to get an options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value. + * + * Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code allowMmapReads()} has a property key: + * {@code allow_mmap_reads}.

            + * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.DBOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link java.util.Properties} instance is passed to the method call. + */ + public static DBOptions getDBOptionsFromProps( + Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException( + "Properties value must contain at least one value."); + } + DBOptions dbOptions = null; + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()){ + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + long handle = getDBOptionsFromProps( + stringBuilder.toString()); + if (handle != 0){ + dbOptions = new DBOptions(handle); + } + return dbOptions; + } + @Override public DBOptions setCreateIfMissing(boolean flag) { assert(isInitialized()); @@ -487,6 +541,9 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { static final int DEFAULT_NUM_SHARD_BITS = -1; + private static native long getDBOptionsFromProps( + String optString); + private native void newDBOptions(); private native void disposeInternal(long handle); diff --git a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java index 7fcfee14c..aae9b5749 100644 --- a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java +++ b/java/org/rocksdb/test/ColumnFamilyOptionsTest.java @@ -9,6 +9,7 @@ import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.*; +import java.util.Properties; import java.util.Random; import static org.assertj.core.api.Assertions.assertThat; @@ -22,6 +23,57 @@ public class ColumnFamilyOptionsTest { public static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); + @Test + public void getColumnFamilyOptionsFromProps() { + ColumnFamilyOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("write_buffer_size", "112"); + properties.put("max_write_buffer_number", "13"); + opt = ColumnFamilyOptions. + getColumnFamilyOptionsFromProps(properties); + assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.writeBufferSize())). + isEqualTo(properties.get("write_buffer_size")); + assertThat(String.valueOf(opt.maxWriteBufferNumber())). + isEqualTo(properties.get("max_write_buffer_number")); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void failColumnFamilyOptionsFromPropsWithIllegalValue() { + ColumnFamilyOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + opt = ColumnFamilyOptions. 
+ getColumnFamilyOptionsFromProps(properties); + assertThat(opt).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failColumnFamilyOptionsFromPropsWithNullValue() { + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null); + } + + @Test(expected = IllegalArgumentException.class) + public void failColumnFamilyOptionsFromPropsWithEmptyProps() { + ColumnFamilyOptions.getColumnFamilyOptionsFromProps( + new Properties()); + } + @Test public void writeBufferSize() throws RocksDBException { ColumnFamilyOptions opt = null; diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/org/rocksdb/test/DBOptionsTest.java index 9a15658e7..6064dd694 100644 --- a/java/org/rocksdb/test/DBOptionsTest.java +++ b/java/org/rocksdb/test/DBOptionsTest.java @@ -9,6 +9,7 @@ import org.junit.ClassRule; import org.junit.Test; import org.rocksdb.*; +import java.util.Properties; import java.util.Random; import static org.assertj.core.api.Assertions.assertThat; @@ -22,6 +23,56 @@ public class DBOptionsTest { public static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); + @Test + public void getDBOptionsFromProps() { + DBOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("allow_mmap_reads", "true"); + properties.put("bytes_per_sync", "13"); + opt = DBOptions.getDBOptionsFromProps(properties); + assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.allowMmapReads())). + isEqualTo(properties.get("allow_mmap_reads")); + assertThat(String.valueOf(opt.bytesPerSync())). + isEqualTo(properties.get("bytes_per_sync")); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void failDBOptionsFromPropsWithIllegalValue() { + DBOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + opt = DBOptions. 
+ getDBOptionsFromProps(properties); + assertThat(opt).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failDBOptionsFromPropsWithNullValue() { + DBOptions.getDBOptionsFromProps(null); + } + + @Test(expected = IllegalArgumentException.class) + public void failDBOptionsFromPropsWithEmptyProps() { + DBOptions.getDBOptionsFromProps( + new Properties()); + } + @Test public void createIfMissing() { DBOptions opt = null; diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 50bab7a1b..82fb1fd1b 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -31,6 +31,7 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/comparator.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/convenience.h" #include "utilities/merge_operators.h" /* @@ -1776,6 +1777,31 @@ void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); } +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: getColumnFamilyOptionsFromProps + * Signature: (Ljava/util/String;)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( + JNIEnv* env, jclass jclazz, jstring jopt_string) { + jlong ret_value = 0; + rocksdb::ColumnFamilyOptions* cf_options = + new rocksdb::ColumnFamilyOptions(); + const char* opt_string = env->GetStringUTFChars(jopt_string, 0); + bool status = rocksdb::GetColumnFamilyOptionsFromString( + rocksdb::ColumnFamilyOptions(), opt_string, cf_options); + env->ReleaseStringUTFChars(jopt_string, opt_string); + // Check if ColumnFamilyOptions creation was possible. + if (status) { + ret_value = reinterpret_cast(cf_options); + } else { + // if operation failed the ColumnFamilyOptions need to be deleted + // again to prevent a memory leak. + delete cf_options; + } + return ret_value; +} + /* * Class: org_rocksdb_ColumnFamilyOptions * Method: disposeInternal @@ -2751,6 +2777,31 @@ void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); } +/* + * Class: org_rocksdb_DBOptions + * Method: getDBOptionsFromProps + * Signature: (Ljava/util/String;)J + */ +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( + JNIEnv* env, jclass jclazz, jstring jopt_string) { + jlong ret_value = 0; + rocksdb::DBOptions* db_options = + new rocksdb::DBOptions(); + const char* opt_string = env->GetStringUTFChars(jopt_string, 0); + bool status = rocksdb::GetDBOptionsFromString( + rocksdb::DBOptions(), opt_string, db_options); + env->ReleaseStringUTFChars(jopt_string, opt_string); + // Check if DBOptions creation was possible. + if (status) { + ret_value = reinterpret_cast(db_options); + } else { + // if operation failed the DBOptions need to be deleted + // again to prevent a memory leak. + delete db_options; + } + return ret_value; +} + /* * Class: org_rocksdb_DBOptions * Method: disposeInternal From 91ccc8ebefc48b5f4fcc0e0a7a220fd808dc52d4 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 18 Nov 2014 21:45:12 +0100 Subject: [PATCH 521/829] [RocksJava] Integrated changes in D29025 Addressed review comments. 
--- java/org/rocksdb/ColumnFamilyOptions.java | 22 +++++++++++----------- java/org/rocksdb/DBOptions.java | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/org/rocksdb/ColumnFamilyOptions.java index 7fa4545b5..3d3b236a2 100644 --- a/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/org/rocksdb/ColumnFamilyOptions.java @@ -31,17 +31,6 @@ public class ColumnFamilyOptions extends RocksObject newColumnFamilyOptions(); } - /** - *

Private constructor to be used by - * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}
            - * - * @param handle native handle to ColumnFamilyOptions instance. - */ - private ColumnFamilyOptions(long handle) { - super(); - nativeHandle_ = handle; - } - /** *

Method to get an options instance by using pre-configured * property values. If one or many values are undefined in @@ -576,6 +565,17 @@ public class ColumnFamilyOptions extends RocksObject disposeInternal(nativeHandle_); } + /** + *
Private constructor to be used by + * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}
            + * + * @param handle native handle to ColumnFamilyOptions instance. + */ + private ColumnFamilyOptions(long handle) { + super(); + nativeHandle_ = handle; + } + private static native long getColumnFamilyOptionsFromProps( String optString); diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java index 45113c0f2..600369dec 100644 --- a/java/org/rocksdb/DBOptions.java +++ b/java/org/rocksdb/DBOptions.java @@ -31,17 +31,6 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { newDBOptions(); } - /** - *

Private constructor to be used by - * {@link #getDBOptionsFromProps(java.util.Properties)}
            - * - * @param handle native handle to DBOptions instance. - */ - private DBOptions(long handle) { - super(); - nativeHandle_ = handle; - } - /** *

Method to get an options instance by using pre-configured * property values. If one or many values are undefined in @@ -541,6 +530,17 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { static final int DEFAULT_NUM_SHARD_BITS = -1; + /** + *
Private constructor to be used by + * {@link #getDBOptionsFromProps(java.util.Properties)}
            + * + * @param handle native handle to DBOptions instance. + */ + private DBOptions(long handle) { + super(); + nativeHandle_ = handle; + } + private static native long getDBOptionsFromProps( String optString); From 9e9a83baf77a204650091a0c80b5e33d3794035e Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 18 Nov 2014 22:21:02 +0100 Subject: [PATCH 522/829] Missing header in build on CentOS While building RocksJava the build fails on CentOS because of the missing stdexcept header. --- util/options_helper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/util/options_helper.h b/util/options_helper.h index 62373b2d5..02c788114 100644 --- a/util/options_helper.h +++ b/util/options_helper.h @@ -6,6 +6,7 @@ #pragma once #include +#include #include "util/mutable_cf_options.h" #include "rocksdb/status.h" From be005e17bb39b2ddadf37cd72e61bf18772aa911 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 18 Nov 2014 20:41:29 -0800 Subject: [PATCH 523/829] fix clang compilation Summary: as title --- util/options_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/util/options_test.cc b/util/options_test.cc index b2087608f..cd456a0ae 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -302,24 +302,24 @@ TEST(OptionsTest, GetOptionsFromStringTest) { ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14*1024); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL*1024UL); ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024); // Units (m) ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); - ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024*1024); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL); // Units (g) ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024LL*1024LL*1024LL); - ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024LL*1024LL*1024LL); + ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL); + ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL); // Units (t) ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); - ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024LL*1024LL*1024LL*1024LL); - ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024LL*1024LL*1024LL*1024LL); + ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024UL*1024UL*1024UL*1024UL); + ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL); } } // namespace rocksdb From 2cd1794e4f400eeb47240533f73d58654e72bcc6 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 19 Nov 2014 21:15:01 +0100 Subject: [PATCH 524/829] [RocksJava] Make cleanup - Clean Target - Remove JNI includes on clean - Remove target folder generated by Maven - Remove shared object - Remove jar --- java/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/java/Makefile b/java/Makefile index f0ab4c12e..c9e6ce80f 100644 --- a/java/Makefile +++ b/java/Makefile @@ -86,8 +86,13 @@ JAVA_TESTCLASSPATH = $(ROCKSDB_JAR):$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_M clean: -find . -name "*.class" -exec rm {} \; -find . 
-name "hs*.log" -exec rm {} \; + rm -rf include/* rm -rf javadoc/* rm -rf test-libs/ + rm -rf target + rm librocksdbjni* + rm rocksdbjni* + javadocs: mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org -exclude org.rocksdb.test From e7fcaa4d9297850780ff4023d44443c2beb48acc Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 19 Nov 2014 21:21:21 +0100 Subject: [PATCH 525/829] [RocksJava] JavaDoc is executed too often Previous to this commit too much targets got dependencies on javadocs target. Introduced one additional target "javalib" which resolves that situation. JavaDoc will now be generated once while executing a task with prefix "rocksdbjava". --- Makefile | 4 ++-- java/Makefile | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index fc80fa377..ad79d0589 100644 --- a/Makefile +++ b/Makefile @@ -594,7 +594,7 @@ libsnappy.a: rocksdbjavastatic: libz.a libbz2.a libsnappy.a OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j - cd java;$(MAKE) java; + cd java;$(MAKE) javalib; rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) @@ -615,7 +615,7 @@ rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 - cd java;$(MAKE) java; + cd java;$(MAKE) javalib; rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) diff --git a/java/Makefile b/java/Makefile index c9e6ce80f..e8dc5cb47 100644 --- a/java/Makefile +++ b/java/Makefile @@ -97,7 +97,9 @@ clean: javadocs: mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org -exclude org.rocksdb.test -java: javadocs resolve_test_deps +javalib: java javadocs + +java: resolve_test_deps javac org/rocksdb/util/*.java org/rocksdb/*.java javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java @cp ../HISTORY.md ./HISTORY-CPP.md From 91c8dcefc3de66956ce6d1c7d3d7cff97f4a8eb9 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 19 Nov 2014 21:31:46 +0100 Subject: [PATCH 526/829] [RocksJava] Strip library in publish Currently maven publishing uses the library with debug symbols. What leads to unnecessary big library sizes. Included strip to remove unnecessary stuff. 40M -> 2.7M --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index ad79d0589..676be348b 100644 --- a/Makefile +++ b/Makefile @@ -597,6 +597,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a cd java;$(MAKE) javalib; rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a + cd java;strip $(ROCKSDBJNILIB) cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) cd java/javadoc;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org From d0c5f28a5c0ec27744512695c90be69c74281950 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 10:49:32 -0800 Subject: [PATCH 527/829] Introduce GetThreadList API Summary: Add GetThreadList API, which allows developer to track the status of each process. Currently, calling GetThreadList will only get the list of background threads in RocksDB with their thread-id and thread-type (priority) set. Will add more support on this in the later diffs. ThreadStatus currently has the following properties: // An unique ID for the thread. const uint64_t thread_id; // The type of the thread, it could be ROCKSDB_HIGH_PRIORITY, // ROCKSDB_LOW_PRIORITY, and USER_THREAD const ThreadType thread_type; // The name of the DB instance where the thread is currently // involved with. It would be set to empty string if the thread // does not involve in any DB operation. const std::string db_name; // The name of the column family where the thread is currently // It would be set to empty string if the thread does not involve // in any column family. const std::string cf_name; // The event that the current thread is involved. // It would be set to empty string if the information about event // is not currently available. Test Plan: ./thread_list_test export ROCKSDB_TESTS=GetThreadList ./db_test Reviewers: rven, igor, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D25047 --- HISTORY.md | 3 + Makefile | 6 +- db/db_impl.cc | 114 ++++++++--- db/db_impl.h | 7 + db/db_impl_readonly.cc | 5 + db/db_test.cc | 56 ++++++ include/rocksdb/db.h | 7 + include/rocksdb/thread_status.h | 66 +++++++ util/env_posix.cc | 27 +++ util/hash.h | 3 +- util/thread_list_test.cc | 156 +++++++++++++++ util/thread_status_impl.cc | 198 ++++++++++++++++++++ util/thread_status_impl.h | 164 ++++++++++++++++ util/thread_status_impl_debug.cc | 26 +++ utilities/compacted_db/compacted_db_impl.cc | 2 + 15 files changed, 807 insertions(+), 33 deletions(-) create mode 100644 include/rocksdb/thread_status.h create mode 100644 util/thread_list_test.cc create mode 100644 util/thread_status_impl.cc create mode 100644 util/thread_status_impl.h create mode 100644 util/thread_status_impl_debug.cc diff --git a/HISTORY.md b/HISTORY.md index 08c26cc2a..78973adec 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,8 @@ # Rocksdb Change Log +### Unreleased Features +* Add rocksdb::GetThreadList(), which returns the current status of all rocksdb-related threads. 
+ ## 3.8.0 (11/14/2014) ### Public API changes diff --git a/Makefile b/Makefile index 676be348b..1862d2bf9 100644 --- a/Makefile +++ b/Makefile @@ -150,7 +150,8 @@ TESTS = \ flush_job_test \ wal_manager_test \ listener_test \ - compaction_job_test + compaction_job_test \ + thread_list_test TOOLS = \ sst_dump \ @@ -509,6 +510,9 @@ cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_impl.cc b/db/db_impl.cc index 587ca2068..c004ddfbb 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -75,6 +75,7 @@ #include "util/iostats_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" +#include "util/thread_status_impl.h" namespace rocksdb { @@ -241,6 +242,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) } DBImpl::~DBImpl() { + EraseThreadStatusDbInfo(); mutex_.Lock(); if (flush_on_destroy_) { @@ -2453,40 +2455,50 @@ std::vector DBImpl::MultiGet( Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { + Status s; *handle = nullptr; - MutexLock l(&mutex_); + { + MutexLock l(&mutex_); - if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != - nullptr) { - return Status::InvalidArgument("Column family already exists"); + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(cf_options.comparator->Name()); + + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + Options opt(db_options_, cf_options); + s = versions_->LogAndApply(nullptr, + MutableCFOptions(opt, ImmutableCFOptions(opt)), + &edit, &mutex_, db_directory_.get(), false, &cf_options); + if (s.ok()) { + single_column_family_mode_ = false; + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + delete InstallSuperVersion( + cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } } - VersionEdit edit; - edit.AddColumnFamily(column_family_name); - uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); - edit.SetColumnFamily(new_id); - edit.SetLogNumber(logfile_number_); - edit.SetComparatorName(cf_options.comparator->Name()); - - // LogAndApply will both write the creation in MANIFEST and 
create - // ColumnFamilyData object - Options opt(db_options_, cf_options); - Status s = versions_->LogAndApply(nullptr, - MutableCFOptions(opt, ImmutableCFOptions(opt)), - &edit, &mutex_, db_directory_.get(), false, &cf_options); + + // this is outside the mutex if (s.ok()) { - single_column_family_mode_ = false; - auto cfd = - versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); - assert(cfd != nullptr); - delete InstallSuperVersion(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); - *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); - Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, - "Created column family [%s] (ID %u)", - column_family_name.c_str(), (unsigned)cfd->GetID()); - } else { - Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, - "Creating column family [%s] FAILED -- %s", - column_family_name.c_str(), s.ToString().c_str()); + NewThreadStatusCfInfo( + reinterpret_cast(*handle)->cfd()); } return s; } @@ -2520,6 +2532,10 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { } if (s.ok()) { + // Note that here we erase the associated cf_info of the to-be-dropped + // cfd before its ref-count goes to zero to avoid having to erase cf_info + // later inside db_mutex. + EraseThreadStatusCfInfo(cfd); assert(cfd->IsDropped()); auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * @@ -3602,8 +3618,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, // their Listeners. To address this, we should have NotifyOnDatabaseOpen() // here which passes the created ColumnFamilyHandle to the Listeners // as the first event after DB::Open(). + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } } else { - for (auto h : *handles) { + for (auto* h : *handles) { delete h; } handles->clear(); @@ -3702,6 +3722,38 @@ Status DestroyDB(const std::string& dbname, const Options& options) { return result; } +#if ROCKSDB_USING_THREAD_STATUS +void DBImpl::NewThreadStatusCfInfo( + ColumnFamilyData* cfd) const { + ThreadStatusImpl::NewColumnFamilyInfo( + this, GetName(), cfd, cfd->GetName()); +} + +void DBImpl::EraseThreadStatusCfInfo( + ColumnFamilyData* cfd) const { + ThreadStatusImpl::EraseColumnFamilyInfo(cfd); +} + +void DBImpl::EraseThreadStatusDbInfo() const { + ThreadStatusImpl::EraseDatabaseInfo(this); +} + +Status GetThreadList(std::vector* thread_list) { + return thread_local_status.GetThreadList(thread_list); +} +#else +void DBImpl::NewThreadStatusCfInfo( + ColumnFamilyData* cfd) const { +} + +void DBImpl::EraseThreadStatusCfInfo( + ColumnFamilyData* cfd) const { +} + +void DBImpl::EraseThreadStatusDbInfo() const { +} +#endif // ROCKSDB_USING_THREAD_STATUS + // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger * log) { diff --git a/db/db_impl.h b/db/db_impl.h index cce238284..283796120 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -34,6 +34,7 @@ #include "util/stop_watch.h" #include "util/thread_local.h" #include "util/scoped_arena_iterator.h" +#include "util/hash.h" #include "db/internal_stats.h" #include "db/write_controller.h" #include "db/flush_scheduler.h" @@ -264,6 +265,12 @@ class DBImpl : public DB { void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number, const MutableCFOptions& mutable_cf_options); + void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void 
EraseThreadStatusDbInfo() const; + private: friend class DB; friend class InternalStats; diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index c98693d38..298944f62 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -9,6 +9,7 @@ #include "db/merge_context.h" #include "db/db_iter.h" #include "util/perf_context_imp.h" +#include "util/thread_status_impl.h" namespace rocksdb { @@ -152,6 +153,10 @@ Status DB::OpenForReadOnly( impl->mutex_.Unlock(); if (s.ok()) { *dbptr = impl; + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } } else { for (auto h : *handles) { delete h; diff --git a/db/db_test.cc b/db/db_test.cc index dbf3506b9..870fc7268 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -33,6 +33,7 @@ #include "rocksdb/table.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" @@ -48,6 +49,7 @@ #include "util/sync_point.h" #include "util/testutil.h" #include "util/mock_env.h" +#include "util/thread_status_impl.h" namespace rocksdb { @@ -8981,6 +8983,60 @@ TEST(DBTest, DynamicMemtableOptions) { sleeping_task_low3.WaitUntilDone(); } +#if ROCKSDB_USING_THREAD_STATUS +TEST(DBTest, GetThreadList) { + Options options; + options.env = env_; + + std::vector thread_list; + Status s = GetThreadList(&thread_list); + + for (int i = 0; i < 2; ++i) { + // repeat the test with differet number of high / low priority threads + const int kTestCount = 3; + const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; + const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3}; + for (int test = 0; test < kTestCount; ++test) { + // Change the number of threads in high / low priority pool. 
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); + env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); + // Wait to ensure the all threads has been registered + env_->SleepForMicroseconds(100000); + s = GetThreadList(&thread_list); + ASSERT_OK(s); + unsigned int thread_type_counts[ThreadStatus::ThreadType::TOTAL]; + memset(thread_type_counts, 0, sizeof(thread_type_counts)); + for (auto thread : thread_list) { + ASSERT_LT(thread.thread_type, ThreadStatus::ThreadType::TOTAL); + thread_type_counts[thread.thread_type]++; + } + // Verify the total number of threades + ASSERT_EQ( + thread_list.size(), + kHighPriCounts[test] + kLowPriCounts[test]); + // Verify the number of high-priority threads + ASSERT_EQ( + thread_type_counts[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], + kHighPriCounts[test]); + // Verify the number of low-priority threads + ASSERT_EQ( + thread_type_counts[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], + kLowPriCounts[test]); + } + if (i == 0) { + // repeat the test with multiple column families + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); + } + } + db_->DropColumnFamily(handles_[2]); + handles_.erase(handles_.begin() + 2); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); + Close(); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); +} +#endif // ROCKSDB_USING_THREAD_STATUS + TEST(DBTest, DynamicCompactionOptions) { // minimum write buffer size is enforced at 64KB const uint64_t k32KB = 1 << 15; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 52f157d82..ad3745c5e 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -22,6 +22,7 @@ #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" #include "rocksdb/listener.h" +#include "rocksdb/thread_status.h" namespace rocksdb { @@ -547,6 +548,12 @@ Status DestroyDB(const std::string& name, const Options& options); Status RepairDB(const std::string& dbname, const Options& options); #endif +#if ROCKSDB_USING_THREAD_STATUS +// Obtain the status of all rocksdb-related threads. +Status GetThreadList(std::vector* thread_list); +#endif + + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h new file mode 100644 index 000000000..bfd4a79fc --- /dev/null +++ b/include/rocksdb/thread_status.h @@ -0,0 +1,66 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +#ifndef ROCKSDB_USING_THREAD_STATUS +#define ROCKSDB_USING_THREAD_STATUS \ + !defined(ROCKSDB_LITE) && \ + !defined(NROCKSDB_THREAD_STATUS) && \ + !defined(OS_MACOSX) +#endif + +namespace rocksdb { + +// A structure that describes the current status of a thread. +// The status of active threads can be fetched using +// rocksdb::GetThreadList(). 
+struct ThreadStatus { + enum ThreadType { + ROCKSDB_HIGH_PRIORITY = 0x0, + ROCKSDB_LOW_PRIORITY = 0x1, + USER_THREAD = 0x2, + TOTAL = 0x3 + }; + +#if ROCKSDB_USING_THREAD_STATUS + ThreadStatus(const uint64_t _id, + const ThreadType _thread_type, + const std::string& _db_name, + const std::string& _cf_name, + const std::string& _event) : + thread_id(_id), thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + event(_event) {} + + // An unique ID for the thread. + const uint64_t thread_id; + + // The type of the thread, it could be ROCKSDB_HIGH_PRIORITY, + // ROCKSDB_LOW_PRIORITY, and USER_THREAD + const ThreadType thread_type; + + // The name of the DB instance where the thread is currently + // involved with. It would be set to empty string if the thread + // does not involve in any DB operation. + const std::string db_name; + + // The name of the column family where the thread is currently + // It would be set to empty string if the thread does not involve + // in any column family. + const std::string cf_name; + + // The event that the current thread is involved. + // It would be set to empty string if the information about event + // is not currently available. + const std::string event; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +} // namespace rocksdb diff --git a/util/env_posix.cc b/util/env_posix.cc index af1801607..a850ed130 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -42,6 +42,7 @@ #include "util/random.h" #include "util/iostats_context_imp.h" #include "util/rate_limiter.h" +#include "util/thread_status_impl.h" // Get nano time for mach systems #ifdef __MACH__ @@ -75,6 +76,10 @@ int rocksdb_kill_odds = 0; namespace rocksdb { +#if ROCKSDB_USING_THREAD_STATUS +extern ThreadStatusImpl thread_local_status; +#endif + namespace { // A wrapper for fadvise, if the platform doesn't support fadvise, @@ -1570,6 +1575,17 @@ class PosixEnv : public Env { return static_cast(thread_id) >= total_threads_limit_; } + // Return the thread priority. + // This would allow its member-thread to know its priority. + Env::Priority GetThreadPriority() { + return priority_; + } + + // Set the thread priority. + void SetThreadPriority(Env::Priority priority) { + priority_ = priority; + } + void BGThread(size_t thread_id) { bool low_io_priority = false; while (true) { @@ -1651,8 +1667,14 @@ class PosixEnv : public Env { BGThreadMetadata* meta = reinterpret_cast(arg); size_t thread_id = meta->thread_id_; ThreadPool* tp = meta->thread_pool_; + // for thread-status + thread_local_status.SetThreadType( + (tp->GetThreadPriority() == Env::Priority::HIGH ? + ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY : + ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY)); delete meta; tp->BGThread(thread_id); + thread_local_status.UnregisterThread(); return nullptr; } @@ -1753,6 +1775,7 @@ class PosixEnv : public Env { std::atomic_uint queue_len_; // Queue length. 
Used for stats reporting bool exit_all_threads_; bool low_io_priority_; + Env::Priority priority_; }; std::vector thread_pools_; @@ -1767,6 +1790,10 @@ PosixEnv::PosixEnv() : checkedDiskForMmap_(false), page_size_(getpagesize()), thread_pools_(Priority::TOTAL) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + } } void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) { diff --git a/util/hash.h b/util/hash.h index 6d9bebaf8..cab8d4677 100644 --- a/util/hash.h +++ b/util/hash.h @@ -24,4 +24,5 @@ inline uint32_t BloomHash(const Slice& key) { inline uint32_t GetSliceHash(const Slice& s) { return Hash(s.data(), s.size(), 397); } -} + +} // namespace rocksdb diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc new file mode 100644 index 000000000..b5ff60cc7 --- /dev/null +++ b/util/thread_list_test.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include + +#include "util/thread_status_impl.h" +#include "util/testharness.h" +#include "rocksdb/db.h" + +#if ROCKSDB_USING_THREAD_STATUS + +namespace rocksdb { + +class SleepingBackgroundTask { + public: + SleepingBackgroundTask(const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name) + : db_key_(db_key), db_name_(db_name), + cf_key_(cf_key), cf_name_(cf_name), + should_sleep_(true), sleeping_count_(0) { + ThreadStatusImpl::NewColumnFamilyInfo( + db_key_, db_name_, cf_key_, cf_name_); + } + + ~SleepingBackgroundTask() { + ThreadStatusImpl::EraseDatabaseInfo(db_key_); + } + + void DoSleep() { + thread_local_status.SetColumnFamilyInfoKey(cf_key_); + std::unique_lock l(mutex_); + sleeping_count_++; + while (should_sleep_) { + bg_cv_.wait(l); + } + sleeping_count_--; + bg_cv_.notify_all(); + thread_local_status.SetColumnFamilyInfoKey(0); + } + void WakeUp() { + std::unique_lock l(mutex_); + should_sleep_ = false; + bg_cv_.notify_all(); + } + void WaitUntilDone() { + std::unique_lock l(mutex_); + while (sleeping_count_ > 0) { + bg_cv_.wait(l); + } + } + + static void DoSleepTask(void* arg) { + reinterpret_cast(arg)->DoSleep(); + } + + private: + const void* db_key_; + const std::string db_name_; + const void* cf_key_; + const std::string cf_name_; + std::mutex mutex_; + std::condition_variable bg_cv_; + bool should_sleep_; + std::atomic sleeping_count_; +}; + +class ThreadListTest { + public: + ThreadListTest() { + } +}; + +TEST(ThreadListTest, SimpleColumnFamilyInfoTest) { + Env* env = Env::Default(); + const int kHighPriorityThreads = 3; + const int kLowPriorityThreads = 5; + const int kSleepingHighPriThreads = kHighPriorityThreads - 1; + const int kSleepingLowPriThreads = kLowPriorityThreads / 3; + env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); + env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); + + SleepingBackgroundTask sleeping_task( + reinterpret_cast(1234), "sleeping", + reinterpret_cast(5678), "pikachu"); + + for (int test = 0; test < kSleepingHighPriThreads; ++test) { + env->Schedule(&SleepingBackgroundTask::DoSleepTask, + &sleeping_task, Env::Priority::HIGH); + } + for (int test = 0; test < kSleepingLowPriThreads; ++test) { + 
env->Schedule(&SleepingBackgroundTask::DoSleepTask, + &sleeping_task, Env::Priority::LOW); + } + + // make sure everything is scheduled. + env->SleepForMicroseconds(10000); + + std::vector thread_list; + + // Verify the number of sleeping threads in each pool. + GetThreadList(&thread_list); + int sleeping_count[ThreadStatus::ThreadType::TOTAL] = {0}; + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "sleeping") { + sleeping_count[thread_status.thread_type]++; + } + } + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], + kSleepingHighPriThreads); + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], + kSleepingLowPriThreads); + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::USER_THREAD], 0); + + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + + // Verify none of the threads are sleeping + GetThreadList(&thread_list); + for (int i = 0; i < ThreadStatus::ThreadType::TOTAL; ++i) { + sleeping_count[i] = 0; + } + + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "sleeping") { + sleeping_count[thread_status.thread_type]++; + } + } + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], 0); + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], 0); + ASSERT_EQ( + sleeping_count[ThreadStatus::ThreadType::USER_THREAD], 0); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + +#else + +int main(int argc, char** argv) { + return 0; +} + +#endif // ROCKSDB_USING_THREAD_STATUS diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc new file mode 100644 index 000000000..99a6fdc99 --- /dev/null +++ b/util/thread_status_impl.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "port/likely.h" +#include "util/mutexlock.h" +#include "util/thread_status_impl.h" + +namespace rocksdb { + +ThreadStatusImpl thread_local_status; + +#if ROCKSDB_USING_THREAD_STATUS +__thread ThreadStatusData* ThreadStatusImpl::thread_status_data_ = nullptr; +std::mutex ThreadStatusImpl::thread_list_mutex_; +std::unordered_set ThreadStatusImpl::thread_data_set_; +std::unordered_map + ThreadStatusImpl::cf_info_map_; +std::unordered_map> + ThreadStatusImpl::db_key_map_; + +ThreadStatusImpl::~ThreadStatusImpl() { + std::lock_guard lck(thread_list_mutex_); + for (auto* thread_data : thread_data_set_) { + assert(thread_data->thread_type == ThreadStatus::ThreadType::USER_THREAD); + delete thread_data; + } + assert(thread_data_set_.size() == 0); + thread_data_set_.clear(); +} + +void ThreadStatusImpl::UnregisterThread() { + if (thread_status_data_ != nullptr) { + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.erase(thread_status_data_); + delete thread_status_data_; + } +} + +void ThreadStatusImpl::SetThreadType( + ThreadStatus::ThreadType ttype) { + auto* data = InitAndGet(); + data->thread_type.store(ttype, std::memory_order_relaxed); +} + +void ThreadStatusImpl::SetColumnFamilyInfoKey( + const void* cf_key) { + auto* data = InitAndGet(); + data->cf_key.store(cf_key, std::memory_order_relaxed); +} + +void ThreadStatusImpl::SetEventInfoPtr( + const ThreadEventInfo* event_info) { + auto* data = InitAndGet(); + data->event_info.store(event_info, std::memory_order_relaxed); +} + +Status ThreadStatusImpl::GetThreadList( + std::vector* thread_list) const { + thread_list->clear(); + std::vector> valid_list; + + std::lock_guard lck(thread_list_mutex_); + for (auto* thread_data : thread_data_set_) { + assert(thread_data); + auto thread_type = thread_data->thread_type.load( + std::memory_order_relaxed); + auto cf_key = thread_data->cf_key.load( + std::memory_order_relaxed); + auto iter = cf_info_map_.find( + thread_data->cf_key.load(std::memory_order_relaxed)); + assert(cf_key == 0 || iter != cf_info_map_.end()); + auto* cf_info = iter != cf_info_map_.end() ? + iter->second : nullptr; + auto* event_info = thread_data->event_info.load( + std::memory_order_relaxed); + const std::string* db_name = nullptr; + const std::string* cf_name = nullptr; + const std::string* event_name = nullptr; + if (cf_info != nullptr) { + db_name = &cf_info->db_name; + cf_name = &cf_info->cf_name; + // display lower-level info only when higher-level info is available. + if (event_info != nullptr) { + event_name = &event_info->event_name; + } + } + thread_list->emplace_back( + thread_data->thread_id, thread_type, + db_name ? *db_name : "", + cf_name ? *cf_name : "", + event_name ? 
*event_name : ""); + } + + return Status::OK(); +} + +ThreadStatusData* ThreadStatusImpl::InitAndGet() { + if (UNLIKELY(thread_status_data_ == nullptr)) { + thread_status_data_ = new ThreadStatusData(); + thread_status_data_->thread_id = reinterpret_cast( + thread_status_data_); + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.insert(thread_status_data_); + } + return thread_status_data_; +} + +void ThreadStatusImpl::NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name) { + std::lock_guard lck(thread_list_mutex_); + + cf_info_map_[cf_key] = new ConstantColumnFamilyInfo(db_key, db_name, cf_name); + db_key_map_[db_key].insert(cf_key); +} + +void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { + std::lock_guard lck(thread_list_mutex_); + auto cf_pair = cf_info_map_.find(cf_key); + assert(cf_pair != cf_info_map_.end()); + + auto* cf_info = cf_pair->second; + assert(cf_info); + + // Remove its entry from db_key_map_ by the following steps: + // 1. Obtain the entry in db_key_map_ whose set contains cf_key + // 2. Remove it from the set. + auto db_pair = db_key_map_.find(cf_info->db_key); + assert(db_pair != db_key_map_.end()); + int result __attribute__((unused)) = db_pair->second.erase(cf_key); + assert(result); + + delete cf_info; + result = cf_info_map_.erase(cf_key); + assert(result); +} + +void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { + std::lock_guard lck(thread_list_mutex_); + auto db_pair = db_key_map_.find(db_key); + if (UNLIKELY(db_pair == db_key_map_.end())) { + // In some occasional cases such as DB::Open fails, we won't + // register ColumnFamilyInfo for a db. + return; + } + + int result __attribute__((unused)) = 0; + for (auto cf_key : db_pair->second) { + auto cf_pair = cf_info_map_.find(cf_key); + assert(cf_pair != cf_info_map_.end()); + result = cf_info_map_.erase(cf_key); + delete cf_pair->second; + assert(result); + } + db_key_map_.erase(db_key); +} + +#else + +ThreadStatusImpl::~ThreadStatusImpl() { +} + +void ThreadStatusImpl::UnregisterThread() { +} + +void ThreadStatusImpl::SetThreadType( + ThreadStatus::ThreadType ttype) { +} + +void ThreadStatusImpl::SetColumnFamilyInfoKey( + const void* cf_key) { +} + +void ThreadStatusImpl::SetEventInfoPtr( + const ThreadEventInfo* event_info) { +} + +Status ThreadStatusImpl::GetThreadList( + std::vector* thread_list) const { + return Status::NotSupported( + "GetThreadList is not supported in the current running environment."); +} + +void ThreadStatusImpl::NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name) { +} + +void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { +} + +void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { +} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_impl.h b/util/thread_status_impl.h new file mode 100644 index 000000000..3d4987a34 --- /dev/null +++ b/util/thread_status_impl.h @@ -0,0 +1,164 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// The implementation of ThreadStatus. It is implemented via combination +// of macros and thread-local variables. 
+// +// Note that we make get and set access to ThreadStatusData lockless. +// As a result, ThreadStatusData as a whole is not atomic. However, +// we guarantee consistent ThreadStatusData all the time whenever +// user call GetThreadList(). This consistency guarantee is done +// by having the following constraint in the internal implementation +// of set and get order: +// +// 1. When reset any information in ThreadStatusData, always start from +// clearing up the lower-level information first. +// 2. When setting any information in ThreadStatusData, always start from +// setting the higher-level information. +// 3. When returning ThreadStatusData to the user, fields are fetched from +// higher-level to lower-level. In addition, where there's a nullptr +// in one field, then all fields that has lower-level than that field +// should be ignored. +// +// The high to low level information would be: +// thread_id > thread_type > db > cf > event > event_count > event_details +// +// This means user might not always get full information, but whenever +// returned by the GetThreadList() is guaranteed to be consistent. +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" +#include "port/port_posix.h" + +namespace rocksdb { + +class ColumnFamilyHandle; + +// The mutable version of ThreadStatus. It has a static set maintaining +// the set of current registered threades. +// +// Note that it is suggested to call the above macros. +struct ConstantColumnFamilyInfo { +#if ROCKSDB_USING_THREAD_STATUS + public: + ConstantColumnFamilyInfo( + const void* _db_key, + const std::string& _db_name, + const std::string& _cf_name) : + db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {} + const void* db_key; + const std::string db_name; + const std::string cf_name; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +struct ThreadEventInfo { +#if ROCKSDB_USING_THREAD_STATUS + public: + const std::string event_name; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +// the internal data-structure that is used to reflect the current +// status of a thread using a set of atomic pointers. +struct ThreadStatusData { +#if ROCKSDB_USING_THREAD_STATUS + explicit ThreadStatusData() : thread_id(0) { + thread_type.store(ThreadStatus::ThreadType::USER_THREAD); + cf_key.store(0); + event_info.store(nullptr); + } + uint64_t thread_id; + std::atomic thread_type; + std::atomic cf_key; + std::atomic event_info; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +class ThreadStatusImpl { + public: + ThreadStatusImpl() {} + + // Releases all ThreadStatusData of all active threads. + ~ThreadStatusImpl(); + + void UnregisterThread(); + + // Set the thread type of the current thread. + void SetThreadType(ThreadStatus::ThreadType ttype); + + // Update the column-family info of the current thread by setting + // its thread-local pointer of ThreadEventInfo to the correct entry. + void SetColumnFamilyInfoKey(const void* cf_key); + + // Update the event info of the current thread by setting + // its thread-local pointer of ThreadEventInfo to the correct entry. + void SetEventInfoPtr(const ThreadEventInfo* event_info); + + Status GetThreadList( + std::vector* thread_list) const; + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. 
+ static void NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + static void EraseDatabaseInfo(const void* db_key); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + static void EraseColumnFamilyInfo(const void* cf_key); + + // Verifies whether the input ColumnFamilyHandles matches + // the information stored in the current cf_info_map. + static void TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles); + + protected: + // The thread-local variable for storing thread status. + static __thread ThreadStatusData* thread_status_data_; + +#if ROCKSDB_USING_THREAD_STATUS + + // Obtain the pointer to the thread status data. It also performs + // initialization when necessary. + ThreadStatusData* InitAndGet(); + + // The mutex that protects cf_info_map and db_key_map. + static std::mutex thread_list_mutex_; + + // The current status data of all active threads. + static std::unordered_set thread_data_set_; + + // A global map that keeps the column family information. It is stored + // globally instead of inside DB is to avoid the situation where DB is + // closing while GetThreadList function already get the pointer to its + // CopnstantColumnFamilyInfo. + static std::unordered_map< + const void*, ConstantColumnFamilyInfo*> cf_info_map_; + + // A db_key to cf_key map that allows erasing elements in cf_info_map + // associated to the same db_key faster. + static std::unordered_map< + const void*, std::unordered_set> db_key_map_; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + + +extern ThreadStatusImpl thread_local_status; +} // namespace rocksdb diff --git a/util/thread_status_impl_debug.cc b/util/thread_status_impl_debug.cc new file mode 100644 index 000000000..1def2b143 --- /dev/null +++ b/util/thread_status_impl_debug.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include + +#include "util/thread_status_impl.h" +#include "db/column_family.h" +#if ROCKSDB_USING_THREAD_STATUS + +namespace rocksdb { +void ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles) { + std::unique_lock lock(thread_list_mutex_); + assert(cf_info_map_.size() == handles.size()); + for (auto* handle : handles) { + auto* cfd = reinterpret_cast(handle)->cfd(); + auto iter = cf_info_map_.find(cfd); + assert(iter != cf_info_map_.end()); + assert(iter->second); + assert(iter->second->cf_name == cfd->GetName()); + } +} +} // namespace rocksdb +#endif // ROCKSDB_USING_THREAD_STATUS diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 335dae77b..fd35698b4 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -8,6 +8,7 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" +#include "util/thread_status_impl.h" namespace rocksdb { @@ -102,6 +103,7 @@ Status CompactedDBImpl::Init(const Options& options) { if (!s.ok()) { return s; } + NewThreadStatusCfInfo(cfd_); version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); auto* vstorage = version_->storage_info(); From 7165d18869c46ee4aa7d2f076d788cfbfea86fb4 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 11:00:21 -0800 Subject: [PATCH 528/829] Fix clang compile error --- util/thread_status_impl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index 99a6fdc99..f8368a830 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -129,7 +129,7 @@ void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { // 2. Remove it from the set. 
auto db_pair = db_key_map_.find(cf_info->db_key); assert(db_pair != db_key_map_.end()); - int result __attribute__((unused)) = db_pair->second.erase(cf_key); + size_t result __attribute__((unused)) = db_pair->second.erase(cf_key); assert(result); delete cf_info; @@ -146,7 +146,7 @@ void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { return; } - int result __attribute__((unused)) = 0; + size_t result __attribute__((unused)) = 0; for (auto cf_key : db_pair->second) { auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); From a564be715da5651ac442e7922a1d5fe3515fc054 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 11:45:30 -0800 Subject: [PATCH 529/829] Fix asan error in thread_status_impl.cc --- util/thread_status_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index f8368a830..24be214f8 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -150,8 +150,8 @@ void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { for (auto cf_key : db_pair->second) { auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); - result = cf_info_map_.erase(cf_key); delete cf_pair->second; + result = cf_info_map_.erase(cf_key); assert(result); } db_key_map_.erase(db_key); From fbc42a09333dc7b028e39f565cc7d02aa6f3d313 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 13:44:37 -0800 Subject: [PATCH 530/829] Fixed -Werror=unused-but-set-variable in thread_status_impl Summary: Fixed -Werror=unused-but-set-variable in thread_status_impl --- util/thread_status_impl.cc | 3 +-- util/thread_status_impl_debug.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index 24be214f8..d07a60463 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -68,8 +68,7 @@ Status ThreadStatusImpl::GetThreadList( std::memory_order_relaxed); auto cf_key = thread_data->cf_key.load( std::memory_order_relaxed); - auto iter = cf_info_map_.find( - thread_data->cf_key.load(std::memory_order_relaxed)); + auto iter = cf_info_map_.find(cf_key); assert(cf_key == 0 || iter != cf_info_map_.end()); auto* cf_info = iter != cf_info_map_.end() ? iter->second : nullptr; diff --git a/util/thread_status_impl_debug.cc b/util/thread_status_impl_debug.cc index 1def2b143..5717e40c3 100644 --- a/util/thread_status_impl_debug.cc +++ b/util/thread_status_impl_debug.cc @@ -16,7 +16,7 @@ void ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap( assert(cf_info_map_.size() == handles.size()); for (auto* handle : handles) { auto* cfd = reinterpret_cast(handle)->cfd(); - auto iter = cf_info_map_.find(cfd); + auto iter __attribute__((unused)) = cf_info_map_.find(cfd); assert(iter != cf_info_map_.end()); assert(iter->second); assert(iter->second->cf_name == cfd->GetName()); From 9972f969ee03eeaa5a413cda1f74e374fcbc70cc Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 23:03:32 +0100 Subject: [PATCH 531/829] [RocksJava] BackupableDBOptions alginment + 3.8 - Updated the BackupableDBOptions functionality to 3.8.0. - Aligned Options implementation with remaining source code. - Invented test-case. 
--- java/Makefile | 3 +- java/org/rocksdb/BackupableDBOptions.java | 239 +++++++++++++-- .../rocksdb/test/BackupableDBOptionsTest.java | 284 ++++++++++++++++++ java/org/rocksdb/test/BackupableDBTest.java | 3 +- java/rocksjni/backupablejni.cc | 169 ++++++++++- 5 files changed, 654 insertions(+), 44 deletions(-) create mode 100644 java/org/rocksdb/test/BackupableDBOptionsTest.java diff --git a/java/Makefile b/java/Makefile index e8dc5cb47..0a4e2ba16 100644 --- a/java/Makefile +++ b/java/Makefile @@ -47,7 +47,8 @@ ifeq ($(PLATFORM), OS_MACOSX) ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar endif -JAVA_TESTS = org.rocksdb.test.BackupableDBTest\ +JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ + org.rocksdb.test.BackupableDBTest\ org.rocksdb.test.BlockBasedTableConfigTest\ org.rocksdb.test.ColumnFamilyOptionsTest\ org.rocksdb.test.ColumnFamilyTest\ diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java index 07751a64d..18e2dfa11 100644 --- a/java/org/rocksdb/BackupableDBOptions.java +++ b/java/org/rocksdb/BackupableDBOptions.java @@ -6,62 +6,234 @@ package org.rocksdb; /** - * BackupableDBOptions to control the behavior of a backupable database. + *

BackupableDBOptions to control the behavior of a backupable database. * It will be used during the creation of a {@link org.rocksdb.BackupableDB}. - * - * Note that dispose() must be called before an Options instance - * become out-of-scope to release the allocated memory in c++. + *
+ * Note that dispose() must be called before an Options instance + * become out-of-scope to release the allocated memory in c++.

            * * @see org.rocksdb.BackupableDB */ public class BackupableDBOptions extends RocksObject { /** - * BackupableDBOptions constructor + *

            BackupableDBOptions constructor.

            * * @param path Where to keep the backup files. Has to be different than db name. - * Best to set this to {@code db name_ + "/backups"} + * Best to set this to {@code db name_ + "/backups"} + */ + public BackupableDBOptions(String path) { + super(); + assert(path != null); + newBackupableDBOptions(path); + } + + /** + *
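A minimal configuration sketch for the new single-argument constructor plus the fluent setters added below (illustrative only; the backup directory is a placeholder and the call to BackupableDB.open(...) is elided):

    BackupableDBOptions bopt = new BackupableDBOptions("/tmp/rocksdb_backups")
        .setShareTableFiles(true)    // share SST files across backups (incremental backups)
        .setSync(true)               // fsync backup files so they survive a machine crash
        .setDestroyOldData(false)    // keep whatever backups already exist
        .setBackupLogFiles(true);    // include log files in the backup
    // ... pass bopt to BackupableDB.open(...) together with the db Options and path ...
    bopt.dispose();                  // release the native handle when done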

            Returns the path to the BackupableDB directory.

            + * + * @return the path to the BackupableDB directory. + */ + public String backupDir() { + assert(isInitialized()); + return backupDir(nativeHandle_); + } + + /** + *

            Share table files between backups.

            + * * @param shareTableFiles If {@code share_table_files == true}, backup will assume * that table files with same name have the same contents. This enables incremental * backups and avoids unnecessary data copies. If {@code share_table_files == false}, * each backup will be on its own and will not share any data with other backups. - * Default: true + * + *

            Default: true

            + * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setShareTableFiles(boolean shareTableFiles) { + assert(isInitialized()); + setShareTableFiles(nativeHandle_, shareTableFiles); + return this; + } + + /** + *

Returns whether table files are shared between backups.

            + * + * @return boolean value indicating if SST files will be shared between + * backups. + */ + public boolean shareTableFiles() { + assert(isInitialized()); + return shareTableFiles(nativeHandle_); + } + + /** + *

            Set synchronous backups.

            + * * @param sync If {@code sync == true}, we can guarantee you'll get consistent backup * even on a machine crash/reboot. Backup process is slower with sync enabled. * If {@code sync == false}, we don't guarantee anything on machine reboot. * However,chances are some of the backups are consistent. - * Default: true + * + *

            Default: true

            + * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setSync(boolean sync) { + assert(isInitialized()); + setSync(nativeHandle_, sync); + return this; + } + + /** + *

Returns whether synchronous backups are enabled.

            + * + * @return boolean value if synchronous backups are configured. + */ + public boolean sync() { + assert(isInitialized()); + return sync(nativeHandle_); + } + + /** + *

            Set if old data will be destroyed.

            + * * @param destroyOldData If true, it will delete whatever backups there are already. - * Default: false + * + *

            Default: false

            + * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setDestroyOldData(boolean destroyOldData) { + assert(isInitialized()); + setDestroyOldData(nativeHandle_, destroyOldData); + return this; + } + + /** + *

Returns whether old data will be destroyed when performing new backups.

            + * + * @return boolean value indicating if old data will be destroyed. + */ + public boolean destroyOldData() { + assert(isInitialized()); + return destroyOldData(nativeHandle_); + } + + /** + *

            Set if log files shall be persisted.

            + * * @param backupLogFiles If false, we won't backup log files. This option can be * useful for backing up in-memory databases where log file are persisted,but table * files are in memory. - * Default: true - * @param backupRateLimit Max bytes that can be transferred in a second during backup. - * If 0 or negative, then go as fast as you can. Default: 0 - * @param restoreRateLimit Max bytes that can be transferred in a second during restore. - * If 0 or negative, then go as fast as you can. Default: 0 + * + *

            Default: true

            + * + * @return instance of current BackupableDBOptions. */ - public BackupableDBOptions(String path, boolean shareTableFiles, boolean sync, - boolean destroyOldData, boolean backupLogFiles, long backupRateLimit, - long restoreRateLimit) { - super(); + public BackupableDBOptions setBackupLogFiles(boolean backupLogFiles) { + assert(isInitialized()); + setBackupLogFiles(nativeHandle_, backupLogFiles); + return this; + } + + /** + *

Returns whether log files will be persisted.

            + * + * @return boolean value indicating if log files will be persisted. + */ + public boolean backupLogFiles() { + assert(isInitialized()); + return backupLogFiles(nativeHandle_); + } + /** + *

            Set backup rate limit.

            + * + * @param backupRateLimit Max bytes that can be transferred in a second during backup. + * If 0 or negative, then go as fast as you can. + * + *

            Default: 0

            + * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setBackupRateLimit(long backupRateLimit) { + assert(isInitialized()); backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit; + setBackupRateLimit(nativeHandle_, backupRateLimit); + return this; + } + + /** + *
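Both rate-limit setters clamp non-positive values to 0, and 0 means unthrottled, as the matching getters below confirm; a small sketch of that behavior (values are arbitrary):

    BackupableDBOptions bopt = new BackupableDBOptions("/tmp/rocksdb_backups");
    bopt.setBackupRateLimit(10 * 1024 * 1024);   // throttle backups to roughly 10 MB/s
    bopt.setBackupRateLimit(-1);                 // <= 0 is clamped by the setter...
    assert bopt.backupRateLimit() == 0;          // ...and reads back as 0, i.e. no limit
    bopt.dispose();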

Returns the backup rate limit which describes the max bytes that can be transferred in a + * second during backup.

            + * + * @return numerical value describing the backup transfer limit in bytes per second. + */ + public long backupRateLimit() { + assert(isInitialized()); + return backupRateLimit(nativeHandle_); + } + + /** + *

            Set restore rate limit.

            + * + * @param restoreRateLimit Max bytes that can be transferred in a second during restore. + * If 0 or negative, then go as fast as you can. + * + *

            Default: 0

            + * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setRestoreRateLimit(long restoreRateLimit) { + assert(isInitialized()); restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit; + setRestoreRateLimit(nativeHandle_, restoreRateLimit); + return this; + } - newBackupableDBOptions(path, shareTableFiles, sync, destroyOldData, - backupLogFiles, backupRateLimit, restoreRateLimit); + /** + *

Returns the restore rate limit which describes the max bytes that can be transferred in a + * second during restore.

            + * + * @return numerical value describing the restore transfer limit in bytes per second. + */ + public long restoreRateLimit() { + assert(isInitialized()); + return restoreRateLimit(nativeHandle_); } /** - * Returns the path to the BackupableDB directory. + *

Only used if share_table_files is set to true. If true, it will be assumed that + * backups can come from different databases, hence an SST file is not uniquely + * identified by its name, but by the triple (file name, crc32, file length).

            * - * @return the path to the BackupableDB directory. + * @param shareFilesWithChecksum boolean value indicating if SST files are stored + * using the triple (file name, crc32, file length) and not its name. + * + *

Note: this is an experimental option, and you'll need to set it manually; + * turn it on only if you know what you're doing.

            + * + *

            Default: false

            + * + * @return instance of current BackupableDBOptions. */ - public String backupDir() { + public BackupableDBOptions setShareFilesWithChecksum( + boolean shareFilesWithChecksum) { assert(isInitialized()); - return backupDir(nativeHandle_); + setShareFilesWithChecksum(nativeHandle_, shareFilesWithChecksum); + return this; + } + + /** + *

Returns whether share files with checksum is active.

            + * + * @return boolean value indicating if share files with checksum + * is active. + */ + public boolean shareFilesWithChecksum() { + assert(isInitialized()); + return shareFilesWithChecksum(nativeHandle_); } /** @@ -69,13 +241,24 @@ public class BackupableDBOptions extends RocksObject { * in the c++ side. */ @Override protected void disposeInternal() { - assert(isInitialized()); disposeInternal(nativeHandle_); } - private native void newBackupableDBOptions(String path, - boolean shareTableFiles, boolean sync, boolean destroyOldData, - boolean backupLogFiles, long backupRateLimit, long restoreRateLimit); + private native void newBackupableDBOptions(String path); private native String backupDir(long handle); + private native void setShareTableFiles(long handle, boolean flag); + private native boolean shareTableFiles(long handle); + private native void setSync(long handle, boolean flag); + private native boolean sync(long handle); + private native void setDestroyOldData(long handle, boolean flag); + private native boolean destroyOldData(long handle); + private native void setBackupLogFiles(long handle, boolean flag); + private native boolean backupLogFiles(long handle); + private native void setBackupRateLimit(long handle, long rateLimit); + private native long backupRateLimit(long handle); + private native void setRestoreRateLimit(long handle, long rateLimit); + private native long restoreRateLimit(long handle); + private native void setShareFilesWithChecksum(long handle, boolean flag); + private native boolean shareFilesWithChecksum(long handle); private native void disposeInternal(long handle); } diff --git a/java/org/rocksdb/test/BackupableDBOptionsTest.java b/java/org/rocksdb/test/BackupableDBOptionsTest.java new file mode 100644 index 000000000..6be056986 --- /dev/null +++ b/java/org/rocksdb/test/BackupableDBOptionsTest.java @@ -0,0 +1,284 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.rocksdb.BackupableDBOptions; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class BackupableDBOptionsTest { + + private final static String ARBITRARY_PATH = "/path"; + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public ExpectedException exception = ExpectedException.none(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void backupDir() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + assertThat(backupableDBOptions.backupDir()). + isEqualTo(ARBITRARY_PATH); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void shareTableFiles() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setShareTableFiles(value); + assertThat(backupableDBOptions.shareTableFiles()). 
+ isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void sync() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setSync(value); + assertThat(backupableDBOptions.sync()).isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void destroyOldData() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setDestroyOldData(value); + assertThat(backupableDBOptions.destroyOldData()). + isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void backupLogFiles() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setBackupLogFiles(value); + assertThat(backupableDBOptions.backupLogFiles()). + isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void backupRateLimit() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + long value = Math.abs(rand.nextLong()); + backupableDBOptions.setBackupRateLimit(value); + assertThat(backupableDBOptions.backupRateLimit()). + isEqualTo(value); + // negative will be mapped to 0 + backupableDBOptions.setBackupRateLimit(-1); + assertThat(backupableDBOptions.backupRateLimit()). + isEqualTo(0); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void restoreRateLimit() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + long value = Math.abs(rand.nextLong()); + backupableDBOptions.setRestoreRateLimit(value); + assertThat(backupableDBOptions.restoreRateLimit()). + isEqualTo(value); + // negative will be mapped to 0 + backupableDBOptions.setRestoreRateLimit(-1); + assertThat(backupableDBOptions.restoreRateLimit()). + isEqualTo(0); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void shareFilesWithChecksum() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setShareFilesWithChecksum(value); + assertThat(backupableDBOptions.shareFilesWithChecksum()). 
+ isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void failBackupDirIsNull() { + exception.expect(AssertionError.class); + new BackupableDBOptions(null); + } + + @Test + public void failBackupDirIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupDir(); + } + + @Test + public void failSetShareTableFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setShareTableFiles(true); + } + + @Test + public void failShareTableFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.shareTableFiles(); + } + + @Test + public void failSetSyncIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setSync(true); + } + + @Test + public void failSyncIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.sync(); + } + + @Test + public void failSetDestroyOldDataIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setDestroyOldData(true); + } + + @Test + public void failDestroyOldDataIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.destroyOldData(); + } + + @Test + public void failSetBackupLogFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setBackupLogFiles(true); + } + + @Test + public void failBackupLogFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupLogFiles(); + } + + @Test + public void failSetBackupRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setBackupRateLimit(1); + } + + @Test + public void failBackupRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupRateLimit(); + } + + @Test + public void failSetRestoreRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setRestoreRateLimit(1); + } + + @Test + public void failRestoreRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.restoreRateLimit(); + } + + @Test + public void failSetShareFilesWithChecksumIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setShareFilesWithChecksum(true); + } + + @Test + public void failShareFilesWithChecksumIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.shareFilesWithChecksum(); + } + + private BackupableDBOptions setupUninitializedBackupableDBOptions( + ExpectedException exception) { + BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH); + backupableDBOptions.dispose(); + exception.expect(AssertionError.class); + return backupableDBOptions; + } +} diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 55a707687..f24163bd5 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -39,8 +39,7 @@ public class BackupableDBTest { opt.setCreateIfMissing(true); bopt = new BackupableDBOptions( - 
backupFolder.getRoot().getAbsolutePath(), false, - true, false, true, 0, 0); + backupFolder.getRoot().getAbsolutePath()); assertThat(bopt.backupDir()).isEqualTo( backupFolder.getRoot().getAbsolutePath()); diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 609cbd73e..4db9f5682 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -101,20 +101,10 @@ jobject Java_org_rocksdb_BackupableDB_getBackupInfo( * Signature: (Ljava/lang/String;)V */ void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( - JNIEnv* env, jobject jobj, jstring jpath, jboolean jshare_table_files, - jboolean jsync, jboolean jdestroy_old_data, jboolean jbackup_log_files, - jlong jbackup_rate_limit, jlong jrestore_rate_limit) { - jbackup_rate_limit = (jbackup_rate_limit <= 0) ? 0 : jbackup_rate_limit; - jrestore_rate_limit = (jrestore_rate_limit <= 0) ? 0 : jrestore_rate_limit; - + JNIEnv* env, jobject jobj, jstring jpath) { const char* cpath = env->GetStringUTFChars(jpath, 0); - - auto bopt = new rocksdb::BackupableDBOptions(cpath, nullptr, - jshare_table_files, nullptr, jsync, jdestroy_old_data, jbackup_log_files, - jbackup_rate_limit, jrestore_rate_limit); - + auto bopt = new rocksdb::BackupableDBOptions(cpath); env->ReleaseStringUTFChars(jpath, cpath); - rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); } @@ -129,6 +119,160 @@ jstring Java_org_rocksdb_BackupableDBOptions_backupDir( return env->NewStringUTF(bopt->backup_dir.c_str()); } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setShareTableFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->share_table_files = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: shareTableFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->share_table_files; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setSync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->sync = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_sync( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->sync; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setDestroyOldData + * Signature: (JZ)V + */ + void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->destroy_old_data = flag; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: destroyOldData + * Signature: (J)Z + */ + jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->destroy_old_data; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupLogFiles + * Signature: (JZ)V + */ + void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_log_files = flag; + } + + /* + * 
Class: org_rocksdb_BackupableDBOptions + * Method: backupLogFiles + * Signature: (J)Z + */ + jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_log_files; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupRateLimit + * Signature: (JJ)V + */ + void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_rate_limit = jbackup_rate_limit; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: backupRateLimit + * Signature: (J)J + */ + jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_rate_limit; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: setRestoreRateLimit + * Signature: (JJ)V + */ + void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrestore_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->restore_rate_limit = jrestore_rate_limit; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: restoreRateLimit + * Signature: (J)J + */ + jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->restore_rate_limit; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: setShareFilesWithChecksum + * Signature: (JZ)V + */ + void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->share_files_with_checksum = flag; + } + + /* + * Class: org_rocksdb_BackupableDBOptions + * Method: shareFilesWithChecksum + * Signature: (J)Z + */ + jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->share_files_with_checksum; + } + /* * Class: org_rocksdb_BackupableDBOptions * Method: disposeInternal @@ -139,6 +283,5 @@ void Java_org_rocksdb_BackupableDBOptions_disposeInternal( auto bopt = reinterpret_cast(jhandle); assert(bopt); delete bopt; - rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr); } From 24fdc47416c56f460a4cd114d6af445f578c6c4f Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 23:42:03 +0100 Subject: [PATCH 532/829] [RocksJava] Backupable/Restorable DB update 3.8.0 - GarbageCollectMethod() available. - GetCorruptedBackups() available. --- java/org/rocksdb/BackupableDB.java | 26 ++++++++++++ java/org/rocksdb/RestoreBackupableDB.java | 28 +++++++++++- java/org/rocksdb/test/BackupableDBTest.java | 6 +++ java/rocksjni/backupablejni.cc | 46 ++++++++++++++++++++ java/rocksjni/restorejni.cc | 47 +++++++++++++++++++++ 5 files changed, 152 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 2644fec8f..e73df52e0 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -92,6 +92,29 @@ public class BackupableDB extends RocksDB { return getBackupInfo(nativeHandle_); } + /** + *

            Returns a list of corrupted backup ids. If there + * is no corrupted backup the method will return an + * empty list.

            + * + * @return list of backup ids as Integer. + */ + public List getCorruptedBackups() { + return getCorruptedBackups(nativeHandle_); + } + + /** + *

            Will delete all the files we don't need anymore. It will + * do the full scan of the files/ directory and delete all the + * files that are not referenced.

            + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void garbageCollect() throws RocksDBException { + garbageCollect(nativeHandle_); + } + /** * Close the BackupableDB instance and release resource. * @@ -126,4 +149,7 @@ public class BackupableDB extends RocksDB { private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; protected native List getBackupInfo(long handle); + private native List getCorruptedBackups(long handle); + private native void garbageCollect(long handle) + throws RocksDBException; } diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index ffbc2e011..9c41f4345 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -101,6 +101,29 @@ public class RestoreBackupableDB extends RocksObject { return getBackupInfo(nativeHandle_); } + /** + *

            Returns a list of corrupted backup ids. If there + * is no corrupted backup the method will return an + * empty list.

            + * + * @return list of backup ids as Integer. + */ + public List getCorruptedBackups() { + return getCorruptedBackups(nativeHandle_); + } + + /** + *

            Will delete all the files we don't need anymore. It will + * do the full scan of the files/ directory and delete all the + * files that are not referenced.

            + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void garbageCollect() throws RocksDBException { + garbageCollect(nativeHandle_); + } + /** * Release the memory allocated for the current instance * in the c++ side. @@ -121,6 +144,9 @@ public class RestoreBackupableDB extends RocksObject { throws RocksDBException; private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; - protected native List getBackupInfo(long handle); + private native List getBackupInfo(long handle); + private native List getCorruptedBackups(long handle); + private native void garbageCollect(long handle) + throws RocksDBException; private native void dispose(long nativeHandle); } diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index f24163bd5..0a0b12849 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -56,6 +56,9 @@ public class BackupableDBTest { isEqualTo(0); bdb.createNewBackup(true); + assertThat(bdb.getCorruptedBackups().size()). + isEqualTo(0); + bdb.garbageCollect(); backupInfos = bdb.getBackupInfos(); assertThat(backupInfos.size()). isEqualTo(1); @@ -102,6 +105,9 @@ public class BackupableDBTest { ropt); // do nothing because there is only one backup rdb.purgeOldBackups(1); + rdb.garbageCollect(); + assertThat(rdb.getCorruptedBackups().size()). + isEqualTo(0); restoreInfos = rdb.getBackupInfos(); assertThat(restoreInfos.size()). isEqualTo(1); diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 4db9f5682..639c73eba 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -92,6 +92,52 @@ jobject Java_org_rocksdb_BackupableDB_getBackupInfo( backup_infos); } +/* + * Class: org_rocksdb_BackupableDB + * Method: getCorruptedBackups + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_BackupableDB_getCorruptedBackups( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_ids; + reinterpret_cast(jhandle)-> + GetCorruptedBackups(&backup_ids); + + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jbackup_id_handle_list = env->NewObject(jclazz, mid, + backup_ids.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != backup_ids.size(); i++) { + // convert BackupID to Integer + jclass jIntClazz = env->FindClass("java/lang/Integer"); + jmethodID midLong = env->GetMethodID(jIntClazz, "", "(I)V"); + jobject obj = env->NewObject(jIntClazz, midLong, + (backup_ids[i])); + // add Integer to List + env->CallBooleanMethod(jbackup_id_handle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + return jbackup_id_handle_list; +} + +/* + * Class: org_rocksdb_BackupableDB + * Method: garbageCollect + * Signature: (J)V + */ +void Java_org_rocksdb_BackupableDB_garbageCollect(JNIEnv* env, + jobject jobj, jlong jhandle) { + auto db = reinterpret_cast(jhandle); + rocksdb::Status s = db->GarbageCollect(); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /////////////////////////////////////////////////////////////////////////// // BackupDBOptions diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 4fe813d09..99ffc4256 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -145,6 +145,53 @@ jobject 
Java_org_rocksdb_RestoreBackupableDB_getBackupInfo( backup_infos); } +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: getCorruptedBackups + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_ids; + reinterpret_cast(jhandle)-> + GetCorruptedBackups(&backup_ids); + + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jbackup_id_handle_list = env->NewObject(jclazz, mid, + backup_ids.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != backup_ids.size(); i++) { + // convert BackupID to Integer + jclass jIntClazz = env->FindClass("java/lang/Integer"); + jmethodID midLong = env->GetMethodID(jIntClazz, "", "(I)V"); + jobject obj = env->NewObject(jIntClazz, midLong, + (backup_ids[i])); + // add Integer to List + env->CallBooleanMethod(jbackup_id_handle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + return jbackup_id_handle_list; +} + +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: garbageCollect + * Signature: (J)V + */ +void Java_org_rocksdb_RestoreBackupableDB_garbageCollect( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto db = reinterpret_cast( + jhandle); + rocksdb::Status s = db->GarbageCollect(); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_RestoreBackupableDB * Method: dispose From fa703efb28c456af140ab213fc5b6dc9001df58c Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 16 Nov 2014 13:56:18 +0100 Subject: [PATCH 533/829] [RocksJava] Improved BackupableDBTest - Splitted methods to meaningful tests - Added tests for additional functionality - Covered missing parts --- java/org/rocksdb/test/BackupableDBTest.java | 452 +++++++++++++++----- 1 file changed, 348 insertions(+), 104 deletions(-) diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 0a0b12849..0b5334607 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -28,143 +28,387 @@ public class BackupableDBTest { public TemporaryFolder backupFolder = new TemporaryFolder(); @Test - public void backupableDb() throws RocksDBException { + public void backupDb() throws RocksDBException { Options opt = null; BackupableDBOptions bopt = null; BackupableDB bdb = null; - RestoreOptions ropt = null; - RestoreBackupableDB rdb = null; try { - opt = new Options(); - opt.setCreateIfMissing(true); + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 2); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + @Test + public void deleteBackup() throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + try { + opt = new Options().setCreateIfMissing(true); bopt = new BackupableDBOptions( backupFolder.getRoot().getAbsolutePath()); assertThat(bopt.backupDir()).isEqualTo( backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 2); + // Delete the first backup + bdb.deleteBackup(backupInfo.get(0).backupId()); + List newBackupInfo = + verifyNumberOfValidBackups(bdb, 1); + // The second backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(1).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } - List backupInfos; - List restoreInfos; + @Test + public void deleteBackupWithRestoreBackupableDB() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 2); + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + // Delete the first backup + rdb.deleteBackup(backupInfo.get(0).backupId()); + // Fetch backup info using RestoreBackupableDB + List newBackupInfo = verifyNumberOfValidBackups(rdb, 1); + // The second backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(1).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + @Test + public void purgeOldBackups() throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
bdb = BackupableDB.open(opt, bopt, dbFolder.getRoot().getAbsolutePath()); - bdb.put("abc".getBytes(), "def".getBytes()); - bdb.put("ghi".getBytes(), "jkl".getBytes()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 4); + // Delete everything except the latest backup + bdb.purgeOldBackups(1); + List newBackupInfo = + verifyNumberOfValidBackups(bdb, 1); + // The latest backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(3).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(0); + @Test + public void purgeOldBackupsWithRestoreBackupableDb() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 4); + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + // the same number of backups must + // exist using RestoreBackupableDB. + verifyNumberOfValidBackups(rdb, 4); + rdb.purgeOldBackups(1); + verifyNumberOfValidBackups(rdb, 1); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + @Test + public void restoreLatestBackup() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 1); + bdb.put("key1".getBytes(), "valueV2".getBytes()); + bdb.put("key2".getBytes(), "valueV2".getBytes()); bdb.createNewBackup(true); - assertThat(bdb.getCorruptedBackups().size()). - isEqualTo(0); - bdb.garbageCollect(); - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(1); - - // Retrieving backup infos twice shall not - // lead to different results - List tmpBackupInfo = bdb.getBackupInfos(); - assertThat(tmpBackupInfo.get(0).backupId()). - isEqualTo(backupInfos.get(0).backupId()); - assertThat(tmpBackupInfo.get(0).timestamp()). - isEqualTo(backupInfos.get(0).timestamp()); - assertThat(tmpBackupInfo.get(0).size()). - isEqualTo(backupInfos.get(0).size()); - assertThat(tmpBackupInfo.get(0).numberFiles()). 
- isEqualTo(backupInfos.get(0).numberFiles()); - - // delete record after backup - bdb.remove("abc".getBytes()); - byte[] value = bdb.get("abc".getBytes()); - assertThat(value).isNull(); + verifyNumberOfValidBackups(bdb, 2); + bdb.put("key1".getBytes(), "valueV3".getBytes()); + bdb.put("key2".getBytes(), "valueV3".getBytes()); + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); bdb.close(); - // restore from backup - ropt = new RestoreOptions(false); + // init RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); - - // getting backup infos from restorable db should - // lead to the same infos as from backupable db - restoreInfos = rdb.getBackupInfos(); - assertThat(restoreInfos.size()). - isEqualTo(backupInfos.size()); - assertThat(restoreInfos.get(0).backupId()). - isEqualTo(backupInfos.get(0).backupId()); - assertThat(restoreInfos.get(0).timestamp()). - isEqualTo(backupInfos.get(0).timestamp()); - assertThat(restoreInfos.get(0).size()). - isEqualTo(backupInfos.get(0).size()); - assertThat(restoreInfos.get(0).numberFiles()). - isEqualTo(backupInfos.get(0).numberFiles()); - - rdb.restoreDBFromLatestBackup( + verifyNumberOfValidBackups(rdb, 2); + // restore db from latest backup + rdb.restoreDBFromLatestBackup(dbFolder.getRoot().getAbsolutePath(), dbFolder.getRoot().getAbsolutePath(), - dbFolder.getRoot().getAbsolutePath(), - ropt); - // do nothing because there is only one backup - rdb.purgeOldBackups(1); - rdb.garbageCollect(); - assertThat(rdb.getCorruptedBackups().size()). - isEqualTo(0); - restoreInfos = rdb.getBackupInfos(); - assertThat(restoreInfos.size()). - isEqualTo(1); - rdb.dispose(); - ropt.dispose(); - - // verify that backed up data contains deleted record + new RestoreOptions(false)); + // Open database again. bdb = BackupableDB.open(opt, bopt, dbFolder.getRoot().getAbsolutePath()); - value = bdb.get("abc".getBytes()); - assertThat(new String(value)). - isEqualTo("def"); - - bdb.createNewBackup(false); - // after new backup there must be two backup infos - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(2); - // deleting the backup must be possible using the - // id provided by backup infos - bdb.deleteBackup(backupInfos.get(1).backupId()); - // after deletion there should only be one info - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(1); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - bdb.createNewBackup(false); - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(4); - // purge everything and keep two - bdb.purgeOldBackups(2); - // backup infos need to be two - backupInfos = bdb.getBackupInfos(); - assertThat(backupInfos.size()). - isEqualTo(2); - assertThat(backupInfos.get(0).backupId()). - isEqualTo(4); - assertThat(backupInfos.get(1).backupId()). - isEqualTo(5); + // Values must have suffix V2 because of restoring latest backup. 
+ assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V2"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V2"); } finally { - if (opt != null) { - opt.dispose(); + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); } if (bopt != null) { bopt.dispose(); } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void restoreFromBackup() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 1); + bdb.put("key1".getBytes(), "valueV2".getBytes()); + bdb.put("key2".getBytes(), "valueV2".getBytes()); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 2); + bdb.put("key1".getBytes(), "valueV3".getBytes()); + bdb.put("key2".getBytes(), "valueV3".getBytes()); + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); + bdb.close(); + + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + List backupInfo = verifyNumberOfValidBackups(rdb, 2); + // restore db from first backup + rdb.restoreDBFromBackup(backupInfo.get(0).backupId(), + dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), + new RestoreOptions(false)); + // Open database again. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Values must have suffix V2 because of restoring latest backup. + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V1"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V1"); + } finally { if (bdb != null) { bdb.close(); } - if (ropt != null) { - ropt.dispose(); - } if (rdb != null) { rdb.dispose(); } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } } } + + /** + * Verify backups. + * + * @param bdb {@link BackupableDB} instance. + * @param expectedNumberOfBackups numerical value + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private List verifyNumberOfValidBackups(BackupableDB bdb, + int expectedNumberOfBackups) throws RocksDBException { + // Verify that backups exist + assertThat(bdb.getCorruptedBackups().size()). + isEqualTo(0); + bdb.garbageCollect(); + List backupInfo = bdb.getBackupInfos(); + assertThat(backupInfo.size()). + isEqualTo(expectedNumberOfBackups); + return backupInfo; + } + + /** + * Verify backups. + * + * @param rdb {@link RestoreBackupableDB} instance. + * @param expectedNumberOfBackups numerical value + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private List verifyNumberOfValidBackups( + RestoreBackupableDB rdb, int expectedNumberOfBackups) + throws RocksDBException { + // Verify that backups exist + assertThat(rdb.getCorruptedBackups().size()). + isEqualTo(0); + rdb.garbageCollect(); + List backupInfo = rdb.getBackupInfos(); + assertThat(backupInfo.size()). 
+ isEqualTo(expectedNumberOfBackups); + return backupInfo; + } + + /** + * Fill database with some test values. + * + * @param db {@link RocksDB} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private void prepareDatabase(RocksDB db) + throws RocksDBException { + db.put("key1".getBytes(), "valueV1".getBytes()); + db.put("key2".getBytes(), "valueV1".getBytes()); + } } From d7529b2de9eb20236dc20ec600c7160e6423635c Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 16 Nov 2014 14:36:22 +0100 Subject: [PATCH 534/829] [RocksJava] Cleanup Backupable implementations - Correct usage of isInitialized() - Adjusted JavaDoc --- java/org/rocksdb/BackupableDB.java | 43 ++++++++++++------- java/org/rocksdb/RestoreBackupableDB.java | 52 ++++++++++++++--------- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index e73df52e0..5c5de5fd3 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -8,21 +8,23 @@ package org.rocksdb; import java.util.List; /** - * A subclass of RocksDB which supports backup-related operations. + *

            A subclass of RocksDB which supports + * backup-related operations.

            * * @see org.rocksdb.BackupableDBOptions */ public class BackupableDB extends RocksDB { /** - * Open a {@code BackupableDB} under the specified path. + *

            Open a {@code BackupableDB} under the specified path. * Note that the backup path should be set properly in the - * input BackupableDBOptions. + * input BackupableDBOptions.

            * * @param opt {@link org.rocksdb.Options} to set for the database. * @param bopt {@link org.rocksdb.BackupableDBOptions} to use. * @param db_path Path to store data to. The path for storing the backup should be * specified in the {@link org.rocksdb.BackupableDBOptions}. - * @return BackupableDB reference to the opened database. + * + * @return {@link BackupableDB} reference to the opened database. * * @throws RocksDBException thrown if error happens in underlying * native library. @@ -43,8 +45,8 @@ public class BackupableDB extends RocksDB { } /** - * Captures the state of the database in the latest backup. - * Note that this function is not thread-safe. + *

            Captures the state of the database in the latest backup. + * Note that this function is not thread-safe.

            * * @param flushBeforeBackup if true, then all data will be flushed * before creating backup. @@ -54,11 +56,12 @@ public class BackupableDB extends RocksDB { */ public void createNewBackup(boolean flushBeforeBackup) throws RocksDBException { + assert(isInitialized()); createNewBackup(nativeHandle_, flushBeforeBackup); } /** - * Deletes old backups, keeping latest numBackupsToKeep alive. + *

            Deletes old backups, keeping latest numBackupsToKeep alive.

            * * @param numBackupsToKeep Number of latest backups to keep. * @@ -67,11 +70,12 @@ public class BackupableDB extends RocksDB { */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { + assert(isInitialized()); purgeOldBackups(nativeHandle_, numBackupsToKeep); } /** - * Deletes a specific backup. + *

            Deletes a specific backup.

            * * @param backupId of backup to delete. * @@ -79,16 +83,18 @@ public class BackupableDB extends RocksDB { * native library. */ public void deleteBackup(int backupId) throws RocksDBException { + assert(isInitialized()); deleteBackup0(nativeHandle_, backupId); } /** - * Returns a list of {@link BackupInfo} instances, which describe - * already made backups. + *

            Returns a list of {@link BackupInfo} instances, which describe + * already made backups.

            * * @return List of {@link BackupInfo} instances. */ public List getBackupInfos() { + assert(isInitialized()); return getBackupInfo(nativeHandle_); } @@ -100,6 +106,7 @@ public class BackupableDB extends RocksDB { * @return list of backup ids as Integer. */ public List getCorruptedBackups() { + assert(isInitialized()); return getCorruptedBackups(nativeHandle_); } @@ -112,15 +119,18 @@ public class BackupableDB extends RocksDB { * native library. */ public void garbageCollect() throws RocksDBException { + assert(isInitialized()); garbageCollect(nativeHandle_); } /** - * Close the BackupableDB instance and release resource. + *

Close the BackupableDB instance and release resources.

            * - * Internally, BackupableDB owns the {@code rocksdb::DB} pointer to its associated - * {@link org.rocksdb.RocksDB}. The release of that RocksDB pointer is handled in the destructor - * of the c++ {@code rocksdb::BackupableDB} and should be transparent to Java developers. + *

            Internally, {@link BackupableDB} owns the {@code rocksdb::DB} + * pointer to its associated {@link org.rocksdb.RocksDB}. + * The release of that RocksDB pointer is handled in the destructor + * of the c++ {@code rocksdb::BackupableDB} and should be transparent + * to Java developers.

            */ @Override public synchronized void close() { if (isInitialized()) { @@ -129,8 +139,9 @@ public class BackupableDB extends RocksDB { } /** - * A protected construction that will be used in the static factory - * method {@link #open(Options, BackupableDBOptions, String)}. + *

A protected constructor that will be used in the static + * factory method {@link #open(Options, BackupableDBOptions, String)}. + *

            */ protected BackupableDB() { super(); diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index 9c41f4345..e7890c278 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -8,15 +8,17 @@ package org.rocksdb; import java.util.List; /** - * This class is used to access information about backups and restore from them. + *

            This class is used to access information about backups and + * restore from them.

            * - * Note that dispose() must be called before this instance become out-of-scope - * to release the allocated memory in c++. + *

Note: {@code dispose()} must be called before this instance + * becomes out-of-scope, to release the allocated + * memory in c++.

            * */ public class RestoreBackupableDB extends RocksObject { /** - * Constructor + *

Construct a new RestoreBackupableDB instance.

            * * @param options {@link org.rocksdb.BackupableDBOptions} instance */ @@ -26,16 +28,18 @@ public class RestoreBackupableDB extends RocksObject { } /** - * Restore from backup with backup_id - * IMPORTANT -- if options_.share_table_files == true and you restore DB - * from some backup that is not the latest, and you start creating new - * backups from the new DB, they will probably fail. + *

            Restore from backup with backup_id.

            * - * Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. - * If you add new data to the DB and try creating a new backup now, the - * database will diverge from backups 4 and 5 and the new backup will fail. - * If you want to create new backup, you will first have to delete backups 4 - * and 5. + *

Important: If options_.share_table_files == true + * and you restore the DB from some backup that is not the latest, and you + * start creating new backups from the new DB, they will probably + * fail.

            + * + *

Example: Let's say you have backups 1, 2, 3, 4, 5 + * and you restore 3. If you add new data to the DB and try creating a new + * backup now, the database will diverge from backups 4 and 5 and the new + * backup will fail. If you want to create a new backup, you will first have + * to delete backups 4 and 5.

            * * @param backupId id pointing to backup * @param dbDir database directory to restore to @@ -47,12 +51,13 @@ public class RestoreBackupableDB extends RocksObject { */ public void restoreDBFromBackup(long backupId, String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { + assert(isInitialized()); restoreDBFromBackup0(nativeHandle_, backupId, dbDir, walDir, restoreOptions.nativeHandle_); } /** - * Restore from the latest backup. + *

            Restore from the latest backup.

            * * @param dbDir database directory to restore to * @param walDir directory where wal files are located @@ -63,12 +68,13 @@ public class RestoreBackupableDB extends RocksObject { */ public void restoreDBFromLatestBackup(String dbDir, String walDir, RestoreOptions restoreOptions) throws RocksDBException { + assert(isInitialized()); restoreDBFromLatestBackup0(nativeHandle_, dbDir, walDir, restoreOptions.nativeHandle_); } /** - * Deletes old backups, keeping latest numBackupsToKeep alive. + *

            Deletes old backups, keeping latest numBackupsToKeep alive.

            * * @param numBackupsToKeep of latest backups to keep * @@ -76,11 +82,12 @@ public class RestoreBackupableDB extends RocksObject { * native library. */ public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { + assert(isInitialized()); purgeOldBackups0(nativeHandle_, numBackupsToKeep); } /** - * Deletes a specific backup. + *

            Deletes a specific backup.

            * * @param backupId of backup to delete. * @@ -88,16 +95,18 @@ public class RestoreBackupableDB extends RocksObject { * native library. */ public void deleteBackup(int backupId) throws RocksDBException { + assert(isInitialized()); deleteBackup0(nativeHandle_, backupId); } /** - * Returns a list of {@link BackupInfo} instances, which describe - * already made backups. + *

            Returns a list of {@link BackupInfo} instances, which describe + * already made backups.

            * * @return List of {@link BackupInfo} instances. */ public List getBackupInfos() { + assert(isInitialized()); return getBackupInfo(nativeHandle_); } @@ -109,6 +118,7 @@ public class RestoreBackupableDB extends RocksObject { * @return list of backup ids as Integer. */ public List getCorruptedBackups() { + assert(isInitialized()); return getCorruptedBackups(nativeHandle_); } @@ -121,15 +131,15 @@ public class RestoreBackupableDB extends RocksObject { * native library. */ public void garbageCollect() throws RocksDBException { + assert(isInitialized()); garbageCollect(nativeHandle_); } /** - * Release the memory allocated for the current instance - * in the c++ side. + *

Release the memory allocated for the current instance + * on the c++ side.

            */ @Override public synchronized void disposeInternal() { - assert(isInitialized()); dispose(nativeHandle_); } From 3d78c7a8cfd2553a7f24133b185ccdc5cb5b99ad Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 16 Nov 2014 14:52:53 +0100 Subject: [PATCH 535/829] [RocksJava] Lint adjustments --- java/rocksjni/backupablejni.cc | 198 ++++++++++++++++----------------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 639c73eba..23de95407 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -210,114 +210,114 @@ jboolean Java_org_rocksdb_BackupableDBOptions_sync( } /* - * Class: org_rocksdb_BackupableDBOptions - * Method: setDestroyOldData - * Signature: (JZ)V - */ - void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { - auto bopt = reinterpret_cast(jhandle); - bopt->destroy_old_data = flag; - } + * Class: org_rocksdb_BackupableDBOptions + * Method: setDestroyOldData + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->destroy_old_data = flag; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: destroyOldData - * Signature: (J)Z - */ - jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto bopt = reinterpret_cast(jhandle); - return bopt->destroy_old_data; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: destroyOldData + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->destroy_old_data; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: setBackupLogFiles - * Signature: (JZ)V - */ - void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { - auto bopt = reinterpret_cast(jhandle); - bopt->backup_log_files = flag; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupLogFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_log_files = flag; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: backupLogFiles - * Signature: (J)Z - */ - jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto bopt = reinterpret_cast(jhandle); - return bopt->backup_log_files; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: backupLogFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_log_files; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: setBackupRateLimit - * Signature: (JJ)V - */ - void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) { - auto bopt = reinterpret_cast(jhandle); - bopt->backup_rate_limit = jbackup_rate_limit; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupRateLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( + JNIEnv* 
env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_rate_limit = jbackup_rate_limit; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: backupRateLimit - * Signature: (J)J - */ - jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto bopt = reinterpret_cast(jhandle); - return bopt->backup_rate_limit; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: backupRateLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_rate_limit; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: setRestoreRateLimit - * Signature: (JJ)V - */ - void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrestore_rate_limit) { - auto bopt = reinterpret_cast(jhandle); - bopt->restore_rate_limit = jrestore_rate_limit; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setRestoreRateLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrestore_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->restore_rate_limit = jrestore_rate_limit; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: restoreRateLimit - * Signature: (J)J - */ - jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto bopt = reinterpret_cast(jhandle); - return bopt->restore_rate_limit; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: restoreRateLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->restore_rate_limit; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: setShareFilesWithChecksum - * Signature: (JZ)V - */ - void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { - auto bopt = reinterpret_cast(jhandle); - bopt->share_files_with_checksum = flag; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setShareFilesWithChecksum + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->share_files_with_checksum = flag; +} - /* - * Class: org_rocksdb_BackupableDBOptions - * Method: shareFilesWithChecksum - * Signature: (J)Z - */ - jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto bopt = reinterpret_cast(jhandle); - return bopt->share_files_with_checksum; - } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: shareFilesWithChecksum + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->share_files_with_checksum; +} /* * Class: org_rocksdb_BackupableDBOptions From faa8d32be0d73be2a6127539062728b6aa0347fc Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 20 Nov 2014 22:47:48 +0100 Subject: [PATCH 536/829] [RocksJava] Integrated changes from D29019. 
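For illustration only, a minimal sketch of how the API looks after this change. The class name and paths below are placeholders, and the backup directory is assumed to already exist and be writable (otherwise the new constructor check throws IllegalArgumentException):

import org.rocksdb.*;

public class CorruptedBackupsExample {
  public static void main(String[] args) throws RocksDBException {
    // Placeholder paths; the backup directory must exist and be writable.
    Options opt = new Options().setCreateIfMissing(true);
    BackupableDBOptions bopt = new BackupableDBOptions("/tmp/rocksdb_backups");
    BackupableDB bdb = BackupableDB.open(opt, bopt, "/tmp/rocksdb_data");
    bdb.createNewBackup(true);
    // Corrupted backups are now reported as a primitive int array.
    for (int corruptedId : bdb.getCorruptedBackups()) {
      bdb.deleteBackup(corruptedId);
    }
    bdb.garbageCollect();
    bdb.close();
    bopt.dispose();
    opt.dispose();
  }
}
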
--- java/org/rocksdb/BackupableDB.java | 6 ++-- java/org/rocksdb/BackupableDBOptions.java | 9 +++++- java/org/rocksdb/RestoreBackupableDB.java | 6 ++-- .../rocksdb/test/BackupableDBOptionsTest.java | 4 +-- java/org/rocksdb/test/BackupableDBTest.java | 4 +-- java/rocksjni/backupablejni.cc | 29 +++++++------------ java/rocksjni/restorejni.cc | 29 +++++++------------ 7 files changed, 40 insertions(+), 47 deletions(-) diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java index 5c5de5fd3..a743d861e 100644 --- a/java/org/rocksdb/BackupableDB.java +++ b/java/org/rocksdb/BackupableDB.java @@ -103,9 +103,9 @@ public class BackupableDB extends RocksDB { * is no corrupted backup the method will return an * empty list.

            * - * @return list of backup ids as Integer. + * @return array of backup ids as int ids. */ - public List getCorruptedBackups() { + public int[] getCorruptedBackups() { assert(isInitialized()); return getCorruptedBackups(nativeHandle_); } @@ -160,7 +160,7 @@ public class BackupableDB extends RocksDB { private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; protected native List getBackupInfo(long handle); - private native List getCorruptedBackups(long handle); + private native int[] getCorruptedBackups(long handle); private native void garbageCollect(long handle) throws RocksDBException; } diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java index 18e2dfa11..ab532f282 100644 --- a/java/org/rocksdb/BackupableDBOptions.java +++ b/java/org/rocksdb/BackupableDBOptions.java @@ -5,6 +5,9 @@ package org.rocksdb; +import java.io.File; +import java.nio.file.Path; + /** *

            BackupableDBOptions to control the behavior of a backupable database. * It will be used during the creation of a {@link org.rocksdb.BackupableDB}. @@ -21,10 +24,14 @@ public class BackupableDBOptions extends RocksObject { * * @param path Where to keep the backup files. Has to be different than db name. * Best to set this to {@code db name_ + "/backups"} + * @throws java.lang.IllegalArgumentException if illegal path is used. */ public BackupableDBOptions(String path) { super(); - assert(path != null); + File backupPath = path == null ? null : new File(path); + if (backupPath == null || !backupPath.isDirectory() || !backupPath.canWrite()) { + throw new IllegalArgumentException("Illegal path provided."); + } newBackupableDBOptions(path); } diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java index e7890c278..e29628815 100644 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/org/rocksdb/RestoreBackupableDB.java @@ -115,9 +115,9 @@ public class RestoreBackupableDB extends RocksObject { * is no corrupted backup the method will return an * empty list.

            * - * @return list of backup ids as Integer. + * @return array of backup ids as int ids. */ - public List getCorruptedBackups() { + public int[] getCorruptedBackups() { assert(isInitialized()); return getCorruptedBackups(nativeHandle_); } @@ -155,7 +155,7 @@ public class RestoreBackupableDB extends RocksObject { private native void deleteBackup0(long nativeHandle, int backupId) throws RocksDBException; private native List getBackupInfo(long handle); - private native List getCorruptedBackups(long handle); + private native int[] getCorruptedBackups(long handle); private native void garbageCollect(long handle) throws RocksDBException; private native void dispose(long nativeHandle); diff --git a/java/org/rocksdb/test/BackupableDBOptionsTest.java b/java/org/rocksdb/test/BackupableDBOptionsTest.java index 6be056986..b7bdc0011 100644 --- a/java/org/rocksdb/test/BackupableDBOptionsTest.java +++ b/java/org/rocksdb/test/BackupableDBOptionsTest.java @@ -17,7 +17,7 @@ import static org.assertj.core.api.Assertions.assertThat; public class BackupableDBOptionsTest { - private final static String ARBITRARY_PATH = "/path"; + private final static String ARBITRARY_PATH = "/tmp"; @ClassRule public static final RocksMemoryResource rocksMemoryResource = @@ -164,7 +164,7 @@ public class BackupableDBOptionsTest { @Test public void failBackupDirIsNull() { - exception.expect(AssertionError.class); + exception.expect(IllegalArgumentException.class); new BackupableDBOptions(null); } diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 0b5334607..3da519418 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -369,7 +369,7 @@ public class BackupableDBTest { private List verifyNumberOfValidBackups(BackupableDB bdb, int expectedNumberOfBackups) throws RocksDBException { // Verify that backups exist - assertThat(bdb.getCorruptedBackups().size()). + assertThat(bdb.getCorruptedBackups().length). isEqualTo(0); bdb.garbageCollect(); List backupInfo = bdb.getBackupInfos(); @@ -390,7 +390,7 @@ public class BackupableDBTest { RestoreBackupableDB rdb, int expectedNumberOfBackups) throws RocksDBException { // Verify that backups exist - assertThat(rdb.getCorruptedBackups().size()). + assertThat(rdb.getCorruptedBackups().length). 
isEqualTo(0); rdb.garbageCollect(); List backupInfo = rdb.getBackupInfos(); diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 23de95407..83c641370 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -95,32 +95,25 @@ jobject Java_org_rocksdb_BackupableDB_getBackupInfo( /* * Class: org_rocksdb_BackupableDB * Method: getCorruptedBackups - * Signature: (J)Ljava/util/List; + * Signature: (J)[I; */ -jobject Java_org_rocksdb_BackupableDB_getCorruptedBackups( +jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( JNIEnv* env, jobject jbdb, jlong jhandle) { std::vector backup_ids; reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); - - jclass jclazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jclazz); - jobject jbackup_id_handle_list = env->NewObject(jclazz, mid, - backup_ids.size()); - // insert in java list + // store backupids in int array + const int kIdSize = backup_ids.size(); + int int_backup_ids[kIdSize]; for (std::vector::size_type i = 0; i != backup_ids.size(); i++) { - // convert BackupID to Integer - jclass jIntClazz = env->FindClass("java/lang/Integer"); - jmethodID midLong = env->GetMethodID(jIntClazz, "", "(I)V"); - jobject obj = env->NewObject(jIntClazz, midLong, - (backup_ids[i])); - // add Integer to List - env->CallBooleanMethod(jbackup_id_handle_list, - rocksdb::ListJni::getListAddMethodId(env), obj); + int_backup_ids[i] = backup_ids[i]; } - return jbackup_id_handle_list; + // Store ints in java array + jintArray ret_backup_ids; + ret_backup_ids = env->NewIntArray(kIdSize); + env->SetIntArrayRegion(ret_backup_ids, 0, kIdSize, int_backup_ids); + return ret_backup_ids; } /* diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 99ffc4256..ad8749758 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -148,32 +148,25 @@ jobject Java_org_rocksdb_RestoreBackupableDB_getBackupInfo( /* * Class: org_rocksdb_RestoreBackupableDB * Method: getCorruptedBackups - * Signature: (J)Ljava/util/List; + * Signature: (J)[I; */ -jobject Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( +jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( JNIEnv* env, jobject jbdb, jlong jhandle) { std::vector backup_ids; reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); - - jclass jclazz = env->FindClass("java/util/ArrayList"); - jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( - env, jclazz); - jobject jbackup_id_handle_list = env->NewObject(jclazz, mid, - backup_ids.size()); - // insert in java list + // store backupids in int array + const int kIdSize = backup_ids.size(); + int int_backup_ids[kIdSize]; for (std::vector::size_type i = 0; i != backup_ids.size(); i++) { - // convert BackupID to Integer - jclass jIntClazz = env->FindClass("java/lang/Integer"); - jmethodID midLong = env->GetMethodID(jIntClazz, "", "(I)V"); - jobject obj = env->NewObject(jIntClazz, midLong, - (backup_ids[i])); - // add Integer to List - env->CallBooleanMethod(jbackup_id_handle_list, - rocksdb::ListJni::getListAddMethodId(env), obj); + int_backup_ids[i] = backup_ids[i]; } - return jbackup_id_handle_list; + // Store ints in java array + jintArray ret_backup_ids; + ret_backup_ids = env->NewIntArray(kIdSize); + env->SetIntArrayRegion(ret_backup_ids, 0, kIdSize, int_backup_ids); + return ret_backup_ids; } /* From beabc6879c9a2b7153a4f21262e7d62160c8b23b Mon Sep 17 00:00:00 2001 From: 
Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 15:45:56 -0800 Subject: [PATCH 537/829] Fixed ~ThreadStatusImpl(). --- util/thread_status_impl.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index d07a60463..5775aacd7 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -21,13 +21,7 @@ std::unordered_map> ThreadStatusImpl::db_key_map_; ThreadStatusImpl::~ThreadStatusImpl() { - std::lock_guard lck(thread_list_mutex_); - for (auto* thread_data : thread_data_set_) { - assert(thread_data->thread_type == ThreadStatus::ThreadType::USER_THREAD); - delete thread_data; - } assert(thread_data_set_.size() == 0); - thread_data_set_.clear(); } void ThreadStatusImpl::UnregisterThread() { @@ -35,6 +29,7 @@ void ThreadStatusImpl::UnregisterThread() { std::lock_guard lck(thread_list_mutex_); thread_data_set_.erase(thread_status_data_); delete thread_status_data_; + thread_status_data_ = nullptr; } } From 004f416b7750d4b044f3d8f7cfcd360ec24e9387 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Thu, 20 Nov 2014 15:54:47 -0800 Subject: [PATCH 538/829] Moved checkpoint to utilities Summary: Moved checkpoint to utilities. Addressed comments by Igor, Siying, Dhruba Test Plan: db_test/SnapshotLink Reviewers: dhruba, igor, sdong Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29079 --- HISTORY.md | 6 + db/db_filesnapshot.cc | 107 --------------- db/db_impl.h | 5 - db/db_test.cc | 9 +- include/rocksdb/db.h | 6 - include/rocksdb/utilities/checkpoint.h | 34 +++++ include/rocksdb/utilities/stackable_db.h | 4 - utilities/checkpoint/checkpoint.cc | 168 +++++++++++++++++++++++ 8 files changed, 212 insertions(+), 127 deletions(-) create mode 100644 include/rocksdb/utilities/checkpoint.h create mode 100644 utilities/checkpoint/checkpoint.cc diff --git a/HISTORY.md b/HISTORY.md index 78973adec..93170fa6f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,12 @@ ### Unreleased Features * Add rocksdb::GetThreadList(), which returns the current status of all rocksdb-related threads. +### Public API changes +* New API to create a checkpoint added. Given a directory name, creates a new + database which is an image of the existing database. +*New API LinkFile added to Env. If you implement your own Env class, an + implementation of the API LinkFile will have to be provided. 
+ ## 3.8.0 (11/14/2014) ### Public API changes diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a442c68b2..ce009a976 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -138,113 +138,6 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { return wal_manager_.GetSortedWalFiles(files); } -// Builds an openable snapshot of RocksDB -Status DBImpl::CreateCheckpoint(const std::string& snapshot_dir) { - Status s; - std::vector live_files; - uint64_t manifest_file_size = 0; - uint64_t sequence_number = GetLatestSequenceNumber(); - bool same_fs = true; - - if (env_->FileExists(snapshot_dir)) { - return Status::InvalidArgument("Directory exists"); - } - - s = DisableFileDeletions(); - if (s.ok()) { - // this will return live_files prefixed with "/" - s = GetLiveFiles(live_files, &manifest_file_size, true); - } - if (!s.ok()) { - EnableFileDeletions(false); - return s; - } - - Log(db_options_.info_log, - "Started the snapshot process -- creating snapshot in directory %s", - snapshot_dir.c_str()); - - std::string full_private_path = snapshot_dir + ".tmp"; - - // create snapshot directory - s = env_->CreateDir(full_private_path); - - // copy/hard link live_files - for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { - uint64_t number; - FileType type; - bool ok = ParseFileName(live_files[i], &number, &type); - if (!ok) { - s = Status::Corruption("Can't parse file name. This is very bad"); - break; - } - // we should only get sst, manifest and current files here - assert(type == kTableFile || type == kDescriptorFile || - type == kCurrentFile); - assert(live_files[i].size() > 0 && live_files[i][0] == '/'); - std::string src_fname = live_files[i]; - - // rules: - // * if it's kTableFile, then it's shared - // * if it's kDescriptorFile, limit the size to manifest_file_size - // * always copy if cross-device link - if ((type == kTableFile) && same_fs) { - Log(db_options_.info_log, "Hard Linking %s", src_fname.c_str()); - s = env_->LinkFile(GetName() + src_fname, full_private_path + src_fname); - if (s.IsNotSupported()) { - same_fs = false; - s = Status::OK(); - } - } - if ((type != kTableFile) || (!same_fs)) { - Log(db_options_.info_log, "Copying %s", src_fname.c_str()); - s = CopyFile(env_, GetName() + src_fname, full_private_path + src_fname, - (type == kDescriptorFile) ? 
manifest_file_size : 0); - } - } - - // we copied all the files, enable file deletions - EnableFileDeletions(false); - - if (s.ok()) { - // move tmp private backup to real snapshot directory - s = env_->RenameFile(full_private_path, snapshot_dir); - } - if (s.ok()) { - unique_ptr snapshot_directory; - env_->NewDirectory(snapshot_dir, &snapshot_directory); - if (snapshot_directory != nullptr) { - s = snapshot_directory->Fsync(); - } - } - - if (!s.ok()) { - // clean all the files we might have created - Log(db_options_.info_log, "Snapshot failed -- %s", s.ToString().c_str()); - // we have to delete the dir and all its children - std::vector subchildren; - env_->GetChildren(full_private_path, &subchildren); - for (auto& subchild : subchildren) { - Status s1 = env_->DeleteFile(full_private_path + subchild); - if (s1.ok()) { - Log(db_options_.info_log, "Deleted %s", - (full_private_path + subchild).c_str()); - } - } - // finally delete the private dir - Status s1 = env_->DeleteDir(full_private_path); - Log(db_options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), - s1.ToString().c_str()); - return s; - } - - // here we know that we succeeded and installed the new snapshot - Log(db_options_.info_log, "Snapshot DONE. All is good"); - Log(db_options_.info_log, "Snapshot sequence number: %" PRIu64, - sequence_number); - - return s; -} } #endif // ROCKSDB_LITE diff --git a/db/db_impl.h b/db/db_impl.h index 283796120..1217610b5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -170,11 +170,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // (2) a copied manifest files and other files - virtual Status CreateCheckpoint(const std::string& snapshot_dir); #endif // ROCKSDB_LITE // checks if all live files exist on file system and that their file sizes diff --git a/db/db_test.cc b/db/db_test.cc index 870fc7268..b8c04495b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -35,6 +35,7 @@ #include "rocksdb/table_properties.h" #include "rocksdb/thread_status.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/utilities/checkpoint.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "util/hash.h" @@ -1616,6 +1617,7 @@ TEST(DBTest, GetSnapshotLink) { DB* snapshotDB; ReadOptions roptions; std::string result; + Checkpoint* checkpoint; options = CurrentOptions(options); delete db_; @@ -1631,7 +1633,8 @@ TEST(DBTest, GetSnapshotLink) { std::string key = std::string("foo"); ASSERT_OK(Put(key, "v1")); // Take a snapshot - ASSERT_OK(db_->CreateCheckpoint(snapshot_name)); + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name)); ASSERT_OK(Put(key, "v2")); ASSERT_EQ("v2", Get(key)); ASSERT_OK(Flush()); @@ -7525,10 +7528,6 @@ class ModelDB: public DB { ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) {} - virtual Status CreateCheckpoint(const std::string& snapshot_dir) { - return Status::NotSupported("Not supported in Model DB"); - } - private: class ModelIter: public Iterator { public: diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index ad3745c5e..326989418 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -522,12 +522,6 @@ class DB { virtual Status 
GetPropertiesOfAllTables(TablePropertiesCollection* props) { return GetPropertiesOfAllTables(DefaultColumnFamily(), props); } - - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // (2) a copied manifest files and other files - virtual Status CreateCheckpoint(const std::string& snapshot_dir) = 0; #endif // ROCKSDB_LITE private: diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h new file mode 100644 index 000000000..b60f4ebc6 --- /dev/null +++ b/include/rocksdb/utilities/checkpoint.h @@ -0,0 +1,34 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// A checkpoint is an openable snapshot of a database at a point in time. + +#pragma once + +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class DB; + +class Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable sbapshots + static Status Create(DB* db, Checkpoint** checkpoint_ptr); + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // SST files will be copied if output directory is on a different filesystem + // (2) a copied manifest files and other files + // The directory should not already exist and will be created by this API. + // The directory will be an absolute path + virtual Status CreateCheckpoint(const std::string& checkpoint_dir); + + virtual ~Checkpoint() {} +}; + +} // namespace rocksdb diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 9366bd84f..7bdf9928e 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -247,10 +247,6 @@ class StackableDB : public DB { return db_->DefaultColumnFamily(); } - virtual Status CreateCheckpoint(const std::string& snapshot_dir) override { - return db_->CreateCheckpoint(snapshot_dir); - } - protected: DB* db_; }; diff --git a/utilities/checkpoint/checkpoint.cc b/utilities/checkpoint/checkpoint.cc new file mode 100644 index 000000000..b180bbd38 --- /dev/null +++ b/utilities/checkpoint/checkpoint.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/checkpoint.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include "db/filename.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/file_util.h" + +namespace rocksdb { + +class CheckpointImpl : public Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable sbapshots + explicit CheckpointImpl(DB* db) : db_(db) {} + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // SST files will be copied if output directory is on a different filesystem + // (2) a copied manifest files and other files + // The directory should not already exist and will be created by this API. + // The directory will be an absolute path + using Checkpoint::CreateCheckpoint; + virtual Status CreateCheckpoint(const std::string& checkpoint_dir); + + private: + DB* db_; +}; + +Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) { + *checkpoint_ptr = new CheckpointImpl(db); + return Status::OK(); +} + +Status Checkpoint::CreateCheckpoint(const std::string& checkpoint_dir) { + return Status::NotSupported(""); +} + +// Builds an openable snapshot of RocksDB +Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { + Status s; + std::vector live_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = db_->GetLatestSequenceNumber(); + bool same_fs = true; + + if (db_->GetEnv()->FileExists(checkpoint_dir)) { + return Status::InvalidArgument("Directory exists"); + } + + s = db_->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db_->GetLiveFiles(live_files, &manifest_file_size, true); + } + if (!s.ok()) { + db_->EnableFileDeletions(false); + return s; + } + + Log(db_->GetOptions().info_log, + "Started the snapshot process -- creating snapshot in directory %s", + checkpoint_dir.c_str()); + + std::string full_private_path = checkpoint_dir + ".tmp"; + + // create snapshot directory + s = db_->GetEnv()->CreateDir(full_private_path); + + // copy/hard link live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + s = Status::Corruption("Can't parse file name. This is very bad"); + break; + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || type == kDescriptorFile || + type == kCurrentFile); + assert(live_files[i].size() > 0 && live_files[i][0] == '/'); + std::string src_fname = live_files[i]; + + // rules: + // * if it's kTableFile, then it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + // * always copy if cross-device link + if ((type == kTableFile) && same_fs) { + Log(db_->GetOptions().info_log, "Hard Linking %s", src_fname.c_str()); + s = db_->GetEnv()->LinkFile(db_->GetName() + src_fname, + full_private_path + src_fname); + if (s.IsNotSupported()) { + same_fs = false; + s = Status::OK(); + } + } + if ((type != kTableFile) || (!same_fs)) { + Log(db_->GetOptions().info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(db_->GetEnv(), db_->GetName() + src_fname, + full_private_path + src_fname, + (type == kDescriptorFile) ? 
manifest_file_size : 0); + } + } + + // we copied all the files, enable file deletions + db_->EnableFileDeletions(false); + + if (s.ok()) { + // move tmp private backup to real snapshot directory + s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir); + } + if (s.ok()) { + unique_ptr checkpoint_directory; + db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory); + if (checkpoint_directory != nullptr) { + s = checkpoint_directory->Fsync(); + } + } + + if (!s.ok()) { + // clean all the files we might have created + Log(db_->GetOptions().info_log, "Snapshot failed -- %s", + s.ToString().c_str()); + // we have to delete the dir and all its children + std::vector subchildren; + db_->GetEnv()->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s1 = db_->GetEnv()->DeleteFile(full_private_path + subchild); + if (s1.ok()) { + Log(db_->GetOptions().info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s1 = db_->GetEnv()->DeleteDir(full_private_path); + Log(db_->GetOptions().info_log, "Deleted dir %s -- %s", + full_private_path.c_str(), s1.ToString().c_str()); + return s; + } + + // here we know that we succeeded and installed the new snapshot + Log(db_->GetOptions().info_log, "Snapshot DONE. All is good"); + Log(db_->GetOptions().info_log, "Snapshot sequence number: %" PRIu64, + sequence_number); + + return s; +} +} // namespace rocksdb + +#endif // ROCKSDB_LITE From eecdebe65b1f2b37f2da92b7ef58e824481bb107 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 16:02:03 -0800 Subject: [PATCH 539/829] Fixed the destruction order of static variables in ThreadStatusImpl. --- util/thread_status_impl.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index 5775aacd7..d1cd5ccdc 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -9,8 +9,6 @@ namespace rocksdb { -ThreadStatusImpl thread_local_status; - #if ROCKSDB_USING_THREAD_STATUS __thread ThreadStatusData* ThreadStatusImpl::thread_status_data_ = nullptr; std::mutex ThreadStatusImpl::thread_list_mutex_; @@ -20,6 +18,8 @@ std::unordered_map std::unordered_map> ThreadStatusImpl::db_key_map_; +ThreadStatusImpl thread_local_status; + ThreadStatusImpl::~ThreadStatusImpl() { assert(thread_data_set_.size() == 0); } @@ -188,5 +188,6 @@ void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { } +ThreadStatusImpl thread_local_status; #endif // ROCKSDB_USING_THREAD_STATUS } // namespace rocksdb From 353307758b056dd328b3e5a9da9d9022d1794f9a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 16:13:20 -0800 Subject: [PATCH 540/829] Add IOS_CROSS_COMPILE to macro guard for GetThreadList feature. 
--- include/rocksdb/thread_status.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index bfd4a79fc..f622aa405 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -12,7 +12,8 @@ #define ROCKSDB_USING_THREAD_STATUS \ !defined(ROCKSDB_LITE) && \ !defined(NROCKSDB_THREAD_STATUS) && \ - !defined(OS_MACOSX) + !defined(OS_MACOSX) && \ + !defined(IOS_CROSS_COMPILE) #endif namespace rocksdb { From 9e285d423858b50f51d584005bcb81d2724dcfa6 Mon Sep 17 00:00:00 2001 From: Bryan Rosario Date: Thu, 20 Nov 2014 19:24:39 -0800 Subject: [PATCH 541/829] Added CompatibleOptions for compatibility with LevelDB Options Summary: Created a CompatibleOptions object that can be used as a LevelDB Options object and then converted to a RocksDB Options object using the ConvertOptions() method. Test Plan: Unit test included in diff. Reviewers: ljin Reviewed By: ljin Subscribers: sdong, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28893 --- include/rocksdb/utilities/leveldb_options.h | 144 +++++++++++++++++++ table/block_based_table_factory.cc | 4 + table/block_based_table_factory.h | 2 + util/options_test.cc | 32 +++++ utilities/leveldb_options/leveldb_options.cc | 56 ++++++++ 5 files changed, 238 insertions(+) create mode 100644 include/rocksdb/utilities/leveldb_options.h create mode 100644 utilities/leveldb_options/leveldb_options.cc diff --git a/include/rocksdb/utilities/leveldb_options.h b/include/rocksdb/utilities/leveldb_options.h new file mode 100644 index 000000000..09033b7e7 --- /dev/null +++ b/include/rocksdb/utilities/leveldb_options.h @@ -0,0 +1,144 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +namespace rocksdb { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +class Options; +class Snapshot; + +enum CompressionType : char; + +// Options to control the behavior of a database (passed to +// DB::Open). A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. +struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. 
This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. + // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + + // Create a LevelDBOptions object with default values for all fields. + LevelDBOptions(); +}; + +// Converts a LevelDBOptions object into a RocksDB Options object. 
+Options ConvertOptions(const LevelDBOptions& leveldb_options); + +} // namespace rocksdb diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 3013ade2a..9708e1954 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -139,6 +139,10 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { return ret; } +const BlockBasedTableOptions& BlockBasedTableFactory::GetTableOptions() const { + return table_options_; +} + TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options) { return new BlockBasedTableFactory(table_options); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 247fcd691..674289779 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -51,6 +51,8 @@ class BlockBasedTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; + const BlockBasedTableOptions& GetTableOptions() const; + private: BlockBasedTableOptions table_options_; }; diff --git a/util/options_test.cc b/util/options_test.cc index cd456a0ae..1a6702143 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -16,7 +16,11 @@ #include #include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "table/block_based_table_factory.h" #include "util/testharness.h" +#include "rocksdb/cache.h" +#include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/convenience.h" using GFLAGS::ParseCommandLineFlags; @@ -322,6 +326,34 @@ TEST(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL); } +TEST(OptionsTest, ConvertOptionsTest) { + LevelDBOptions leveldb_opt; + Options converted_opt = ConvertOptions(leveldb_opt); + + ASSERT_EQ(converted_opt.create_if_missing, leveldb_opt.create_if_missing); + ASSERT_EQ(converted_opt.error_if_exists, leveldb_opt.error_if_exists); + ASSERT_EQ(converted_opt.paranoid_checks, leveldb_opt.paranoid_checks); + ASSERT_EQ(converted_opt.env, leveldb_opt.env); + ASSERT_EQ(converted_opt.info_log.get(), leveldb_opt.info_log); + ASSERT_EQ(converted_opt.write_buffer_size, leveldb_opt.write_buffer_size); + ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files); + ASSERT_EQ(converted_opt.compression, leveldb_opt.compression); + + std::shared_ptr table_factory = + std::dynamic_pointer_cast( + converted_opt.table_factory); + + ASSERT_TRUE(table_factory.get() != nullptr); + + const BlockBasedTableOptions table_opt = table_factory->GetTableOptions(); + + ASSERT_EQ(table_opt.block_cache->GetCapacity(), 8UL << 20); + ASSERT_EQ(table_opt.block_size, leveldb_opt.block_size); + ASSERT_EQ(table_opt.block_restart_interval, + leveldb_opt.block_restart_interval); + ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/leveldb_options/leveldb_options.cc b/utilities/leveldb_options/leveldb_options.cc new file mode 100644 index 000000000..cb7dfb8ea --- /dev/null +++ b/utilities/leveldb_options/leveldb_options.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
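// For reference, the conversion added in this patch is meant to be used
// roughly as follows (an illustrative sketch only; field values and the
// database path are arbitrary): populate a LevelDBOptions exactly as a
// LevelDB Options object would be populated, then convert it before opening
// the database.
//
//   rocksdb::LevelDBOptions leveldb_options;
//   leveldb_options.create_if_missing = true;
//   leveldb_options.write_buffer_size = 8 << 20;
//   rocksdb::Options options = rocksdb::ConvertOptions(leveldb_options);
//   rocksdb::DB* db = nullptr;
//   rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/converted_db", &db);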
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/utilities/leveldb_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +LevelDBOptions::LevelDBOptions() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(nullptr), + write_buffer_size(4 << 20), + max_open_files(1000), + block_cache(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr) {} + +Options ConvertOptions(const LevelDBOptions& leveldb_options) { + Options options = Options(); + options.create_if_missing = leveldb_options.create_if_missing; + options.error_if_exists = leveldb_options.error_if_exists; + options.paranoid_checks = leveldb_options.paranoid_checks; + options.env = leveldb_options.env; + options.info_log.reset(leveldb_options.info_log); + options.write_buffer_size = leveldb_options.write_buffer_size; + options.max_open_files = leveldb_options.max_open_files; + options.compression = leveldb_options.compression; + + BlockBasedTableOptions table_options; + table_options.block_cache.reset(leveldb_options.block_cache); + table_options.block_size = leveldb_options.block_size; + table_options.block_restart_interval = leveldb_options.block_restart_interval; + table_options.filter_policy.reset(leveldb_options.filter_policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + return options; +} + +} // namespace rocksdb From bafce61979de3e7ae4a5ada9f4e8a13a5b93b542 Mon Sep 17 00:00:00 2001 From: Saghm Rossi Date: Tue, 4 Nov 2014 22:05:17 -0500 Subject: [PATCH 542/829] first rdb commit Summary: First commit for rdb shell Test Plan: unit_test.js does simple assertions on most of the main functionality; will update with rest of tests Reviewers: igor, rven, lijn, yhciang, sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28749 --- tools/rdb/.gitignore | 1 + tools/rdb/API.md | 178 ++++++++++++++ tools/rdb/README.md | 40 +++ tools/rdb/binding.gyp | 25 ++ tools/rdb/db_wrapper.cc | 525 ++++++++++++++++++++++++++++++++++++++++ tools/rdb/db_wrapper.h | 58 +++++ tools/rdb/rdb | 3 + tools/rdb/rdb.cc | 15 ++ tools/rdb/unit_test.js | 124 ++++++++++ 9 files changed, 969 insertions(+) create mode 100644 tools/rdb/.gitignore create mode 100644 tools/rdb/API.md create mode 100644 tools/rdb/README.md create mode 100644 tools/rdb/binding.gyp create mode 100644 tools/rdb/db_wrapper.cc create mode 100644 tools/rdb/db_wrapper.h create mode 100755 tools/rdb/rdb create mode 100644 tools/rdb/rdb.cc create mode 100644 tools/rdb/unit_test.js diff --git a/tools/rdb/.gitignore b/tools/rdb/.gitignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/tools/rdb/.gitignore @@ -0,0 +1 @@ +build diff --git a/tools/rdb/API.md b/tools/rdb/API.md new file mode 100644 index 000000000..f25949706 --- /dev/null +++ b/tools/rdb/API.md @@ -0,0 +1,178 @@ +# JavaScript API + +## DBWrapper + +### Constructor + + # Creates a new database wrapper object + RDB() + +### Open + + # Open a new or existing RocksDB database. + # + # db_name (string) - Location of the database (inside the + # `/tmp` directory). 
+ # column_families (string[]) - Names of additional column families + # beyond the default. If there are no other + # column families, this argument can be + # left off. + # + # Returns true if the database was opened successfully, or false otherwise + db_obj.(db_name, column_families = []) + +### Get + + # Get the value of a given key. + # + # key (string) - Which key to get the value of. + # column_family (string) - Which column family to check for the key. + # This argument can be left off for the default + # column family + # + # Returns the value (string) that is associated with the given key if + # one exists, or null otherwise. + db_obj.get(key, column_family = { default }) + +### Put + + # Associate a value with a key. + # + # key (string) - Which key to associate the value with. + # value (string) - The value to associate with the key. + # column_family (string) - Which column family to put the key-value pair + # in. This argument can be left off for the + # default column family. + # + # Returns true if the key-value pair was successfully stored in the + # database, or false otherwise. + db_obj.put(key, value, column_family = { default }) + +### Delete + + # Delete a value associated with a given key. + # + # key (string) - Which key to delete the value of.. + # column_family (string) - Which column family to check for the key. + # This argument can be left off for the default + # column family + # + # Returns true if an error occured while trying to delete the key in + # the database, or false otherwise. Note that this is NOT the same as + # whether a value was deleted; in the case of a specified key not having + # a value, this will still return true. Use the `get` method prior to + # this method to check if a value existed before the call to `delete`. + db_obj.delete(key, column_family = { default }) + +### Dump + + # Print out all the key-value pairs in a given column family of the + # database. + # + # column_family (string) - Which column family to dump the pairs from. + # This argument can be left off for the default + # column family. + # + # Returns true if the keys were successfully read from the database, or + # false otherwise. + db_obj.dump(column_family = { default }) + +### WriteBatch + + # Execute an atomic batch of writes (i.e. puts and deletes) to the + # database. + # + # cf_batches (BatchObject[]; see below) - Put and Delete writes grouped + # by column family to execute + # atomically. + # + # Returns true if the argument array was well-formed and was + # successfully written to the database, or false otherwise. + db_obj.writeBatch(cf_batches) + +### CreateColumnFamily + + # Create a new column familiy for the database. + # + # column_family_name (string) - Name of the new column family. + # + # Returns true if the new column family was successfully created, or + # false otherwise. + db_obj.createColumnFamily(column_family_name) + +### CompactRange + + # Compact the underlying storage for a given range. + # + # In addition to the endpoints of the range, the method is overloaded to + # accept a non-default column family, a set of options, or both. + # + # begin (string) - First key in the range to compact. + # end (string) - Last key in the range to compact. + # options (object) - Contains a subset of the following key-value + # pairs: + # * 'target_level' => int + # * 'target_path_id' => int + # column_family (string) - Which column family to compact the range in. 
+ db_obj.compactRange(begin, end) + db_obj.compactRange(begin, end, options) + db_obj.compactRange(begin, end, column_family) + db_obj.compactRange(begin, end, options, column_family) + + + +### Close + + # Close an a database and free the memory associated with it. + # + # Return null. + # db_obj.close() + + +## BatchObject + +### Structure + +A BatchObject must have at least one of the following key-value pairs: + +* 'put' => Array of ['string1', 'string1'] pairs, each of which signifies that +the key 'string1' should be associated with the value 'string2' +* 'delete' => Array of strings, each of which is a key whose value should be +deleted. + +The following key-value pair is optional: + +* 'column_family' => The name (string) of the column family to apply the +changes to. + +### Examples + + # Writes the key-value pairs 'firstname' => 'Saghm' and + # 'lastname' => 'Rossi' atomically to the database. + db_obj.writeBatch([ + { + put: [ ['firstname', 'Saghm'], ['lastname', 'Rossi'] ] + } + ]); + + + # Deletes the values associated with 'firstname' and 'lastname' in + # the default column family and adds the key 'number_of_people' with + # with the value '2'. Additionally, adds the key-value pair + # 'name' => 'Saghm Rossi' to the column family 'user1' and the pair + # 'name' => 'Matt Blaze' to the column family 'user2'. All writes + # are done atomically. + db_obj.writeBatch([ + { + put: [ ['number_of_people', '2'] ], + delete: ['firstname', 'lastname'] + }, + { + put: [ ['name', 'Saghm Rossi'] ], + column_family: 'user1' + }, + { + put: [ ['name', Matt Blaze'] ], + column_family: 'user2' + } + ]); diff --git a/tools/rdb/README.md b/tools/rdb/README.md new file mode 100644 index 000000000..2cc9acad2 --- /dev/null +++ b/tools/rdb/README.md @@ -0,0 +1,40 @@ +# RDB - RocksDB Shell + +RDB is a NodeJS-based shell interface to RocksDB. It can also be used as a +JavaScript binding for RocksDB within a Node application. + +## Setup/Compilation + +### Requirements + +* static RocksDB library (i.e. librocksdb.a) +* libsnappy +* node (tested onv0.10.33, no guarantees on anything else!) +* node-gyp +* python2 (for node-gyp; tested with 2.7.8) + +### Installation + +NOTE: If your default `python` binary is not a version of python2, add +the arguments `--python /path/to/python2` to the the `node-gyp` commands. + +1. Make sure you have the static library (i.e. "librocksdb.a") in the root +directory of your rocksdb installation. If not, `cd` there and run +`make static_lib`. + +2. Run `node-gyp configure` to generate the build. + +3. Run `node-gyp build` to compile RDB. + +## Usage + +### Running the shell + +Assuming everything compiled correctly, you can run the `rdb` executable +located in the root of the `tools/rdb` directory to start the shell. The file is +just a shell script that runs the node shell and loads the constructor for the +RDB object into the top-level function `RDB`. + +### JavaScript API + +See `API.md` for how to use RocksDB from the shell. 
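For example, a first session might look roughly like the following (illustrative
only; the database path, keys, and values are arbitrary, and the expected return
values follow `API.md`):

    a = RDB()
    a.open('/tmp/rdb_example')        // true
    a.put('fruit', 'apple')           // true
    a.get('fruit')                    // 'apple'
    a.createColumnFamily('users')     // true
    a.put('name', 'Ada', 'users')     // true
    a.get('name', 'users')            // 'Ada'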
diff --git a/tools/rdb/binding.gyp b/tools/rdb/binding.gyp new file mode 100644 index 000000000..89145541c --- /dev/null +++ b/tools/rdb/binding.gyp @@ -0,0 +1,25 @@ +{ + "targets": [ + { + "target_name": "rdb", + "sources": [ + "rdb.cc", + "db_wrapper.cc", + "db_wrapper.h" + ], + "cflags_cc!": [ + "-fno-exceptions" + ], + "cflags_cc+": [ + "-std=c++11", + ], + "include_dirs+": [ + "../../include" + ], + "libraries": [ + "../../../librocksdb.a", + "-lsnappy" + ], + } + ] +} diff --git a/tools/rdb/db_wrapper.cc b/tools/rdb/db_wrapper.cc new file mode 100644 index 000000000..34725379d --- /dev/null +++ b/tools/rdb/db_wrapper.cc @@ -0,0 +1,525 @@ +#include +#include +#include +#include +#include + +#include "db_wrapper.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/options.h" + +namespace { + void printWithBackSlashes(std::string str) { + for (std::string::size_type i = 0; i < str.size(); i++) { + if (str[i] == '\\' || str[i] == '"') { + std::cout << "\\"; + } + + std::cout << str[i]; + } + } + + bool has_key_for_array(Local obj, std::string key) { + return obj->Has(String::NewSymbol(key.c_str())) && + obj->Get(String::NewSymbol(key.c_str()))->IsArray(); + } +} + +using namespace v8; + + +Persistent DBWrapper::constructor; + +DBWrapper::DBWrapper() { + options_.IncreaseParallelism(); + options_.OptimizeLevelStyleCompaction(); + options_.disable_auto_compactions = true; + options_.create_if_missing = true; +} + +DBWrapper::~DBWrapper() { + delete db_; +} + +bool DBWrapper::HasFamilyNamed(std::string& name, DBWrapper* db) { + return db->columnFamilies_.find(name) != db->columnFamilies_.end(); +} + + +void DBWrapper::Init(Handle exports) { + Local tpl = FunctionTemplate::New(New); + tpl->SetClassName(String::NewSymbol("DBWrapper")); + tpl->InstanceTemplate()->SetInternalFieldCount(8); + tpl->PrototypeTemplate()->Set(String::NewSymbol("open"), + FunctionTemplate::New(Open)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("get"), + FunctionTemplate::New(Get)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("put"), + FunctionTemplate::New(Put)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("delete"), + FunctionTemplate::New(Delete)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("dump"), + FunctionTemplate::New(Dump)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("createColumnFamily"), + FunctionTemplate::New(CreateColumnFamily)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("writeBatch"), + FunctionTemplate::New(WriteBatch)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("compactRange"), + FunctionTemplate::New(CompactRange)->GetFunction()); + + constructor = Persistent::New(tpl->GetFunction()); + exports->Set(String::NewSymbol("DBWrapper"), constructor); +} + +Handle DBWrapper::Open(const Arguments& args) { + HandleScope scope; + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + + if (!(args[0]->IsString() && + (args[1]->IsUndefined() || args[1]->IsArray()))) { + return scope.Close(Boolean::New(false)); + } + + std::string db_file = *v8::String::Utf8Value(args[0]->ToString()); + + std::vector cfs = { rocksdb::kDefaultColumnFamilyName }; + + if (!args[1]->IsUndefined()) { + Handle array = Handle::Cast(args[1]); + for (uint i = 0; i < array->Length(); i++) { + if (!array->Get(i)->IsString()) { + return scope.Close(Boolean::New(false)); + } + + cfs.push_back(*v8::String::Utf8Value(array->Get(i)->ToString())); + } + } + + if 
(cfs.size() == 1) { + db_wrapper->status_ = rocksdb::DB::Open( + db_wrapper->options_, db_file, &db_wrapper->db_); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); + } + + std::vector families; + + for (std::vector::size_type i = 0; i < cfs.size(); i++) { + families.push_back(rocksdb::ColumnFamilyDescriptor( + cfs[i], rocksdb::ColumnFamilyOptions())); + } + + std::vector handles; + db_wrapper->status_ = rocksdb::DB::Open( + db_wrapper->options_, db_file, families, &handles, &db_wrapper->db_); + + if (!db_wrapper->status_.ok()) { + return scope.Close(Boolean::New(db_wrapper->status_.ok())); + } + + for (std::vector::size_type i = 0; i < handles.size(); i++) { + db_wrapper->columnFamilies_[cfs[i]] = handles[i]; + } + + return scope.Close(Boolean::New(true)); +} + + +Handle DBWrapper::New(const Arguments& args) { + HandleScope scope; + Handle to_return; + + if (args.IsConstructCall()) { + DBWrapper* db_wrapper = new DBWrapper(); + db_wrapper->Wrap(args.This()); + + return args.This(); + } + + const int argc = 0; + Local argv[0] = {}; + + return scope.Close(constructor->NewInstance(argc, argv)); +} + +Handle DBWrapper::Get(const Arguments& args) { + HandleScope scope; + + if (!(args[0]->IsString() && + (args[1]->IsUndefined() || args[1]->IsString()))) { + return scope.Close(Null()); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string key = *v8::String::Utf8Value(args[0]->ToString()); + std::string cf = *v8::String::Utf8Value(args[1]->ToString()); + std::string value; + + if (args[1]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Get( + rocksdb::ReadOptions(), key, &value); + } else if (db_wrapper->HasFamilyNamed(cf, db_wrapper)) { + db_wrapper->status_ = db_wrapper->db_->Get( + rocksdb::ReadOptions(), db_wrapper->columnFamilies_[cf], key, &value); + } else { + return scope.Close(Null()); + } + + Handle v = db_wrapper->status_.ok() ? 
+ String::NewSymbol(value.c_str()) : Null(); + + return scope.Close(v); +} + +Handle DBWrapper::Put(const Arguments& args) { + HandleScope scope; + + if (!(args[0]->IsString() && args[1]->IsString() && + (args[2]->IsUndefined() || args[2]->IsString()))) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string key = *v8::String::Utf8Value(args[0]->ToString()); + std::string value = *v8::String::Utf8Value(args[1]->ToString()); + std::string cf = *v8::String::Utf8Value(args[2]->ToString()); + + if (args[2]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Put( + rocksdb::WriteOptions(), key, value + ); + } else if (db_wrapper->HasFamilyNamed(cf, db_wrapper)) { + db_wrapper->status_ = db_wrapper->db_->Put( + rocksdb::WriteOptions(), + db_wrapper->columnFamilies_[cf], + key, + value + ); + } else { + return scope.Close(Boolean::New(false)); + } + + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::Delete(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string arg0 = *v8::String::Utf8Value(args[0]->ToString()); + std::string arg1 = *v8::String::Utf8Value(args[1]->ToString()); + + if (args[1]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Delete( + rocksdb::WriteOptions(), arg0); + } else { + if (!db_wrapper->HasFamilyNamed(arg1, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + db_wrapper->status_ = db_wrapper->db_->Delete( + rocksdb::WriteOptions(), db_wrapper->columnFamilies_[arg1], arg0); + } + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::Dump(const Arguments& args) { + HandleScope scope; + std::unique_ptr iterator; + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string arg0 = *v8::String::Utf8Value(args[0]->ToString()); + + if (args[0]->IsUndefined()) { + iterator.reset(db_wrapper->db_->NewIterator(rocksdb::ReadOptions())); + } else { + if (!db_wrapper->HasFamilyNamed(arg0, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + + iterator.reset(db_wrapper->db_->NewIterator( + rocksdb::ReadOptions(), db_wrapper->columnFamilies_[arg0])); + } + + iterator->SeekToFirst(); + + while (iterator->Valid()) { + std::cout << "\""; + printWithBackSlashes(iterator->key().ToString()); + std::cout << "\" => \""; + printWithBackSlashes(iterator->value().ToString()); + std::cout << "\"\n"; + iterator->Next(); + } + + return scope.Close(Boolean::New(true)); +} + +Handle DBWrapper::CreateColumnFamily(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string cf_name = *v8::String::Utf8Value(args[0]->ToString()); + + if (db_wrapper->HasFamilyNamed(cf_name, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + + rocksdb::ColumnFamilyHandle* cf; + db_wrapper->status_ = db_wrapper->db_->CreateColumnFamily( + rocksdb::ColumnFamilyOptions(), cf_name, &cf); + + if (!db_wrapper->status_.ok()) { + return scope.Close(Boolean::New(false)); + } + + db_wrapper->columnFamilies_[cf_name] = cf; + + return scope.Close(Boolean::New(true)); +} + +bool DBWrapper::AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array) { + Handle put_pair; + for (uint i = 0; i < array->Length(); i++) { + if (del) { + if 
(!array->Get(i)->IsString()) { + return false; + } + + batch.Delete(*v8::String::Utf8Value(array->Get(i)->ToString())); + continue; + } + + if (!array->Get(i)->IsArray()) { + return false; + } + + put_pair = Handle::Cast(array->Get(i)); + + if (!put_pair->Get(0)->IsString() || !put_pair->Get(1)->IsString()) { + return false; + } + + batch.Put( + *v8::String::Utf8Value(put_pair->Get(0)->ToString()), + *v8::String::Utf8Value(put_pair->Get(1)->ToString())); + } + + return true; +} + +bool DBWrapper::AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array, DBWrapper* db_wrapper, + std::string cf) { + Handle put_pair; + for (uint i = 0; i < array->Length(); i++) { + if (del) { + if (!array->Get(i)->IsString()) { + return false; + } + + batch.Delete( + db_wrapper->columnFamilies_[cf], + *v8::String::Utf8Value(array->Get(i)->ToString())); + continue; + } + + if (!array->Get(i)->IsArray()) { + return false; + } + + put_pair = Handle::Cast(array->Get(i)); + + if (!put_pair->Get(0)->IsString() || !put_pair->Get(1)->IsString()) { + return false; + } + + batch.Put( + db_wrapper->columnFamilies_[cf], + *v8::String::Utf8Value(put_pair->Get(0)->ToString()), + *v8::String::Utf8Value(put_pair->Get(1)->ToString())); + } + + return true; +} + +Handle DBWrapper::WriteBatch(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsArray()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + Handle sub_batches = Handle::Cast(args[0]); + Local sub_batch; + rocksdb::WriteBatch batch; + bool well_formed; + + for (uint i = 0; i < sub_batches->Length(); i++) { + if (!sub_batches->Get(i)->IsObject()) { + return scope.Close(Boolean::New(false)); + } + sub_batch = sub_batches->Get(i)->ToObject(); + + if (sub_batch->Has(String::NewSymbol("column_family"))) { + if (!has_key_for_array(sub_batch, "put") && + !has_key_for_array(sub_batch, "delete")) { + return scope.Close(Boolean::New(false)); + } + + well_formed = db_wrapper->AddToBatch( + batch, false, + Handle::Cast(sub_batch->Get(String::NewSymbol("put"))), + db_wrapper, *v8::String::Utf8Value(sub_batch->Get( + String::NewSymbol("column_family")))); + + well_formed = db_wrapper->AddToBatch( + batch, true, + Handle::Cast(sub_batch->Get(String::NewSymbol("delete"))), + db_wrapper, *v8::String::Utf8Value(sub_batch->Get( + String::NewSymbol("column_family")))); + } else { + well_formed = db_wrapper->AddToBatch( + batch, false, + Handle::Cast(sub_batch->Get(String::NewSymbol("put")))); + well_formed = db_wrapper->AddToBatch( + batch, true, + Handle::Cast(sub_batch->Get(String::NewSymbol("delete")))); + + if (!well_formed) { + return scope.Close(Boolean::New(false)); + } + } + } + + db_wrapper->status_ = db_wrapper->db_->Write(rocksdb::WriteOptions(), &batch); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactRangeDefault(const Arguments& args) { + HandleScope scope; + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + db_wrapper->status_ = db_wrapper->db_->CompactRange(&end, &begin); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactColumnFamily(const Arguments& args) { + HandleScope scope; + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = 
*v8::String::Utf8Value(args[1]->ToString()); + std::string cf = *v8::String::Utf8Value(args[2]->ToString()); + db_wrapper->status_ = db_wrapper->db_->CompactRange( + db_wrapper->columnFamilies_[cf], &begin, &end); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactOptions(const Arguments& args) { + HandleScope scope; + + if (!args[2]->IsObject()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + Local options = args[2]->ToObject(); + int target_level = -1, target_path_id = 0; + + if (options->Has(String::NewSymbol("target_level")) && + options->Get(String::NewSymbol("target_level"))->IsInt32()) { + target_level = (int)(options->Get( + String::NewSymbol("target_level"))->ToInt32()->Value()); + + if (options->Has(String::NewSymbol("target_path_id")) || + options->Get(String::NewSymbol("target_path_id"))->IsInt32()) { + target_path_id = (int)(options->Get( + String::NewSymbol("target_path_id"))->ToInt32()->Value()); + } + } + + db_wrapper->status_ = db_wrapper->db_->CompactRange( + &begin, &end, true, target_level, target_path_id + ); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactAll(const Arguments& args) { + HandleScope scope; + + if (!args[2]->IsObject() || !args[3]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + Local options = args[2]->ToObject(); + std::string cf = *v8::String::Utf8Value(args[3]->ToString()); + + int target_level = -1, target_path_id = 0; + + if (options->Has(String::NewSymbol("target_level")) && + options->Get(String::NewSymbol("target_level"))->IsInt32()) { + target_level = (int)(options->Get( + String::NewSymbol("target_level"))->ToInt32()->Value()); + + if (options->Has(String::NewSymbol("target_path_id")) || + options->Get(String::NewSymbol("target_path_id"))->IsInt32()) { + target_path_id = (int)(options->Get( + String::NewSymbol("target_path_id"))->ToInt32()->Value()); + } + } + + db_wrapper->status_ = db_wrapper->db_->CompactRange( + db_wrapper->columnFamilies_[cf], &begin, &end, true, target_level, + target_path_id); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactRange(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString() || !args[1]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + switch(args.Length()) { + case 2: + return CompactRangeDefault(args); + case 3: + return args[2]->IsString() ? CompactColumnFamily(args) : + CompactOptions(args); + default: + return CompactAll(args); + } +} + +Handle DBWrapper::Close(const Arguments& args) { + HandleScope scope; + + delete ObjectWrap::Unwrap(args.This()); + + return scope.Close(Null()); +} diff --git a/tools/rdb/db_wrapper.h b/tools/rdb/db_wrapper.h new file mode 100644 index 000000000..9d1c8f886 --- /dev/null +++ b/tools/rdb/db_wrapper.h @@ -0,0 +1,58 @@ +#ifndef DBWRAPPER_H +#define DBWRAPPER_H + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/options.h" + +using namespace v8; + +// Used to encapsulate a particular instance of an opened database. 
+// +// This object should not be used directly in C++; it exists solely to provide +// a mapping from a JavaScript object to a C++ code that can use the RocksDB +// API. +class DBWrapper : public node::ObjectWrap { + public: + static void Init(Handle exports); + + private: + explicit DBWrapper(); + ~DBWrapper(); + + // Helper methods + static bool HasFamilyNamed(std::string& name, DBWrapper* db); + static bool AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array); + static bool AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array, DBWrapper* db_wrapper, std::string cf); + static Handle CompactRangeDefault(const v8::Arguments& args); + static Handle CompactColumnFamily(const Arguments& args); + static Handle CompactOptions(const Arguments& args); + static Handle CompactAll(const Arguments& args); + + // C++ mappings of API methods + static Persistent constructor; + static Handle Open(const Arguments& args); + static Handle New(const Arguments& args); + static Handle Get(const Arguments& args); + static Handle Put(const Arguments& args); + static Handle Delete(const Arguments& args); + static Handle Dump(const Arguments& args); + static Handle WriteBatch(const Arguments& args); + static Handle CreateColumnFamily(const Arguments& args); + static Handle CompactRange(const Arguments& args); + static Handle Close(const Arguments& args); + + // Internal fields + rocksdb::Options options_; + rocksdb::Status status_; + rocksdb::DB* db_; + std::unordered_map + columnFamilies_; +}; + +#endif diff --git a/tools/rdb/rdb b/tools/rdb/rdb new file mode 100755 index 000000000..82cd17fb7 --- /dev/null +++ b/tools/rdb/rdb @@ -0,0 +1,3 @@ +#!/bin/bash + +node -e "RDB = require('./build/Release/rdb').DBWrapper; console.log('Loaded rocksdb in variable RDB'); repl = require('repl').start('> ');" diff --git a/tools/rdb/rdb.cc b/tools/rdb/rdb.cc new file mode 100644 index 000000000..8710e4623 --- /dev/null +++ b/tools/rdb/rdb.cc @@ -0,0 +1,15 @@ +#ifndef BUILDING_NODE_EXTENSION +#define BUILDING_NODE_EXTENSION +#endif + +#include +#include +#include "db_wrapper.h" + +using namespace v8; + +void InitAll(Handle exports) { + DBWrapper::Init(exports); +} + +NODE_MODULE(rdb, InitAll) diff --git a/tools/rdb/unit_test.js b/tools/rdb/unit_test.js new file mode 100644 index 000000000..d74ee8ce5 --- /dev/null +++ b/tools/rdb/unit_test.js @@ -0,0 +1,124 @@ +assert = require('assert') +RDB = require('./build/Release/rdb').DBWrapper +exec = require('child_process').exec +util = require('util') + +DB_NAME = '/tmp/rocksdbtest-' + process.getuid() + +a = RDB() +assert.equal(a.open(DB_NAME, ['b']), false) + +exec( + util.format( + "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \ + a = RDB('%s'); a.createColumnFamily('b')\"", + DB_NAME + ).exitCode, null +) + + +exec( + util.format( + "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \ + a = RDB('%s', ['b'])\"", + DB_NAME + ).exitCode, null +) + +exec('rm -rf ' + DB_NAME) + +a = RDB() +assert.equal(a.open(DB_NAME, ['a']), false) +assert(a.open(DB_NAME), true) +assert(a.createColumnFamily('temp')) + +b = RDB() +assert.equal(b.open(DB_NAME), false) + +exec('rm -rf ' + DB_NAME) + +DB_NAME += 'b' + +a = RDB() +assert(a.open(DB_NAME)) +assert.equal(a.constructor.name, 'DBWrapper') +assert.equal(a.createColumnFamily(), false) +assert.equal(a.createColumnFamily(1), false) +assert.equal(a.createColumnFamily(['']), false) +assert(a.createColumnFamily('b')) +assert.equal(a.createColumnFamily('b'), false) + +// Get and Put +assert.equal(a.get(1), 
null) +assert.equal(a.get(['a']), null) +assert.equal(a.get('a', 1), null) +assert.equal(a.get(1, 'a'), null) +assert.equal(a.get(1, 1), null) + +assert.equal(a.put(1), false) +assert.equal(a.put(['a']), false) +assert.equal(a.put('a', 1), false) +assert.equal(a.put(1, 'a'), false) +assert.equal(a.put(1, 1), false) +assert.equal(a.put('a', 'a', 1), false) +assert.equal(a.put('a', 1, 'a'), false) +assert.equal(a.put(1, 'a', 'a'), false) +assert.equal(a.put('a', 1, 1), false) +assert.equal(a.put(1, 'a', 1), false) +assert.equal(a.put(1, 1, 'a'), false) +assert.equal(a.put(1, 1, 1), false) + + +assert.equal(a.get(), null) +assert.equal(a.get('a'), null) +assert.equal(a.get('a', 'c'), null) +assert.equal(a.put(), false) +assert.equal(a.put('a'), false) +assert.equal(a.get('a', 'b', 'c'), null) + +assert(a.put('a', 'axe')) +assert(a.put('a', 'first')) +assert.equal(a.get('a'), 'first') +assert.equal(a.get('a', 'b'), null) +assert.equal(a.get('a', 'c'), null) + +assert(a.put('a', 'apple', 'b')) +assert.equal(a.get('a', 'b'), 'apple') +assert.equal(a.get('a'), 'first') +assert(a.put('b', 'butter', 'b'), 'butter') +assert(a.put('b', 'banana', 'b')) +assert.equal(a.get('b', 'b'), 'banana') +assert.equal(a.get('b'), null) +assert.equal(a.get('b', 'c'), null) + +// Delete +assert.equal(a.delete(1), false) +assert.equal(a.delete('a', 1), false) +assert.equal(a.delete(1, 'a'), false) +assert.equal(a.delete(1, 1), false) + +assert.equal(a.delete('b'), true) +assert(a.delete('a')) +assert.equal(a.get('a'), null) +assert.equal(a.get('a', 'b'), 'apple') +assert.equal(a.delete('c', 'c'), false) +assert.equal(a.delete('c', 'b'), true) +assert(a.delete('b', 'b')) +assert.equal(a.get('b', 'b'), null) + +// Dump +console.log("MARKER 1") +assert(a.dump()) +console.log("Should be no output between 'MARKER 1' and here\n") +console.log('Next line should be "a" => "apple"') +assert(a.dump('b')) + +console.log("\nMARKER 2") +assert.equal(a.dump('c'), false) +console.log("Should be no output between 'MARKER 2' and here\n") + +// WriteBatch + + +// Clean up test database +exec('rm -rf ' + DB_NAME) From 4b63fcbff32dfa8985c5308e6f1d1f6b3579af87 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 20 Nov 2014 21:13:18 -0800 Subject: [PATCH 543/829] Add enable_thread_tracking to DBOptions Summary: Add enable_thread_tracking to DBOptions to allow tracking thread status related to the DB. Default is off. 
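For illustration, the option is consumed from application code roughly as
follows (a sketch only; the database path is arbitrary, and it assumes the free
GetThreadList() function shown in the diff below):

    rocksdb::Options options;
    options.enable_thread_tracking = true;  // tracking is off by default
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/ttdb", &db);
    // With tracking enabled, the status of threads working on this DB
    // becomes visible:
    std::vector<rocksdb::ThreadStatus> thread_list;
    s = rocksdb::GetThreadList(&thread_list);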
Test Plan: export ROCKSDB_TESTS=ThreadList ./db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29289 --- db/db_impl.cc | 14 ++++++++++---- db/db_test.cc | 18 +++++++++++++++--- include/rocksdb/options.h | 6 ++++++ util/options.cc | 8 ++++++-- util/thread_status_impl.h | 3 ++- util/thread_status_impl_debug.cc | 17 ++++++++++++----- 6 files changed, 51 insertions(+), 15 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c004ddfbb..9688af26b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3725,17 +3725,23 @@ Status DestroyDB(const std::string& dbname, const Options& options) { #if ROCKSDB_USING_THREAD_STATUS void DBImpl::NewThreadStatusCfInfo( ColumnFamilyData* cfd) const { - ThreadStatusImpl::NewColumnFamilyInfo( - this, GetName(), cfd, cfd->GetName()); + if (db_options_.enable_thread_tracking) { + ThreadStatusImpl::NewColumnFamilyInfo( + this, GetName(), cfd, cfd->GetName()); + } } void DBImpl::EraseThreadStatusCfInfo( ColumnFamilyData* cfd) const { - ThreadStatusImpl::EraseColumnFamilyInfo(cfd); + if (db_options_.enable_thread_tracking) { + ThreadStatusImpl::EraseColumnFamilyInfo(cfd); + } } void DBImpl::EraseThreadStatusDbInfo() const { - ThreadStatusImpl::EraseDatabaseInfo(this); + if (db_options_.enable_thread_tracking) { + ThreadStatusImpl::EraseDatabaseInfo(this); + } } Status GetThreadList(std::vector* thread_list) { diff --git a/db/db_test.cc b/db/db_test.cc index b8c04495b..76a0b7ff6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8986,6 +8986,8 @@ TEST(DBTest, DynamicMemtableOptions) { TEST(DBTest, GetThreadList) { Options options; options.env = env_; + options.enable_thread_tracking = true; + TryReopen(options); std::vector thread_list; Status s = GetThreadList(&thread_list); @@ -9025,14 +9027,24 @@ TEST(DBTest, GetThreadList) { if (i == 0) { // repeat the test with multiple column families CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); } } db_->DropColumnFamily(handles_[2]); handles_.erase(handles_.begin() + 2); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); Close(); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_); + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); +} + +TEST(DBTest, DisableThreadList) { + Options options; + options.env = env_; + options.enable_thread_tracking = false; + TryReopen(options); + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + // Verify non of the column family info exists + ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, false); } #endif // ROCKSDB_USING_THREAD_STATUS diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index dd05aa9de..4e20e618f 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -884,6 +884,12 @@ struct DBOptions { // When rate limiter is enabled, it automatically enables bytes_per_sync // to 1MB. uint64_t bytes_per_sync; + + // If true, then the status of the threads involved in this DB will + // be tracked and available via GetThreadList() API. 
+ // + // Default: false + bool enable_thread_tracking; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/util/options.cc b/util/options.cc index b97f622d2..c6b883779 100644 --- a/util/options.cc +++ b/util/options.cc @@ -232,7 +232,8 @@ DBOptions::DBOptions() advise_random_on_open(true), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), - bytes_per_sync(0) {} + bytes_per_sync(0), + enable_thread_tracking(false) {} DBOptions::DBOptions(const Options& options) : create_if_missing(options.create_if_missing), @@ -274,7 +275,8 @@ DBOptions::DBOptions(const Options& options) advise_random_on_open(options.advise_random_on_open), access_hint_on_compaction_start(options.access_hint_on_compaction_start), use_adaptive_mutex(options.use_adaptive_mutex), - bytes_per_sync(options.bytes_per_sync) {} + bytes_per_sync(options.bytes_per_sync), + enable_thread_tracking(options.enable_thread_tracking) {} static const char* const access_hints[] = { "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" @@ -342,6 +344,8 @@ void DBOptions::Dump(Logger* log) const { rate_limiter.get()); Log(log, " Options.bytes_per_sync: %" PRIu64, bytes_per_sync); + Log(log, " enable_thread_tracking: %d", + enable_thread_tracking); } // DBOptions::Dump void ColumnFamilyOptions::Dump(Logger* log) const { diff --git a/util/thread_status_impl.h b/util/thread_status_impl.h index 3d4987a34..a76c02b09 100644 --- a/util/thread_status_impl.h +++ b/util/thread_status_impl.h @@ -127,7 +127,8 @@ class ThreadStatusImpl { // Verifies whether the input ColumnFamilyHandles matches // the information stored in the current cf_info_map. static void TEST_VerifyColumnFamilyInfoMap( - const std::vector& handles); + const std::vector& handles, + bool check_exist); protected: // The thread-local variable for storing thread status. 
diff --git a/util/thread_status_impl_debug.cc b/util/thread_status_impl_debug.cc index 5717e40c3..5489499d3 100644 --- a/util/thread_status_impl_debug.cc +++ b/util/thread_status_impl_debug.cc @@ -11,15 +11,22 @@ namespace rocksdb { void ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap( - const std::vector& handles) { + const std::vector& handles, + bool check_exist) { std::unique_lock lock(thread_list_mutex_); - assert(cf_info_map_.size() == handles.size()); + if (check_exist) { + assert(cf_info_map_.size() == handles.size()); + } for (auto* handle : handles) { auto* cfd = reinterpret_cast(handle)->cfd(); auto iter __attribute__((unused)) = cf_info_map_.find(cfd); - assert(iter != cf_info_map_.end()); - assert(iter->second); - assert(iter->second->cf_name == cfd->GetName()); + if (check_exist) { + assert(iter != cf_info_map_.end()); + assert(iter->second); + assert(iter->second->cf_name == cfd->GetName()); + } else { + assert(iter == cf_info_map_.end()); + } } } } // namespace rocksdb From d84069995c74a25f7e29b6d36de793ee603167eb Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 21 Nov 2014 09:41:51 -0500 Subject: [PATCH 544/829] Fix mac compile --- include/rocksdb/utilities/leveldb_options.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/utilities/leveldb_options.h b/include/rocksdb/utilities/leveldb_options.h index 09033b7e7..8e2c3a1d5 100644 --- a/include/rocksdb/utilities/leveldb_options.h +++ b/include/rocksdb/utilities/leveldb_options.h @@ -18,7 +18,7 @@ class Comparator; class Env; class FilterPolicy; class Logger; -class Options; +struct Options; class Snapshot; enum CompressionType : char; From cd278584c9fe47426c200742205e1462b88ee36d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 21 Nov 2014 11:05:28 -0500 Subject: [PATCH 545/829] Clean up StringSplit Summary: stringSplit is not how we name our functions. 
Also, we had two StringSplit's in the codebase Test Plan: make check Reviewers: yhchiang, dhruba Reviewed By: dhruba Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29361 --- db/db_bench.cc | 4 ++-- util/ldb_cmd.cc | 2 +- util/string_util.cc | 2 +- util/string_util.h | 2 +- utilities/geodb/geodb_impl.cc | 17 +++-------------- utilities/geodb/geodb_impl.h | 5 ----- 6 files changed, 8 insertions(+), 24 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 54ede5161..2b5d6c9c3 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -3058,8 +3058,8 @@ int main(int argc, char** argv) { dbstats = rocksdb::CreateDBStatistics(); } - std::vector fanout = - rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ','); + std::vector fanout = rocksdb::StringSplit( + FLAGS_max_bytes_for_level_multiplier_additional, ','); for (unsigned int j= 0; j < fanout.size(); j++) { FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( std::stoi(fanout[j])); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index d35b9412c..9a5ec5185 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -92,7 +92,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( for (const auto& arg : args) { if (arg[0] == '-' && arg[1] == '-'){ - vector splits = stringSplit(arg, '='); + vector splits = StringSplit(arg, '='); if (splits.size() == 2) { string optionKey = splits[0].substr(OPTION_PREFIX.size()); option_map[optionKey] = splits[1]; diff --git a/util/string_util.cc b/util/string_util.cc index 97b7f9de9..4e0bc4668 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -10,7 +10,7 @@ namespace rocksdb { -std::vector stringSplit(std::string arg, char delim) { +std::vector StringSplit(const std::string& arg, char delim) { std::vector splits; std::stringstream ss(arg); std::string item; diff --git a/util/string_util.h b/util/string_util.h index 676f4aae8..b9f0a1d28 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -10,6 +10,6 @@ #pragma once namespace rocksdb { -extern std::vector stringSplit(std::string arg, char delim); +extern std::vector StringSplit(const std::string& arg, char delim); } // namespace rocksdb diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index 56cfa954e..194e51232 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -17,6 +17,7 @@ #include #include "db/filename.h" #include "util/coding.h" +#include "util/string_util.h" // // There are two types of keys. 
The first type of key-values @@ -116,9 +117,8 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { } // split the key into p + quadkey + id + lat + lon - std::vector parts; Slice key = iter->key(); - StringSplit(&parts, key.ToString(), ':'); + std::vector parts = StringSplit(key.ToString(), ':'); assert(parts.size() == 5); assert(parts[0] == "p"); assert(parts[1] == quadkey); @@ -180,9 +180,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos, number_of_values > 0 && iter->Valid() && iter->status().ok(); iter->Next()) { // split the key into p + quadkey + id + lat + lon - std::vector parts; Slice key = iter->key(); - StringSplit(&parts, key.ToString(), ':'); + std::vector parts = StringSplit(key.ToString(), ':'); assert(parts.size() == 5); assert(parts[0] == "p"); std::string* quadkey = &parts[1]; @@ -243,16 +242,6 @@ std::string GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) { return key; } -void GeoDBImpl::StringSplit(std::vector* tokens, - const std::string &text, char sep) { - std::size_t start = 0, end = 0; - while ((end = text.find(sep, start)) != std::string::npos) { - tokens->push_back(text.substr(start, end - start)); - start = end + 1; - } - tokens->push_back(text.substr(start)); -} - // convert degrees to radians double GeoDBImpl::radians(double x) { return (x * PI) / 180; diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h index c7e410458..94b2d6ceb 100644 --- a/utilities/geodb/geodb_impl.h +++ b/utilities/geodb/geodb_impl.h @@ -169,11 +169,6 @@ class GeoDBImpl : public GeoDB { double radius, std::vector* quadKeys); - // splits a string into its components - static void StringSplit(std::vector* tokens, - const std::string &text, - char sep); - // // Create keys for accessing rocksdb table(s) // From 7ec71f101c30381af1d1dda6711e28debaa11809 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 21 Nov 2014 11:05:48 -0500 Subject: [PATCH 546/829] Provide default implementation of LinkFile, don't break the build Summary: By providing default implementation of LinkFile, we don't break other implementations of Env. Test Plan: none Reviewers: rven, dhruba Reviewed By: dhruba Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29355 --- include/rocksdb/env.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 291676002..aded546ca 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -179,8 +179,9 @@ class Env { const std::string& target) = 0; // Hard Link file src to target. - virtual Status LinkFile(const std::string& src, - const std::string& target) = 0; + virtual Status LinkFile(const std::string& src, const std::string& target) { + return Status::NotSupported("LinkFile is not supported for this Env"); + } // Lock the specified file. Used to prevent concurrent access to // the same db by multiple processes. On failure, stores nullptr in From aa31fc506827b11b585d150b63b42f103a74f07a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 21 Nov 2014 10:22:05 -0800 Subject: [PATCH 547/829] Improve listener_test by ensuring flushes are completed before assert. Summary: Improve listener_test by ensuring flushes are completed before assert. 
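A short sketch of the behavior being tested here, using the public FlushOptions API; db_ and handles_ mirror the test fixture and are otherwise assumptions:

  rocksdb::FlushOptions opt;
  opt.wait = true;                       // block until the memtable flush finishes
  rocksdb::Status s = db_->Flush(opt);   // default column family
  // or: db_->Flush(opt, handles_[cf]);  // a specific column family
  // Only after a waited flush is it safe to assert on listener callbacks.
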
Test Plan: listener_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29319 --- db/listener_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/db/listener_test.cc b/db/listener_test.cc index 35e00c94a..595e01e6e 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -135,10 +135,12 @@ class EventListenerTest { } Status Flush(int cf = 0) { + FlushOptions opt = FlushOptions(); + opt.wait = true; if (cf == 0) { - return db_->Flush(FlushOptions()); + return db_->Flush(opt); } else { - return db_->Flush(FlushOptions(), handles_[cf]); + return db_->Flush(opt, handles_[cf]); } } From 9222a2d02495fb3b9b2868b2e31938e990ab7999 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 21 Nov 2014 11:00:42 -0800 Subject: [PATCH 548/829] Fixed iOS build caused by GetThreadList feature. Summary: Fixed iOS build caused by GetThreadList feature. --- util/thread_status_impl.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/thread_status_impl.h b/util/thread_status_impl.h index a76c02b09..a678e0988 100644 --- a/util/thread_status_impl.h +++ b/util/thread_status_impl.h @@ -131,10 +131,10 @@ class ThreadStatusImpl { bool check_exist); protected: - // The thread-local variable for storing thread status. - static __thread ThreadStatusData* thread_status_data_; #if ROCKSDB_USING_THREAD_STATUS + // The thread-local variable for storing thread status. + static __thread ThreadStatusData* thread_status_data_; // Obtain the pointer to the thread status data. It also performs // initialization when necessary. @@ -157,6 +157,8 @@ class ThreadStatusImpl { // associated to the same db_key faster. static std::unordered_map< const void*, std::unordered_set> db_key_map_; +#else + static ThreadStatusData* thread_status_data_; #endif // ROCKSDB_USING_THREAD_STATUS }; From 3a40c427b9dc84275c3ad7fff7e8f9f22b60ea6b Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 21 Nov 2014 11:20:42 -0800 Subject: [PATCH 549/829] Fix db_bench on CLANG mode Summary: "build all" breaks in Clang mode with db_bench. Fix it. Test Plan: USE_CLANG=1 make all Reviewers: ljin, rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D29379 --- db/db_bench.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 2b5d6c9c3..17889ebd8 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1193,7 +1193,7 @@ class Benchmark { while (start < s.size() && isspace(s[start])) { start++; } - unsigned int limit = s.size(); + unsigned int limit = static_cast(s.size()); while (limit > start && isspace(s[limit-1])) { limit--; } From 14101801677f759e8b54980c04e02b7032b93382 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 21 Nov 2014 14:11:22 -0800 Subject: [PATCH 550/829] Make arena use hugepage if possible Summary: arena doesn't use huge page by default. This change will make it happen if possible. A new paramerter is added for Arena(). If it's set, Arena will use huge page always. If huge page allocation fails, Arena allocation will fallback to malloc(). Test Plan: Change util/arena_test to support huge page allocation. Run below tests: 1. normal regression test: make check 2. 
Check if huge page allocation works echo 50 > /proc/sys/vm/nr_hugepages make check Reviewers: sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D28647 --- util/arena.cc | 55 +++++++++++++++++++++++++++++++-------- util/arena.h | 7 ++++- util/arena_test.cc | 65 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 104 insertions(+), 23 deletions(-) diff --git a/util/arena.cc b/util/arena.cc index 6efe687c6..3f00f0845 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -32,13 +32,20 @@ size_t OptimizeBlockSize(size_t block_size) { return block_size; } -Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { +Arena::Arena(size_t block_size, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)) { assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && kBlockSize % kAlignUnit == 0); alloc_bytes_remaining_ = sizeof(inline_block_); blocks_memory_ += alloc_bytes_remaining_; aligned_alloc_ptr_ = inline_block_; unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } +#endif } Arena::~Arena() { @@ -62,20 +69,49 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) { } // We waste the remaining space in the current block. - auto block_head = AllocateNewBlock(kBlockSize); - alloc_bytes_remaining_ = kBlockSize - bytes; + size_t size; + char* block_head = nullptr; + if (hugetlb_size_) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; if (aligned) { aligned_alloc_ptr_ = block_head + bytes; - unaligned_alloc_ptr_ = block_head + kBlockSize; + unaligned_alloc_ptr_ = block_head + size; return block_head; } else { aligned_alloc_ptr_ = block_head; - unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + unaligned_alloc_ptr_ = block_head + size - bytes; return unaligned_alloc_ptr_; } } +char* Arena::AllocateFromHugePage(size_t bytes) { +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + + void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE), + (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0); + + if (addr == MAP_FAILED) { + return nullptr; + } + huge_blocks_.push_back(MmapInfo(addr, bytes)); + blocks_memory_ += bytes; + return reinterpret_cast(addr); +#else + return nullptr; +#endif +} + char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, Logger* logger) { assert((kAlignUnit & (kAlignUnit - 1)) == @@ -88,17 +124,14 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; assert(reserved_size >= bytes); - void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE), - (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0); - if (addr == MAP_FAILED) { + char* addr = AllocateFromHugePage(reserved_size); + if (addr == nullptr) { Warn(logger, "AllocateAligned fail to allocate huge TLB pages: %s", strerror(errno)); // fail back to malloc } else { - blocks_memory_ += reserved_size; - huge_blocks_.push_back(MmapInfo(addr, reserved_size)); - return reinterpret_cast(addr); + return addr; } } #endif diff --git a/util/arena.h b/util/arena.h index dfd8e2b24..4764c1568 100644 --- a/util/arena.h +++ b/util/arena.h @@ -35,7 +35,10 @@ class Arena { 
static const size_t kMinBlockSize; static const size_t kMaxBlockSize; - explicit Arena(size_t block_size = kMinBlockSize); + // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the + // supported hugepage size of the system), block allocation will try huge + // page TLB first. If allocation fails, will fall back to normal case. + explicit Arena(size_t block_size = kMinBlockSize, size_t huge_page_size = 0); ~Arena(); char* Allocate(size_t bytes); @@ -100,6 +103,8 @@ class Arena { // How many bytes left in currently active block? size_t alloc_bytes_remaining_ = 0; + size_t hugetlb_size_ = 0; + char* AllocateFromHugePage(size_t bytes); char* AllocateFallback(size_t bytes, bool aligned); char* AllocateNewBlock(size_t block_bytes); diff --git a/util/arena_test.cc b/util/arena_test.cc index 7b6cfd0af..7f55a7e53 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -13,17 +13,21 @@ namespace rocksdb { +namespace { +const size_t kHugePageSize = 2 * 1024 * 1024; +} // namespace class ArenaTest {}; TEST(ArenaTest, Empty) { Arena arena0; } -TEST(ArenaTest, MemoryAllocatedBytes) { +namespace { +void MemoryAllocatedBytesTest(size_t huge_page_size) { const int N = 17; size_t req_sz; // requested size size_t bsz = 8192; // block size size_t expected_memory_allocated; - Arena arena(bsz); + Arena arena(bsz, huge_page_size); // requested size > quarter of a block: // allocate requested size separately @@ -44,8 +48,15 @@ TEST(ArenaTest, MemoryAllocatedBytes) { for (int i = 0; i < N; i++) { arena.Allocate(req_sz); } - expected_memory_allocated += bsz; - ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + if (huge_page_size) { + ASSERT_TRUE(arena.MemoryAllocatedBytes() == + expected_memory_allocated + bsz || + arena.MemoryAllocatedBytes() == + expected_memory_allocated + huge_page_size); + } else { + expected_memory_allocated += bsz; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + } // requested size > quarter of a block: // allocate requested size separately @@ -54,16 +65,23 @@ TEST(ArenaTest, MemoryAllocatedBytes) { arena.Allocate(req_sz); } expected_memory_allocated += req_sz * N; - ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + if (huge_page_size) { + ASSERT_TRUE(arena.MemoryAllocatedBytes() == + expected_memory_allocated + bsz || + arena.MemoryAllocatedBytes() == + expected_memory_allocated + huge_page_size); + } else { + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + } } // Make sure we didn't count the allocate but not used memory space in // Arena::ApproximateMemoryUsage() -TEST(ArenaTest, ApproximateMemoryUsageTest) { +static void ApproximateMemoryUsageTest(size_t huge_page_size) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; const size_t kZero = 0; - Arena arena(kBlockSize); + Arena arena(kBlockSize, huge_page_size); ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); // allocate inline bytes @@ -78,7 +96,12 @@ TEST(ArenaTest, ApproximateMemoryUsageTest) { // first allocation arena.AllocateAligned(kEntrySize); auto mem_usage = arena.MemoryAllocatedBytes(); - ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize); + if (huge_page_size) { + ASSERT_TRUE(mem_usage == kBlockSize + Arena::kInlineSize || + mem_usage == huge_page_size + Arena::kInlineSize); + } else { + ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize); + } auto usage = arena.ApproximateMemoryUsage(); ASSERT_LT(usage, mem_usage); for (size_t i = 1; i < num_blocks; ++i) { @@ -87,12 +110,17 @@ 
TEST(ArenaTest, ApproximateMemoryUsageTest) { ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); usage = arena.ApproximateMemoryUsage(); } - ASSERT_GT(usage, mem_usage); + if (huge_page_size) { + ASSERT_TRUE(usage > mem_usage || + usage + huge_page_size - kBlockSize == mem_usage); + } else { + ASSERT_GT(usage, mem_usage); + } } -TEST(ArenaTest, Simple) { +static void SimpleTest(size_t huge_page_size) { std::vector> allocated; - Arena arena; + Arena arena(Arena::kMinBlockSize, huge_page_size); const int N = 100000; size_t bytes = 0; Random rnd(301); @@ -136,7 +164,22 @@ TEST(ArenaTest, Simple) { } } } +} // namespace +TEST(ArenaTest, MemoryAllocatedBytes) { + MemoryAllocatedBytesTest(0); + MemoryAllocatedBytesTest(kHugePageSize); +} + +TEST(ArenaTest, ApproximateMemoryUsage) { + ApproximateMemoryUsageTest(0); + ApproximateMemoryUsageTest(kHugePageSize); +} + +TEST(ArenaTest, Simple) { + SimpleTest(0); + SimpleTest(kHugePageSize); +} } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From c4b65f70fbf6650b6f46c2a14a95ec27f092406e Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 21 Nov 2014 23:49:31 +0100 Subject: [PATCH 551/829] [RocksJava] Makefile correction Prevent non exist error message while executing clean twice. --- java/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/Makefile b/java/Makefile index 0a4e2ba16..7a0edb3cf 100644 --- a/java/Makefile +++ b/java/Makefile @@ -91,8 +91,8 @@ clean: rm -rf javadoc/* rm -rf test-libs/ rm -rf target - rm librocksdbjni* - rm rocksdbjni* + rm -f librocksdbjni* + rm -f rocksdbjni* javadocs: From 569853ed10dda98557360f52544bdf2873488f59 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Sat, 22 Nov 2014 00:04:41 -0800 Subject: [PATCH 552/829] Fix leak when create_missing_column_families=true on ThreadStatus Summary: An entry of ConstantColumnFamilyInfo is created when: 1. DB::Open 2. CreateColumnFamily. However, there are cases that DB::Open could also call CreateColumnFamily when create_missing_column_families=true. As a result, it will create duplicate ConstantColumnFamilyInfo and one of them would be leaked. Test Plan: ./deletefile_test Reviewers: igor, sdong, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29307 --- db/db_impl.cc | 16 ++-------------- db/db_test.cc | 1 + db/deletefile_test.cc | 1 + util/thread_status_impl.cc | 13 +++++++------ util/thread_status_impl.h | 2 +- 5 files changed, 12 insertions(+), 21 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 9688af26b..4b9b96ec7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2493,7 +2493,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } - } + } // MutexLock l(&mutex_) // this is outside the mutex if (s.ok()) { @@ -3545,6 +3545,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (cfd != nullptr) { handles->push_back( new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); } else { if (db_options.create_missing_column_families) { // missing column family, create it @@ -3609,19 +3610,6 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { impl->opened_successfully_ = true; *dbptr = impl; - // TODO(yhchiang): Add NotifyOnDatabaseOpen() here. 
- // Since the column-family handles are only available after DB::Open(), - // typically developers will need to pass the returned ColumnFamilyHandles - // to their EventListeners in order to maintain the mapping between - // column-family-name to ColumnFamilyHandle. However, some database - // events might happen before the user passing those ColumnFamilyHandle to - // their Listeners. To address this, we should have NotifyOnDatabaseOpen() - // here which passes the created ColumnFamilyHandle to the Listeners - // as the first event after DB::Open(). - for (auto* h : *handles) { - impl->NewThreadStatusCfInfo( - reinterpret_cast(h)->cfd()); - } } else { for (auto* h : *handles) { delete h; diff --git a/db/db_test.cc b/db/db_test.cc index 76a0b7ff6..6f8fb00c9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9031,6 +9031,7 @@ TEST(DBTest, GetThreadList) { } } db_->DropColumnFamily(handles_[2]); + delete handles_[2]; handles_.erase(handles_.begin() + 2); ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); Close(); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 6a6f8e953..9e3ea70aa 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -34,6 +34,7 @@ class DeleteFileTest { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.enable_thread_tracking = true; options_.max_background_flushes = 0; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index d1cd5ccdc..35dc181e2 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -13,7 +13,7 @@ namespace rocksdb { __thread ThreadStatusData* ThreadStatusImpl::thread_status_data_ = nullptr; std::mutex ThreadStatusImpl::thread_list_mutex_; std::unordered_set ThreadStatusImpl::thread_data_set_; -std::unordered_map +std::unordered_map> ThreadStatusImpl::cf_info_map_; std::unordered_map> ThreadStatusImpl::db_key_map_; @@ -66,7 +66,7 @@ Status ThreadStatusImpl::GetThreadList( auto iter = cf_info_map_.find(cf_key); assert(cf_key == 0 || iter != cf_info_map_.end()); auto* cf_info = iter != cf_info_map_.end() ? 
- iter->second : nullptr; + iter->second.get() : nullptr; auto* event_info = thread_data->event_info.load( std::memory_order_relaxed); const std::string* db_name = nullptr; @@ -106,7 +106,8 @@ void ThreadStatusImpl::NewColumnFamilyInfo( const void* cf_key, const std::string& cf_name) { std::lock_guard lck(thread_list_mutex_); - cf_info_map_[cf_key] = new ConstantColumnFamilyInfo(db_key, db_name, cf_name); + cf_info_map_[cf_key].reset( + new ConstantColumnFamilyInfo(db_key, db_name, cf_name)); db_key_map_[db_key].insert(cf_key); } @@ -115,7 +116,7 @@ void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); - auto* cf_info = cf_pair->second; + auto* cf_info = cf_pair->second.get(); assert(cf_info); // Remove its entry from db_key_map_ by the following steps: @@ -126,7 +127,7 @@ void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { size_t result __attribute__((unused)) = db_pair->second.erase(cf_key); assert(result); - delete cf_info; + cf_pair->second.reset(); result = cf_info_map_.erase(cf_key); assert(result); } @@ -144,7 +145,7 @@ void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { for (auto cf_key : db_pair->second) { auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); - delete cf_pair->second; + cf_pair->second.reset(); result = cf_info_map_.erase(cf_key); assert(result); } diff --git a/util/thread_status_impl.h b/util/thread_status_impl.h index a678e0988..a6e9a7e5b 100644 --- a/util/thread_status_impl.h +++ b/util/thread_status_impl.h @@ -151,7 +151,7 @@ class ThreadStatusImpl { // closing while GetThreadList function already get the pointer to its // CopnstantColumnFamilyInfo. static std::unordered_map< - const void*, ConstantColumnFamilyInfo*> cf_info_map_; + const void*, std::unique_ptr> cf_info_map_; // A db_key to cf_key map that allows erasing elements in cf_info_map // associated to the same db_key faster. From 325722149925b8dfdfcdf96531b2ae114b736b32 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Mon, 24 Nov 2014 10:20:50 -0800 Subject: [PATCH 553/829] Fixes valgrind error in GetSnapshotLink. Free checkpoint now. Summary: Free checkpoint after its directory is removed. Test Plan: Run valgrind with GetSnapshotLink. Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29493 --- db/db_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_test.cc b/db/db_test.cc index 6f8fb00c9..6545f6744 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1660,6 +1660,7 @@ TEST(DBTest, GetSnapshotLink) { delete db_; db_ = nullptr; ASSERT_OK(DestroyDB(dbname_, options)); + delete checkpoint; // Restore DB name dbname_ = test::TmpDir(env_) + "/db_test"; From d699d703433357633eed5c178f7f01ae924c759e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 24 Nov 2014 15:53:23 -0500 Subject: [PATCH 554/829] Make RocksDB compile without gflags Summary: We want to make sure people without gflags can compile RocksDB. 
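A hedged sketch of the guard pattern this patch applies; the gflags header path follows common usage and should be treated as an assumption:

  #ifndef GFLAGS
  // gflags not available: fall back to a plain global with the expected name.
  bool FLAGS_enable_print = false;
  #else
  #include <gflags/gflags.h>
  DEFINE_bool(enable_print, false, "Print options generated to console.");
  #endif  // GFLAGS
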
Test Plan: remove gflags, make all Reviewers: sdong, rven, yhchiang, ljin Reviewed By: ljin Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29469 --- util/benchharness.cc | 27 +++++++++++++-------------- util/options_test.cc | 8 +++++++- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/util/benchharness.cc b/util/benchharness.cc index 1c282aee4..4218f3c30 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -17,25 +17,13 @@ #include #include -#include - -using std::function; -using std::get; -using std::make_pair; -using std::max; -using std::min; -using std::pair; -using std::sort; -using std::string; -using std::tuple; -using std::vector; - #ifndef GFLAGS bool FLAGS_benchmark = false; uint64_t FLAGS_bm_min_usec = 100; -int64_t FLAGS_bm_min_iter = 1; +int64_t FLAGS_bm_min_iters = 1; int32_t FLAGS_bm_max_secs = 1; #else +#include DEFINE_bool(benchmark, false, "Run benchmarks."); DEFINE_uint64(bm_min_usec, 100, @@ -48,6 +36,17 @@ DEFINE_int32(bm_max_secs, 1, "Maximum # of seconds we'll spend on each benchmark."); #endif // GFLAGS +using std::function; +using std::get; +using std::make_pair; +using std::max; +using std::min; +using std::pair; +using std::sort; +using std::string; +using std::tuple; +using std::vector; + namespace rocksdb { namespace benchmark { diff --git a/util/options_test.cc b/util/options_test.cc index 1a6702143..4d6746ec2 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -13,7 +13,6 @@ #include #include -#include #include "rocksdb/options.h" #include "rocksdb/table.h" @@ -23,8 +22,13 @@ #include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/convenience.h" +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS namespace rocksdb { @@ -357,6 +361,8 @@ TEST(OptionsTest, ConvertOptionsTest) { } // namespace rocksdb int main(int argc, char** argv) { +#ifdef GFLAGS ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS return rocksdb::test::RunAllTests(); } From 9c7ca65d21561a55110c8ce9cab9d376e130004b Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 24 Nov 2014 15:03:08 -0800 Subject: [PATCH 555/829] free builders in VersionSet::DumpManifest Summary: Reported by bootcamper This causes ldb tool to fail the assertion in ~ColumnFamilyData() Test Plan: ./ldb --db=/tmp/test_db1 --create_if_missing put a1 b1 ./ldb manifest_dump --path=/tmp/test_db1/MANIFEST-000001 Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29517 --- db/version_set.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/db/version_set.cc b/db/version_set.cc index db8808687..f71ffce95 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2436,7 +2436,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, v->storage_info()->num_levels() - 1); cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); - delete builder; printf("--------------- Column family \"%s\" (ID %u) --------------\n", cfd->GetName().c_str(), (unsigned int)cfd->GetID()); @@ -2451,6 +2450,11 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, delete v; } + // Free builders + for (auto& builder : builders) { + delete builder.second; + } + next_file_number_.store(next_file 
+ 1); last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; From 2946e37a087fd7d2834d65d585ae3579279472ff Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 24 Nov 2014 15:18:09 -0800 Subject: [PATCH 556/829] remove unreliable test in db/cuckoo_table_db_test.cc Summary: This compaction trigger does not seem to test any thing specific to cuckoo table. Remove it. Test Plan: make all check Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29523 --- db/cuckoo_table_db_test.cc | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 4fff07c46..a35eba270 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -228,33 +228,6 @@ TEST(CuckooTableDBTest, Uint64Comparator) { ASSERT_EQ("v4", Get(Uint64Key(4))); } -TEST(CuckooTableDBTest, CompactionTrigger) { - Options options = CurrentOptions(); - options.write_buffer_size = 100 << 10; // 100KB - options.level0_file_num_compaction_trigger = 2; - Reopen(&options); - - // Write 11 values, each 10016 B - for (int idx = 0; idx < 11; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("1", FilesPerLevel()); - - // Generate one more file in level-0, and should trigger level-0 compaction - for (int idx = 11; idx < 22; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ("2", FilesPerLevel()); - - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_EQ("0,2", FilesPerLevel()); - for (int idx = 0; idx < 22; ++idx) { - ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); - } -} - TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { // Create a big L0 file and check it compacts into multiple files in L1. 
Options options = CurrentOptions(); From 90ee85f8e11d52a04dedc663f20c8128ee0bde8d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 24 Nov 2014 18:28:06 -0800 Subject: [PATCH 557/829] Improve listener_test to avoid possible false alarm Summary: Improve listener_test to avoid possible false alarm Test Plan: ./listener_test --- db/listener_test.cc | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/db/listener_test.cc b/db/listener_test.cc index 595e01e6e..bac00b78d 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -175,6 +175,7 @@ class TestFlushListener : public EventListener { TEST(EventListenerTest, OnSingleDBFlushTest) { Options options; + options.write_buffer_size = 100000; TestFlushListener* listener = new TestFlushListener(); options.listeners.emplace_back(listener); std::vector cf_names = { @@ -182,15 +183,15 @@ TEST(EventListenerTest, OnSingleDBFlushTest) { "nikitich", "alyosha", "popovich"}; CreateAndReopenWithCF(cf_names, &options); - ASSERT_OK(Put(1, "pikachu", "pikachu")); - ASSERT_OK(Put(2, "ilya", "ilya")); - ASSERT_OK(Put(3, "muromec", "muromec")); - ASSERT_OK(Put(4, "dobrynia", "dobrynia")); - ASSERT_OK(Put(5, "nikitich", "nikitich")); - ASSERT_OK(Put(6, "alyosha", "alyosha")); - ASSERT_OK(Put(7, "popovich", "popovich")); + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (size_t i = 1; i < 8; ++i) { - Flush(static_cast(i)); + ASSERT_OK(Flush(static_cast(i))); dbfull()->TEST_WaitForFlushMemTable(); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); @@ -205,6 +206,7 @@ TEST(EventListenerTest, OnSingleDBFlushTest) { TEST(EventListenerTest, MultiCF) { Options options; + options.write_buffer_size = 100000; TestFlushListener* listener = new TestFlushListener(); options.listeners.emplace_back(listener); std::vector cf_names = { @@ -212,15 +214,15 @@ TEST(EventListenerTest, MultiCF) { "nikitich", "alyosha", "popovich"}; CreateAndReopenWithCF(cf_names, &options); - ASSERT_OK(Put(1, "pikachu", "pikachu")); - ASSERT_OK(Put(2, "ilya", "ilya")); - ASSERT_OK(Put(3, "muromec", "muromec")); - ASSERT_OK(Put(4, "dobrynia", "dobrynia")); - ASSERT_OK(Put(5, "nikitich", "nikitich")); - ASSERT_OK(Put(6, "alyosha", "alyosha")); - ASSERT_OK(Put(7, "popovich", "popovich")); + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (size_t i = 1; i < 8; ++i) { - Flush(static_cast(i)); + ASSERT_OK(Flush(static_cast(i))); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } From 13de000f07f602cc44fa65623db3a0ed9fdabc4e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 24 Nov 2014 20:44:49 -0800 Subject: [PATCH 558/829] Add rocksdb::ToString() to address cases where std::to_string is not available. 
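A minimal sketch of what such a wrapper can look like; this is an illustration under assumptions, not necessarily the exact definition the patch adds to util/string_util.h:

  #include <sstream>
  #include <string>

  namespace rocksdb {
  template <typename T>
  std::string ToString(T value) {
    std::ostringstream oss;  // stream-based fallback for toolchains lacking std::to_string
    oss << value;
    return oss.str();
  }
  }  // namespace rocksdb
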
Summary: In some environment such as android, the c++ library does not have std::to_string. This path adds rocksdb::ToString(), which wraps std::to_string when std::to_string is not available, and implements std::to_string in the other case. Test Plan: make dbg -j32 ./db_test make clean make dbg OPT=-DOS_ANDROID -j32 ./db_test Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29181 --- db/column_family_test.cc | 8 ++-- db/compaction_job_test.cc | 4 +- db/compaction_picker.cc | 7 +-- db/compaction_picker_test.cc | 12 ++--- db/comparator_db_test.cc | 2 +- db/db_bench.cc | 2 +- db/db_impl.cc | 7 +-- db/db_iter_test.cc | 14 +++--- db/db_test.cc | 45 +++++++++--------- db/deletefile_test.cc | 2 +- db/flush_job_test.cc | 4 +- db/internal_stats.cc | 3 +- db/listener_test.cc | 6 +-- db/perf_context_test.cc | 31 +++++++------ db/plain_table_db_test.cc | 8 ++-- db/prefix_test.cc | 2 +- db/table_properties_collector.cc | 3 +- db/version_builder_test.cc | 4 +- db/wal_manager.cc | 5 +- db/wal_manager_test.cc | 2 +- db/write_batch_test.cc | 6 +-- examples/compact_files_example.cc | 4 +- port/port.h | 4 +- table/block_based_table_reader.cc | 3 +- table/cuckoo_table_builder.cc | 3 +- table/cuckoo_table_reader_test.cc | 2 +- table/plain_table_reader.cc | 9 ++-- table/table_properties.cc | 4 +- table/table_test.cc | 4 +- tools/db_sanity_test.cc | 10 ++-- tools/db_stress.cc | 77 ++++++++++++++++--------------- util/autovector_test.cc | 6 +-- util/benchharness.cc | 3 +- util/cache_test.cc | 6 +-- util/env_hdfs.cc | 8 ++-- util/env_test.cc | 3 +- util/ldb_cmd.cc | 6 +-- util/slice.cc | 3 +- util/string_util.h | 13 ++++++ util/testharness.h | 1 + 40 files changed, 189 insertions(+), 157 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 69f21a580..686bab20d 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -218,7 +218,7 @@ class ColumnFamilyTest { int NumTableFilesAtLevel(int level, int cf) { return GetProperty(cf, - "rocksdb.num-files-at-level" + std::to_string(level)); + "rocksdb.num-files-at-level" + ToString(level)); } // Return spread of files per level @@ -387,7 +387,7 @@ TEST(ColumnFamilyTest, DropTest) { Open({"default"}); CreateColumnFamiliesAndReopen({"pikachu"}); for (int i = 0; i < 100; ++i) { - ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i))); + ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i))); } ASSERT_OK(Flush(1)); @@ -774,14 +774,14 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { PutRandomData(1, 11, 10000); WaitForFlush(1); - ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1)); + ASSERT_EQ(ToString(i + 1), FilesPerLevel(1)); } // SETUP column family "two" -- level style with 4 levels for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) { PutRandomData(2, 15, 10000); WaitForFlush(2); - ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2)); + ASSERT_EQ(ToString(i + 1), FilesPerLevel(2)); } // TRIGGER compaction "one" diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 75132fe00..712471657 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -60,8 +60,8 @@ class CompactionJobTest { SequenceNumber smallest_seqno = 0, largest_seqno = 0; InternalKey smallest, largest; for (int k = 0; k < kKeysPerFile; ++k) { - auto key = std::to_string(i * (kKeysPerFile / 2) + k); - auto value = std::to_string(i * kKeysPerFile + k); + 
auto key = ToString(i * (kKeysPerFile / 2) + k); + auto value = ToString(i * kKeysPerFile + k); InternalKey internal_key(key, ++sequence_number, kTypeValue); if (k == 0) { smallest = internal_key; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index e6c5fd8ee..9ea4c187a 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -19,6 +19,7 @@ #include "db/filename.h" #include "util/log_buffer.h" #include "util/statistics.h" +#include "util/string_util.h" namespace rocksdb { @@ -262,7 +263,7 @@ Status CompactionPicker::GetCompactionInputsFromFileNumbers( "Cannot find matched SST files for the following file numbers:"); for (auto fn : *input_set) { message += " "; - message += std::to_string(fn); + message += ToString(fn); } return Status::InvalidArgument(message); } @@ -616,7 +617,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::InvalidArgument( "Output level for column family " + cf_meta.name + " must between [0, " + - std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + + ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + "]."); } @@ -624,7 +625,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::InvalidArgument( "Exceed the maximum output level defined by " "the current compaction algorithm --- " + - std::to_string(MaxOutputLevel())); + ToString(MaxOutputLevel())); } if (output_level < 0) { diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 913e745c8..419d239c8 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -192,8 +192,8 @@ TEST(CompactionPickerTest, NeedsCompactionLevel) { // start a brand new version in each test. NewVersionStorage(kLevels, kCompactionStyleLevel); for (int i = 0; i < file_count; ++i) { - Add(level, i, std::to_string((i + 100) * 1000).c_str(), - std::to_string((i + 100) * 1000 + 999).c_str(), + Add(level, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), file_size, 0, i * 100, i * 100 + 99); } UpdateVersionStorageInfo(); @@ -217,8 +217,8 @@ TEST(CompactionPickerTest, NeedsCompactionUniversal) { // verify the trigger given different number of L0 files. for (int i = 1; i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { - Add(0, i, std::to_string((i + 100) * 1000).c_str(), - std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, i * 100 + 99); ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), vstorage_->CompactionScore(0) >= 1); @@ -243,8 +243,8 @@ TEST(CompactionPickerTest, NeedsCompactionFIFO) { // size of L0 files. 
uint64_t current_size = 0; for (int i = 1; i <= kFileCount; ++i) { - Add(0, i, std::to_string((i + 100) * 1000).c_str(), - std::to_string((i + 100) * 1000 + 999).c_str(), + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, i * 100 + 99); current_size += kFileSize; ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index e0f842730..325017224 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -377,7 +377,7 @@ TEST(ComparatorDBTest, DoubleComparator) { for (uint32_t j = 0; j < divide_order; j++) { to_divide *= 10.0; } - source_strings.push_back(std::to_string(r / to_divide)); + source_strings.push_back(ToString(r / to_divide)); } DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); diff --git a/db/db_bench.cc b/db/db_bench.cc index 17889ebd8..6e5b63f24 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1350,7 +1350,7 @@ class Benchmark { } std::string GetDbNameForMultiple(std::string base_name, size_t id) { - return base_name + std::to_string(id); + return base_name + ToString(id); } std::string ColumnFamilyName(int i) { diff --git a/db/db_impl.cc b/db/db_impl.cc index 4b9b96ec7..99a386e76 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -75,6 +75,7 @@ #include "util/iostats_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" +#include "util/string_util.h" #include "util/thread_status_impl.h" namespace rocksdb { @@ -3121,7 +3122,7 @@ bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, bool ret_value = GetIntPropertyInternal(column_family, property_type, need_out_of_mutex, &int_value); if (ret_value) { - *value = std::to_string(int_value); + *value = ToString(int_value); } return ret_value; } else { @@ -3378,8 +3379,8 @@ Status DBImpl::CheckConsistency() { } else if (fsize != md.size) { corruption_messages += "Sst file size mismatch: " + file_path + ". 
Size recorded in manifest " + - std::to_string(md.size) + ", actual size " + - std::to_string(fsize) + "\n"; + ToString(md.size) + ", actual size " + + ToString(fsize) + "\n"; } } if (corruption_messages.size() == 0) { diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 79623ce17..e06900010 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -366,7 +366,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); for (size_t k = 0; k < 200; ++k) { - internal_iter->AddPut("c", std::to_string(k)); + internal_iter->AddPut("c", ToString(k)); } internal_iter->Finish(); @@ -379,7 +379,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), std::to_string(i)); + ASSERT_EQ(db_iter->value().ToString(), ToString(i)); db_iter->Prev(); ASSERT_TRUE(db_iter->Valid()); @@ -513,11 +513,11 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); for (size_t k = 0; k < 200; ++k) { - internal_iter->AddPut("d", std::to_string(k)); + internal_iter->AddPut("d", ToString(k)); } for (size_t k = 0; k < 200; ++k) { - internal_iter->AddPut("c", std::to_string(k)); + internal_iter->AddPut("c", ToString(k)); } internal_iter->Finish(); @@ -529,7 +529,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "d"); - ASSERT_EQ(db_iter->value().ToString(), std::to_string(i)); + ASSERT_EQ(db_iter->value().ToString(), ToString(i)); db_iter->Prev(); ASSERT_TRUE(db_iter->Valid()); @@ -552,7 +552,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddMerge("b", "b"); internal_iter->AddMerge("a", "a"); for (size_t k = 0; k < 200; ++k) { - internal_iter->AddMerge("c", std::to_string(k)); + internal_iter->AddMerge("c", ToString(k)); } internal_iter->Finish(); @@ -566,7 +566,7 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { ASSERT_EQ(db_iter->key().ToString(), "c"); std::string merge_result = "0"; for (size_t j = 1; j <= i; ++j) { - merge_result += "," + std::to_string(j); + merge_result += "," + ToString(j); } ASSERT_EQ(db_iter->value().ToString(), merge_result); diff --git a/db/db_test.cc b/db/db_test.cc index 6545f6744..de7132e58 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -50,6 +50,7 @@ #include "util/sync_point.h" #include "util/testutil.h" #include "util/mock_env.h" +#include "util/string_util.h" #include "util/thread_status_impl.h" namespace rocksdb { @@ -1494,7 +1495,7 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"); + db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); } db_->Flush(FlushOptions()); } @@ -1508,7 +1509,7 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. for (int i = 0; i < 2; ++i) { - Get(std::to_string(i * 100 + 0)); + Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); @@ -1518,7 +1519,7 @@ TEST(DBTest, GetPropertiesOfAllTablesTest) { // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. 
for (int i = 0; i < 4; ++i) { - Get(std::to_string(i * 100 + 0)); + Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); } @@ -4747,7 +4748,7 @@ TEST(DBTest, CompactionFilterDeletesAll) { // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(std::to_string(table * 100 + i), "val"); + Put(ToString(table * 100 + i), "val"); } Flush(); } @@ -6994,7 +6995,7 @@ TEST(DBTest, TransactionLogIteratorCorruptedLog) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); for (int i = 0; i < 1024; i++) { - Put("key"+std::to_string(i), DummyString(10)); + Put("key"+ToString(i), DummyString(10)); } dbfull()->Flush(FlushOptions()); // Corrupt this log to create a gap @@ -7062,20 +7063,20 @@ TEST(DBTest, TransactionLogIteratorBlobs) { struct Handler : public WriteBatch::Handler { std::string seen; virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) { - seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " + - std::to_string(value.size()) + ")"; + seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " + + ToString(value.size()) + ")"; return Status::OK(); } virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) { - seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " + - std::to_string(value.size()) + ")"; + seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " + + ToString(value.size()) + ")"; return Status::OK(); } virtual void LogData(const Slice& blob) { seen += "LogData(" + blob.ToString() + ")"; } virtual Status DeleteCF(uint32_t cf, const Slice& key) { - seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")"; + seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")"; return Status::OK(); } } handler; @@ -7203,7 +7204,7 @@ TEST(DBTest, MultiThreaded) { do { std::vector cfs; for (int i = 1; i < kColumnFamilies; ++i) { - cfs.push_back(std::to_string(i)); + cfs.push_back(ToString(i)); } CreateAndReopenWithCF(cfs, CurrentOptions()); // Initialize state @@ -7256,7 +7257,7 @@ static void GCThreadBody(void* arg) { WriteOptions wo; for (int i = 0; i < kGCNumKeys; ++i) { - std::string kv(std::to_string(i + id * kGCNumKeys)); + std::string kv(ToString(i + id * kGCNumKeys)); ASSERT_OK(db->Put(wo, kv, kv)); } t->done = true; @@ -7292,7 +7293,7 @@ TEST(DBTest, GroupCommitTest) { std::vector expected_db; for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { - expected_db.push_back(std::to_string(i)); + expected_db.push_back(ToString(i)); } sort(expected_db.begin(), expected_db.end()); @@ -8176,7 +8177,7 @@ TEST(DBTest, FIFOCompactionTest) { Random rnd(301); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(std::to_string(i * 100 + j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 1024))); } // flush should happen here } @@ -8189,7 +8190,7 @@ TEST(DBTest, FIFOCompactionTest) { ASSERT_EQ(NumTableFilesAtLevel(0), 5); for (int i = 0; i < 50; ++i) { // these keys should be deleted in previous compaction - ASSERT_EQ("NOT_FOUND", Get(std::to_string(i))); + ASSERT_EQ("NOT_FOUND", Get(ToString(i))); } } } @@ -8517,7 +8518,7 @@ TEST(DBTest, CompactFilesOnLevelCompaction) { Random rnd(301); for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); } dbfull()->TEST_WaitForFlushMemTable(handles_[1]); 
dbfull()->TEST_WaitForCompact(); @@ -8549,7 +8550,7 @@ TEST(DBTest, CompactFilesOnLevelCompaction) { // make sure all key-values are still there. for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND"); + ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND"); } } @@ -8571,7 +8572,7 @@ TEST(DBTest, CompactFilesOnUniversalCompaction) { ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); Random rnd(301); for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { - ASSERT_OK(Put(1, std::to_string(key), RandomString(&rnd, kTestValueSize))); + ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); } dbfull()->TEST_WaitForFlushMemTable(handles_[1]); dbfull()->TEST_WaitForCompact(); @@ -9112,7 +9113,7 @@ TEST(DBTest, DynamicCompactionOptions) { // result in 2 32KB L1 files. ASSERT_OK(dbfull()->SetOptions({ {"level0_file_num_compaction_trigger", "2"}, - {"target_file_size_base", std::to_string(k32KB) } + {"target_file_size_base", ToString(k32KB) } })); gen_l0_kb(0, 64, 1); @@ -9133,7 +9134,7 @@ TEST(DBTest, DynamicCompactionOptions) { // fill L1 and L2. L1 size should be around 256KB while L2 size should be // around 256KB x 4. ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", std::to_string(k1MB) } + {"max_bytes_for_level_base", ToString(k1MB) } })); // writing 96 x 64KB => 6 * 1024KB @@ -9155,7 +9156,7 @@ TEST(DBTest, DynamicCompactionOptions) { // reduces to 128KB from 256KB which was asserted previously. Same for L2. ASSERT_OK(dbfull()->SetOptions({ {"max_bytes_for_level_multiplier", "2"}, - {"max_bytes_for_level_base", std::to_string(k128KB) } + {"max_bytes_for_level_base", ToString(k128KB) } })); // writing 20 x 64KB = 10 x 128KB @@ -9255,7 +9256,7 @@ TEST(DBTest, DynamicCompactionOptions) { // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction // at the same time, we should see some level with score greater than 2. 
ASSERT_OK(dbfull()->SetOptions({ - {"max_bytes_for_level_base", std::to_string(k1MB) } + {"max_bytes_for_level_base", ToString(k1MB) } })); // writing 40 x 64KB = 10 x 256KB // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 9e3ea70aa..ac8c0e7b0 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -79,7 +79,7 @@ class DeleteFileTest { options.sync = false; ReadOptions roptions; for (int i = startkey; i < (numkeys + startkey) ; i++) { - std::string temp = std::to_string(i); + std::string temp = ToString(i); Slice key(temp); Slice value(temp); ASSERT_OK(db_->Put(options, key, value)); diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 33d1abe86..aee3fd1a8 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -97,8 +97,8 @@ TEST(FlushJobTest, NonEmpty) { new_mem->Ref(); std::map inserted_keys; for (int i = 1; i < 10000; ++i) { - std::string key(std::to_string(i)); - std::string value("value" + std::to_string(i)); + std::string key(ToString(i)); + std::string value("value" + ToString(i)); new_mem->Add(SequenceNumber(i), kTypeValue, key, value); InternalKey internal_key(key, SequenceNumber(i), kTypeValue); inserted_keys.insert({internal_key.Encode().ToString(), value}); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 33842fed8..6d25eb397 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -16,6 +16,7 @@ #include #include "db/column_family.h" #include "db/db_impl.h" +#include "util/string_util.h" namespace rocksdb { @@ -433,7 +434,7 @@ void InternalStats::DumpCFStats(std::string* value) { double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 : comp_stats_[level].bytes_written / static_cast(comp_stats_[level].bytes_readn); - PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), files, + PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files, files_being_compacted[level], vstorage->NumLevelBytes(level), compaction_score[level], rw_amp, w_amp, stall_us, stalls, comp_stats_[level]); diff --git a/db/listener_test.cc b/db/listener_test.cc index bac00b78d..dfc075803 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -258,10 +258,10 @@ TEST(EventListenerTest, MultiDBMultiListeners) { std::vector> vec_handles; for (int d = 0; d < kNumDBs; ++d) { - ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options)); + ASSERT_OK(DestroyDB(dbname_ + ToString(d), options)); DB* db; std::vector handles; - ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db)); + ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; db->CreateColumnFamily(cf_opts, cf_names[c], &handle); @@ -331,7 +331,7 @@ TEST(EventListenerTest, DisableBGCompaction) { db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); // keep writing until writes are forced to stop. 
for (int i = 0; static_cast(cf_meta.file_count) < kStopTrigger; ++i) { - Put(1, std::to_string(i), std::string(100000, 'x'), wopts); + Put(1, ToString(i), std::string(100000, 'x'), wopts); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger); diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 2d20a0186..81e3eb156 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -14,6 +14,7 @@ #include "util/histogram.h" #include "util/stop_watch.h" #include "util/testharness.h" +#include "util/string_util.h" bool FLAGS_random_key = false; @@ -66,21 +67,21 @@ TEST(PerfContextTest, SeekIntoDeletion) { ReadOptions read_options; for (int i = 0; i < FLAGS_total_keys; ++i) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); db->Put(write_options, key, value); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { - std::string key = "k" + std::to_string(i); + std::string key = "k" + ToString(i); db->Delete(write_options, key); } HistogramImpl hist_get; HistogramImpl hist_get_time; for (int i = 0; i < FLAGS_total_keys - 1; ++i) { - std::string key = "k" + std::to_string(i); + std::string key = "k" + ToString(i); std::string value; perf_context.Reset(); @@ -118,7 +119,7 @@ TEST(PerfContextTest, SeekIntoDeletion) { HistogramImpl hist_seek; for (int i = 0; i < FLAGS_total_keys; ++i) { std::unique_ptr iter(db->NewIterator(read_options)); - std::string key = "k" + std::to_string(i); + std::string key = "k" + ToString(i); perf_context.Reset(); StopWatchNano timer(Env::Default(), true); @@ -231,8 +232,8 @@ void ProfileQueries(bool enabled_time = false) { db->Flush(fo); continue; } - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); std::vector values; @@ -245,8 +246,8 @@ void ProfileQueries(bool enabled_time = false) { } for (const int i : keys) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); std::vector multiget_keys = {Slice(key)}; std::vector values; @@ -335,8 +336,8 @@ void ProfileQueries(bool enabled_time = false) { hist_mget_num_memtable_checked.Clear(); for (const int i : keys) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); std::vector multiget_keys = {Slice(key)}; std::vector values; @@ -451,8 +452,8 @@ TEST(PerfContextTest, SeekKeyComparison) { SetPerfLevel(kEnableTime); StopWatchNano timer(Env::Default()); for (const int i : keys) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); perf_context.Reset(); timer.Start(); @@ -471,8 +472,8 @@ TEST(PerfContextTest, SeekKeyComparison) { HistogramImpl hist_next; for (int i = 0; i < FLAGS_total_keys; ++i) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); std::unique_ptr iter(db->NewIterator(read_options)); perf_context.Reset(); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 1720b678f..906ff8c8f 100644 --- 
a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -628,7 +628,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) { }; for (size_t i = 0; i < 7; i++) { - ASSERT_OK(Put(key_list[i], std::to_string(i))); + ASSERT_OK(Put(key_list[i], ToString(i))); } dbfull()->TEST_FlushMemTable(); @@ -639,7 +639,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) { for (size_t i = 0; i < 7; i++) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(key_list[i], iter->key().ToString()); - ASSERT_EQ(std::to_string(i), iter->value().ToString()); + ASSERT_EQ(ToString(i), iter->value().ToString()); iter->Next(); } @@ -676,7 +676,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { MakeLongKeyWithPrefix(26, '6')}; for (size_t i = 0; i < 7; i++) { - ASSERT_OK(Put(key_list[i], std::to_string(i))); + ASSERT_OK(Put(key_list[i], ToString(i))); } dbfull()->TEST_FlushMemTable(); @@ -687,7 +687,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { for (size_t i = 0; i < 7; i++) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(key_list[i], iter->key().ToString()); - ASSERT_EQ(std::to_string(i), iter->value().ToString()); + ASSERT_EQ(ToString(i), iter->value().ToString()); iter->Next(); } diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 238f84330..fa2c128c4 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -442,7 +442,7 @@ TEST(PrefixTest, DynamicPrefixIterator) { for (auto prefix : prefixes) { TestKey test_key(prefix, FLAGS_items_per_prefix / 2); Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); + std::string value = "v" + ToString(0); perf_context.Reset(); StopWatchNano timer(Env::Default(), true); diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 25bd70036..36ed0f97f 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -7,6 +7,7 @@ #include "db/dbformat.h" #include "util/coding.h" +#include "util/string_util.h" namespace rocksdb { @@ -40,7 +41,7 @@ Status InternalKeyPropertiesCollector::Finish( UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { return { - { "kDeletedKeys", std::to_string(deleted_keys_) } + { "kDeletedKeys", ToString(deleted_keys_) } }; } diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 978251998..1373e2f88 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -135,8 +135,8 @@ TEST(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kDeletionsPerFile = 100; for (uint32_t i = 0; i < kNumFiles; ++i) { Add(static_cast(i / kFilesPerLevel), i + 1, - std::to_string((i + 100) * 1000).c_str(), - std::to_string((i + 100) * 1000 + 999).c_str(), + ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100, kEntriesPerFile, kDeletionsPerFile, (i < kTotalSamples)); diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 0889df301..7fac575f2 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -31,6 +31,7 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/sync_point.h" +#include "util/string_util.h" namespace rocksdb { @@ -355,9 +356,9 @@ Status WalManager::ReadFirstRecord(const WalFileType type, SequenceNumber* sequence) { if (type != kAliveLogFile && type != kArchivedLogFile) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, - "[WalManger] Unknown file type %s", std::to_string(type).c_str()); + "[WalManger] Unknown file type %s", ToString(type).c_str()); return Status::NotSupported( - "File Type Not Known " + std::to_string(type)); + "File Type 
Not Known " + ToString(type)); } { MutexLock l(&read_first_record_cache_mutex_); diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 1f609d083..5c12586c8 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -73,7 +73,7 @@ class WalManagerTest { for (int i = 1; i <= num_logs; ++i) { RollTheLog(true); for (int k = 0; k < entries_per_log; ++k) { - Put(std::to_string(k), std::string(1024, 'a')); + Put(ToString(k), std::string(1024, 'a')); } } } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 7f180d9e6..dbf65b6e9 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -155,7 +155,7 @@ namespace { if (column_family_id == 0) { seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; } else { - seen += "PutCF(" + std::to_string(column_family_id) + ", " + + seen += "PutCF(" + ToString(column_family_id) + ", " + key.ToString() + ", " + value.ToString() + ")"; } return Status::OK(); @@ -165,7 +165,7 @@ namespace { if (column_family_id == 0) { seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; } else { - seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + seen += "MergeCF(" + ToString(column_family_id) + ", " + key.ToString() + ", " + value.ToString() + ")"; } return Status::OK(); @@ -177,7 +177,7 @@ namespace { if (column_family_id == 0) { seen += "Delete(" + key.ToString() + ")"; } else { - seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + seen += "DeleteCF(" + ToString(column_family_id) + ", " + key.ToString() + ")"; } return Status::OK(); diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index 3e7638b7e..bf6cec262 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -156,14 +156,14 @@ int main() { // if background compaction is not working, write will stall // because of options.level0_stop_writes_trigger for (int i = 1000; i < 99999; ++i) { - db->Put(WriteOptions(), std::to_string(i), + db->Put(WriteOptions(), ToString(i), std::string(500, 'a' + (i % 26))); } // verify the values are still there std::string value; for (int i = 1000; i < 99999; ++i) { - db->Get(ReadOptions(), std::to_string(i), + db->Get(ReadOptions(), ToString(i), &value); assert(value == std::string(500, 'a' + (i % 26))); } diff --git a/port/port.h b/port/port.h index 2dc9a0fa6..153dc5b5b 100644 --- a/port/port.h +++ b/port/port.h @@ -10,13 +10,13 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_H_ #define STORAGE_LEVELDB_PORT_PORT_H_ -#include +#include // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation // of what the new port_.h file must provide. 
#if defined(ROCKSDB_PLATFORM_POSIX) -# include "port/port_posix.h" +#include "port/port_posix.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 5cb35834a..d60ba3d21 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -38,6 +38,7 @@ #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/string_util.h" namespace rocksdb { @@ -1264,7 +1265,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } default: { std::string error_message = - "Unrecognized index type: " + std::to_string(rep_->index_type); + "Unrecognized index type: " + ToString(rep_->index_type); return Status::InvalidArgument(error_message.c_str()); } } diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 947c465e9..8a57f1c6b 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -21,6 +21,7 @@ #include "table/meta_blocks.h" #include "util/autovector.h" #include "util/random.h" +#include "util/string_util.h" namespace rocksdb { const std::string CuckooTablePropertyNames::kEmptyKey = @@ -88,7 +89,7 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { status_ = Status::NotSupported("Unsupported key type " + - std::to_string(ikey.type)); + ToString(ikey.type)); return; } diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 4245b749e..aaeb3956c 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -387,7 +387,7 @@ std::string GetFileName(uint64_t num) { FLAGS_file_dir = test::TmpDir(); } return FLAGS_file_dir + "/cuckoo_read_benchmark" + - std::to_string(num/1000000) + "Mkeys"; + ToString(num/1000000) + "Mkeys"; } // Create last level file as we are interested in measuring performance of diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 48b709e80..0cd73ac32 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -36,6 +36,7 @@ #include "util/murmurhash.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/string_util.h" namespace rocksdb { @@ -380,14 +381,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, // Fill two table properties. 
if (!index_in_file) { props->user_collected_properties["plain_table_hash_table_size"] = - std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); + ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); props->user_collected_properties["plain_table_sub_index_size"] = - std::to_string(index_.GetSubIndexSize()); + ToString(index_.GetSubIndexSize()); } else { props->user_collected_properties["plain_table_hash_table_size"] = - std::to_string(0); + ToString(0); props->user_collected_properties["plain_table_sub_index_size"] = - std::to_string(0); + ToString(0); } return Status::OK(); diff --git a/table/table_properties.cc b/table/table_properties.cc index 98d519971..1ee34a671 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -7,6 +7,8 @@ #include "rocksdb/table_properties.h" #include "rocksdb/iterator.h" #include "rocksdb/env.h" +#include "port/port.h" +#include "util/string_util.h" namespace rocksdb { @@ -31,7 +33,7 @@ namespace { const std::string& prop_delim, const std::string& kv_delim) { AppendProperty( - props, key, std::to_string(value), prop_delim, kv_delim + props, key, ToString(value), prop_delim, kv_delim ); } } diff --git a/table/table_test.cc b/table/table_test.cc index a02846ccc..9e5f8a49e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1039,9 +1039,9 @@ TEST(TablePropertyTest, PrefixScanTest) { pos->first.compare(0, prefix.size(), prefix) == 0; ++pos) { ++num; - auto key = prefix + "." + std::to_string(num); + auto key = prefix + "." + ToString(num); ASSERT_EQ(key, pos->first); - ASSERT_EQ(std::to_string(num), pos->second); + ASSERT_EQ(ToString(num), pos->second); } ASSERT_EQ(3, num); } diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index f994ab38b..8219feb37 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -17,6 +17,8 @@ #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" #include "rocksdb/filter_policy.h" +#include "port/port.h" +#include "util/string_util.h" namespace rocksdb { @@ -43,8 +45,8 @@ class SanityTest { return s; } for (int i = 0; i < 1000000; ++i) { - std::string k = "key" + std::to_string(i); - std::string v = "value" + std::to_string(i); + std::string k = "key" + ToString(i); + std::string v = "value" + ToString(i); s = db->Put(WriteOptions(), Slice(k), Slice(v)); if (!s.ok()) { return s; @@ -61,8 +63,8 @@ class SanityTest { return s; } for (int i = 0; i < 1000000; ++i) { - std::string k = "key" + std::to_string(i); - std::string v = "value" + std::to_string(i); + std::string k = "key" + ToString(i); + std::string v = "value" + ToString(i); std::string result; s = db->Get(ReadOptions(), Slice(k), &result); if (!s.ok()) { diff --git a/tools/db_stress.cc b/tools/db_stress.cc index a6d8c9ace..9aad6efb9 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -52,6 +52,7 @@ int main() { #include "util/logging.h" #include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" +#include "util/string_util.h" using GFLAGS::ParseCommandLineFlags; using GFLAGS::RegisterFlagValidator; @@ -801,23 +802,23 @@ class StressTest { options_table_ = { {"write_buffer_size", { - std::to_string(FLAGS_write_buffer_size), - std::to_string(FLAGS_write_buffer_size * 2), - std::to_string(FLAGS_write_buffer_size * 4) + ToString(FLAGS_write_buffer_size), + ToString(FLAGS_write_buffer_size * 2), + ToString(FLAGS_write_buffer_size * 4) } }, {"max_write_buffer_number", { - std::to_string(FLAGS_max_write_buffer_number), - std::to_string(FLAGS_max_write_buffer_number * 2), - 
std::to_string(FLAGS_max_write_buffer_number * 4) + ToString(FLAGS_max_write_buffer_number), + ToString(FLAGS_max_write_buffer_number * 2), + ToString(FLAGS_max_write_buffer_number * 4) } }, {"arena_block_size", { - std::to_string(Options().arena_block_size), - std::to_string(FLAGS_write_buffer_size / 4), - std::to_string(FLAGS_write_buffer_size / 8), + ToString(Options().arena_block_size), + ToString(FLAGS_write_buffer_size / 4), + ToString(FLAGS_write_buffer_size / 8), } }, {"memtable_prefix_bloom_bits", {"0", "8", "10"}}, @@ -825,7 +826,7 @@ class StressTest { {"memtable_prefix_bloom_huge_page_tlb_size", { "0", - std::to_string(2 * 1024 * 1024) + ToString(2 * 1024 * 1024) } }, {"max_successive_merges", {"0", "2", "4"}}, @@ -837,70 +838,70 @@ class StressTest { {"hard_rate_limit", {"0", "1.1", "2.0"}}, {"level0_file_num_compaction_trigger", { - std::to_string(FLAGS_level0_file_num_compaction_trigger), - std::to_string(FLAGS_level0_file_num_compaction_trigger + 2), - std::to_string(FLAGS_level0_file_num_compaction_trigger + 4), + ToString(FLAGS_level0_file_num_compaction_trigger), + ToString(FLAGS_level0_file_num_compaction_trigger + 2), + ToString(FLAGS_level0_file_num_compaction_trigger + 4), } }, {"level0_slowdown_writes_trigger", { - std::to_string(FLAGS_level0_slowdown_writes_trigger), - std::to_string(FLAGS_level0_slowdown_writes_trigger + 2), - std::to_string(FLAGS_level0_slowdown_writes_trigger + 4), + ToString(FLAGS_level0_slowdown_writes_trigger), + ToString(FLAGS_level0_slowdown_writes_trigger + 2), + ToString(FLAGS_level0_slowdown_writes_trigger + 4), } }, {"level0_stop_writes_trigger", { - std::to_string(FLAGS_level0_stop_writes_trigger), - std::to_string(FLAGS_level0_stop_writes_trigger + 2), - std::to_string(FLAGS_level0_stop_writes_trigger + 4), + ToString(FLAGS_level0_stop_writes_trigger), + ToString(FLAGS_level0_stop_writes_trigger + 2), + ToString(FLAGS_level0_stop_writes_trigger + 4), } }, {"max_grandparent_overlap_factor", { - std::to_string(Options().max_grandparent_overlap_factor - 5), - std::to_string(Options().max_grandparent_overlap_factor), - std::to_string(Options().max_grandparent_overlap_factor + 5), + ToString(Options().max_grandparent_overlap_factor - 5), + ToString(Options().max_grandparent_overlap_factor), + ToString(Options().max_grandparent_overlap_factor + 5), } }, {"expanded_compaction_factor", { - std::to_string(Options().expanded_compaction_factor - 5), - std::to_string(Options().expanded_compaction_factor), - std::to_string(Options().expanded_compaction_factor + 5), + ToString(Options().expanded_compaction_factor - 5), + ToString(Options().expanded_compaction_factor), + ToString(Options().expanded_compaction_factor + 5), } }, {"source_compaction_factor", { - std::to_string(Options().source_compaction_factor), - std::to_string(Options().source_compaction_factor * 2), - std::to_string(Options().source_compaction_factor * 4), + ToString(Options().source_compaction_factor), + ToString(Options().source_compaction_factor * 2), + ToString(Options().source_compaction_factor * 4), } }, {"target_file_size_base", { - std::to_string(FLAGS_target_file_size_base), - std::to_string(FLAGS_target_file_size_base * 2), - std::to_string(FLAGS_target_file_size_base * 4), + ToString(FLAGS_target_file_size_base), + ToString(FLAGS_target_file_size_base * 2), + ToString(FLAGS_target_file_size_base * 4), } }, {"target_file_size_multiplier", { - std::to_string(FLAGS_target_file_size_multiplier), + ToString(FLAGS_target_file_size_multiplier), "1", "2", } }, 
{"max_bytes_for_level_base", { - std::to_string(FLAGS_max_bytes_for_level_base / 2), - std::to_string(FLAGS_max_bytes_for_level_base), - std::to_string(FLAGS_max_bytes_for_level_base * 2), + ToString(FLAGS_max_bytes_for_level_base / 2), + ToString(FLAGS_max_bytes_for_level_base), + ToString(FLAGS_max_bytes_for_level_base * 2), } }, {"max_bytes_for_level_multiplier", { - std::to_string(FLAGS_max_bytes_for_level_multiplier), + ToString(FLAGS_max_bytes_for_level_multiplier), "1", "2", } @@ -1377,7 +1378,7 @@ class StressTest { // drop column family and then create it again (can't drop default) int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1; std::string new_name = - std::to_string(new_column_family_name_.fetch_add(1)); + ToString(new_column_family_name_.fetch_add(1)); { MutexLock l(thread->shared->GetMutex()); fprintf( @@ -1881,7 +1882,7 @@ class StressTest { cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_)); } while (cf_descriptors.size() < (size_t)FLAGS_column_families) { - std::string name = std::to_string(new_column_family_name_.load()); + std::string name = ToString(new_column_family_name_.load()); new_column_family_name_++; cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_)); column_family_names_.push_back(name); diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 86cafc893..4ff982250 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -53,7 +53,7 @@ TEST(AutoVectorTest, EmplaceBack) { autovector vec; for (size_t i = 0; i < 1000 * kSize; ++i) { - vec.emplace_back(i, std::to_string(i + 123)); + vec.emplace_back(i, ToString(i + 123)); ASSERT_TRUE(!vec.empty()); if (i < kSize) { ASSERT_TRUE(vec.only_in_stack()); @@ -63,7 +63,7 @@ TEST(AutoVectorTest, EmplaceBack) { ASSERT_EQ(i + 1, vec.size()); ASSERT_EQ(i, vec[i].first); - ASSERT_EQ(std::to_string(i + 123), vec[i].second); + ASSERT_EQ(ToString(i + 123), vec[i].second); } vec.clear(); @@ -129,7 +129,7 @@ TEST(AutoVectorTest, CopyAndAssignment) { TEST(AutoVectorTest, Iterators) { autovector vec; for (size_t i = 0; i < kSize * 1000; ++i) { - vec.push_back(std::to_string(i)); + vec.push_back(ToString(i)); } // basic operator test diff --git a/util/benchharness.cc b/util/benchharness.cc index 4218f3c30..e533ed454 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -16,6 +16,7 @@ #include #include #include +#include "util/string_util.h" #ifndef GFLAGS bool FLAGS_benchmark = false; @@ -283,7 +284,7 @@ static const ScaleInfo kMetricSuffixes[] { static string HumanReadable(double n, unsigned int decimals, const ScaleInfo* scales) { if (std::isinf(n) || std::isnan(n)) { - return std::to_string(n); + return ToString(n); } const double absValue = fabs(n); diff --git a/util/cache_test.cc b/util/cache_test.cc index 3aba95645..2fed4d867 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -133,7 +133,7 @@ TEST(CacheTest, UsageTest) { // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { - auto key = std::to_string(i); + auto key = ToString(i); cache->Release( cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) ); @@ -402,13 +402,13 @@ TEST(CacheTest, BadEviction) { // Insert n+1 entries, but not releasing. for (int i = 0; i < n+1; i++) { - std::string key = std::to_string(i+1); + std::string key = ToString(i+1); handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); } // Guess what's in the cache now? 
for (int i = 0; i < n+1; i++) { - std::string key = std::to_string(i+1); + std::string key = ToString(i+1); auto h = cache->Lookup(key); std::cout << key << (h?" found\n":" not found\n"); // Only the first entry should be missing diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc index bbd5b9779..b0a3b6751 100644 --- a/util/env_hdfs.cc +++ b/util/env_hdfs.cc @@ -443,7 +443,7 @@ Status HdfsEnv::NewDirectory(const std::string& name, Log(InfoLogLevel::FATAL_LEVEL, mylog, "NewDirectory hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + " on path " + name + + ToString(value) + " on path " + name + ".\n"); } } @@ -460,7 +460,7 @@ bool HdfsEnv::FileExists(const std::string& fname) { Log(InfoLogLevel::FATAL_LEVEL, mylog, "FileExists hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + " on path " + fname + + ToString(value) + " on path " + fname + ".\n"); } } @@ -499,7 +499,7 @@ Status HdfsEnv::GetChildren(const std::string& path, Log(InfoLogLevel::FATAL_LEVEL, mylog, "GetChildren hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + ".\n"); + ToString(value) + ".\n"); } return Status::OK(); } @@ -530,7 +530,7 @@ Status HdfsEnv::CreateDirIfMissing(const std::string& name) { Log(InfoLogLevel::FATAL_LEVEL, mylog, "CreateDirIfMissing hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + ".\n"); + ToString(value) + ".\n"); } }; diff --git a/util/env_test.cc b/util/env_test.cc index 9819d837a..3511d985b 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -603,7 +603,8 @@ TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) { // Create the files std::vector fnames; for (int i = 0; i < 1000; ++i) { - fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i)); + fnames.push_back( + GetOnDiskTestDir() + "/" + "testfile" + ToString(i)); // Create file. unique_ptr wfile; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 9a5ec5185..4f925c7c3 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1293,15 +1293,15 @@ void ChangeCompactionStyleCommand::DoCommand() { // level 0 should have only 1 file if (i == 0 && num_files != 1) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level 0 after compaction is " + std::to_string(num_files) + + "level 0 after compaction is " + ToString(num_files) + ", not 1.\n"); return; } // other levels should have no file if (i > 0 && num_files != 0) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level " + std::to_string(i) + " after compaction is " + - std::to_string(num_files) + ", not 0.\n"); + "level " + ToString(i) + " after compaction is " + + ToString(num_files) + ", not 0.\n"); return; } } diff --git a/util/slice.cc b/util/slice.cc index 5a1f4f10e..cd197ced5 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -9,6 +9,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/slice.h" +#include "util/string_util.h" namespace rocksdb { @@ -22,7 +23,7 @@ class FixedPrefixTransform : public SliceTransform { public: explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len), - name_("rocksdb.FixedPrefix." + std::to_string(prefix_len_)) {} + name_("rocksdb.FixedPrefix." 
+ ToString(prefix_len_)) {} virtual const char* Name() const { return name_.c_str(); } diff --git a/util/string_util.h b/util/string_util.h index b9f0a1d28..2238a569b 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -12,4 +12,17 @@ namespace rocksdb { extern std::vector StringSplit(const std::string& arg, char delim); +template +inline std::string ToString(T value) { +#ifndef OS_ANDROID + return std::to_string(value); +#else + // Andorid doesn't support all of C++11, std::to_string() being + // one of the not supported features. + std::ostringstream os; + os << value; + return os.str(); +#endif +} + } // namespace rocksdb diff --git a/util/testharness.h b/util/testharness.h index 6115d68f7..ae2570889 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -17,6 +17,7 @@ #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" +#include "util/string_util.h" namespace rocksdb { namespace test { From 7e608e2fe386fdb7e3ef29d5c30464f199d53d64 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 24 Nov 2014 20:47:27 -0800 Subject: [PATCH 559/829] Block plain_table_index.cc in ROCKSDB_LITE Summary: Block plain_table_index.cc in ROCKSDB_LITE Test Plan: make clean make OPT=-DROCKSDB_LITE shared_lib -j32 make clean make shared_lib -j32 Reviewers: ljin, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29535 --- table/plain_table_index.cc | 4 ++++ table/plain_table_index.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index ea8ac6b94..7ca451eb3 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -3,6 +3,8 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
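As a generic illustration of the guard pattern this change applies, here is a minimal hypothetical translation unit (the file and function names are made up, not RocksDB's) that compiles down to nothing under -DROCKSDB_LITE:

// plain_feature.cc -- hypothetical example of compiling a whole source file
// out of lite builds. Everything between the guards disappears when the
// preprocessor sees -DROCKSDB_LITE, keeping the resulting object file tiny.
#ifndef ROCKSDB_LITE

#include <string>

namespace rocksdb {

// Full-featured implementation, only present in the regular build.
std::string PlainFeatureName() { return "plain_feature"; }

}  // namespace rocksdb

#endif  // ROCKSDB_LITE
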
+#ifndef ROCKSDB_LITE + #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif @@ -209,3 +211,5 @@ Slice PlainTableIndexBuilder::FillIndexes( const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = "PlainTableIndexBlock"; }; // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/table/plain_table_index.h b/table/plain_table_index.h index 870e3fb00..be8ad1639 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -5,6 +5,8 @@ #pragma once +#ifndef ROCKSDB_LITE + #include #include @@ -219,3 +221,5 @@ class PlainTableIndexBuilder { }; }; // namespace rocksdb + +#endif // ROCKSDB_LITE From 88dd8d889bd26ef54c387caeee39d45da1ff603a Mon Sep 17 00:00:00 2001 From: Reed Allman Date: Mon, 24 Nov 2014 22:00:29 -0800 Subject: [PATCH 560/829] c api: add max wal total to opts --- db/c.cc | 4 ++++ include/rocksdb/c.h | 1 + 2 files changed, 5 insertions(+) diff --git a/db/c.cc b/db/c.cc index 64ce9d0a3..b5bd38daf 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1269,6 +1269,10 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } +void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { + opt->rep.max_total_wal_size = n; +} + void rocksdb_options_set_target_file_size_base( rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index b12e4fe5c..235b58530 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -440,6 +440,7 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n); extern void rocksdb_options_set_compression_options( rocksdb_options_t*, int, int, int); extern void rocksdb_options_set_prefix_extractor( From c4765dc10b93d556154b2930247abc42182ca4ec Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 20:29:52 +0100 Subject: [PATCH 561/829] [RocksJava] Fix precision problem in rocksjni --- java/rocksjni/backupablejni.cc | 5 +++-- java/rocksjni/restorejni.cc | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 83c641370..8007e2ce0 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -103,10 +103,11 @@ jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); // store backupids in int array - const int kIdSize = backup_ids.size(); + const std::vector::size_type + kIdSize = backup_ids.size(); int int_backup_ids[kIdSize]; for (std::vector::size_type i = 0; - i != backup_ids.size(); i++) { + i != kIdSize; i++) { int_backup_ids[i] = backup_ids[i]; } // Store ints in java array diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index ad8749758..4fe747e10 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -156,10 +156,12 @@ jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); // store backupids in int array - const int kIdSize = backup_ids.size(); + const std::vector::size_type + kIdSize = backup_ids.size(); + int int_backup_ids[kIdSize]; for (std::vector::size_type i = 
0; - i != backup_ids.size(); i++) { + i != kIdSize; i++) { int_backup_ids[i] = backup_ids[i]; } // Store ints in java array From 274ba62707c92548c84de835aae49a96292d7aa4 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 25 Nov 2014 12:01:27 -0800 Subject: [PATCH 562/829] Block internal_stats in ROCKSDB_LITE Summary: Block internal_stats in ROCKSDB_LITE. Test Plan: make OPT=-DROCKSDB_LITE shared_lib Reviewers: ljin, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29541 --- db/internal_stats.cc | 12 ++++++++ db/internal_stats.h | 72 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 6d25eb397..6344be56d 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -15,11 +15,13 @@ #include #include #include "db/column_family.h" + #include "db/db_impl.h" #include "util/string_util.h" namespace rocksdb { +#ifndef ROCKSDB_LITE namespace { const double kMB = 1048576.0; const double kGB = kMB * 1024; @@ -497,4 +499,14 @@ void InternalStats::DumpCFStats(std::string* value) { cf_stats_snapshot_.stall_count = total_stall_count; } + +#else + +DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, + bool* need_out_of_mutex) { + return kUnknown; +} + +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/internal_stats.h b/db/internal_stats.h index 2fbcefd4c..c50809d31 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -52,6 +52,8 @@ extern DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, bool* need_out_of_mutex); + +#ifndef ROCKSDB_LITE class InternalStats { public: enum InternalCFStatsType { @@ -296,4 +298,74 @@ class InternalStats { const uint64_t started_at_; }; +#else + +class InternalStats { + public: + enum InternalCFStatsType { + LEVEL0_SLOWDOWN, + MEMTABLE_COMPACTION, + LEVEL0_NUM_FILES, + WRITE_STALLS_ENUM_MAX, + BYTES_FLUSHED, + INTERNAL_CF_STATS_ENUM_MAX, + }; + + enum InternalDBStatsType { + WAL_FILE_BYTES, + WAL_FILE_SYNCED, + BYTES_WRITTEN, + NUMBER_KEYS_WRITTEN, + WRITE_DONE_BY_OTHER, + WRITE_DONE_BY_SELF, + WRITE_WITH_WAL, + INTERNAL_DB_STATS_ENUM_MAX, + }; + + InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {} + + struct CompactionStats { + uint64_t micros; + uint64_t bytes_readn; + uint64_t bytes_readnp1; + uint64_t bytes_written; + int files_in_leveln; + int files_in_levelnp1; + int files_out_levelnp1; + uint64_t num_input_records; + uint64_t num_dropped_records; + int count; + + explicit CompactionStats(int _count = 0) {} + + explicit CompactionStats(const CompactionStats& c) {} + + void Add(const CompactionStats& c) {} + + void Subtract(const CompactionStats& c) {} + }; + + void AddCompactionStats(int level, const CompactionStats& stats) {} + + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {} + + void AddCFStats(InternalCFStatsType type, uint64_t value) {} + + void AddDBStats(InternalDBStatsType type, uint64_t value) {} + + uint64_t GetBackgroundErrorCount() const { return 0; } + + uint64_t BumpAndGetBackgroundErrorCount() { return 0; } + + bool GetStringProperty(DBPropertyType property_type, const Slice& property, + std::string* value) { return false; } + + bool GetIntProperty(DBPropertyType property_type, uint64_t* value, + DBImpl* db) const { return false; } + + bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, + uint64_t* value) const { return false; } +}; +#endif // !ROCKSDB_LITE + } // 
namespace rocksdb From fcc2dfd9f9fb07a4134d9ddb0961dc21707a678f Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 21 Nov 2014 21:57:32 +0100 Subject: [PATCH 563/829] [RocksJava] Support for stored snapshots Summary: RocksDB supports two ways of saving snapshots. In memory and on disk. The later was added with this pull request to RocksJava. Test Plan: Reviewers: Subscribers: --- java/Makefile | 2 + java/org/rocksdb/Checkpoint.java | 65 ++++++++++++++ java/org/rocksdb/test/CheckPointTest.java | 101 ++++++++++++++++++++++ java/rocksjni/checkpoint.cc | 61 +++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 java/org/rocksdb/Checkpoint.java create mode 100644 java/org/rocksdb/test/CheckPointTest.java create mode 100644 java/rocksjni/checkpoint.cc diff --git a/java/Makefile b/java/Makefile index 7a0edb3cf..1fbb40c85 100644 --- a/java/Makefile +++ b/java/Makefile @@ -4,6 +4,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.BackupableDBOptions\ org.rocksdb.BlockBasedTableConfig\ org.rocksdb.BloomFilter\ + org.rocksdb.Checkpoint\ org.rocksdb.ColumnFamilyHandle\ org.rocksdb.ColumnFamilyOptions\ org.rocksdb.Comparator\ @@ -50,6 +51,7 @@ endif JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.BackupableDBTest\ org.rocksdb.test.BlockBasedTableConfigTest\ + org.rocksdb.test.CheckpointTest\ org.rocksdb.test.ColumnFamilyOptionsTest\ org.rocksdb.test.ColumnFamilyTest\ org.rocksdb.test.ComparatorOptionsTest\ diff --git a/java/org/rocksdb/Checkpoint.java b/java/org/rocksdb/Checkpoint.java new file mode 100644 index 000000000..0830f2fb1 --- /dev/null +++ b/java/org/rocksdb/Checkpoint.java @@ -0,0 +1,65 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Provides Checkpoint functionality. Checkpoints + * provide persistent snapshots of RocksDB databases. + */ +public class Checkpoint extends RocksObject { + + /** + * Creates a Checkpoint object to be used for creating open-able + * snapshots. + * + * @param db {@link RocksDB} instance. + * @return a Checkpoint instance. + */ + public static Checkpoint create(RocksDB db) { + if (db == null || !db.isInitialized()) { + throw new IllegalArgumentException( + "RocksDB instance needs to be initialized."); + } + Checkpoint checkpoint = new Checkpoint( + newCheckpoint(db.nativeHandle_)); + checkpoint.db_ = db; + return checkpoint; + } + + /** + *
Builds an open-able snapshot of RocksDB on the same disk, which + * accepts an output directory on the same disk, and under the directory + * (1) hard-linked SST files pointing to existing live SST files + * (2) a copied manifest files and other files
            + * + * @param checkpointPath path to the folder where the snapshot is going + * to be stored. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void createCheckpoint(String checkpointPath) + throws RocksDBException { + createCheckpoint(nativeHandle_, checkpointPath); + } + + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private Checkpoint(long handle) { + super(); + nativeHandle_ = handle; + } + + RocksDB db_; + + private static native long newCheckpoint(long dbHandle); + private native void disposeInternal(long handle); + + private native void createCheckpoint(long handle, String checkpointPath) + throws RocksDBException; +} diff --git a/java/org/rocksdb/test/CheckPointTest.java b/java/org/rocksdb/test/CheckPointTest.java new file mode 100644 index 000000000..63e996622 --- /dev/null +++ b/java/org/rocksdb/test/CheckPointTest.java @@ -0,0 +1,101 @@ +package org.rocksdb.test; + + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.Checkpoint; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CheckpointTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule + public TemporaryFolder checkpointFolder = new TemporaryFolder(); + + @Test + public void checkPoint() throws RocksDBException { + RocksDB db = null; + Options options = null; + Checkpoint checkpoint = null; + try { + options = new Options(). + setCreateIfMissing(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + checkpoint = Checkpoint.create(db); + checkpoint.createCheckpoint(checkpointFolder. + getRoot().getAbsolutePath() + "/snapshot1"); + db.put("key2".getBytes(), "value2".getBytes()); + checkpoint.createCheckpoint(checkpointFolder. + getRoot().getAbsolutePath() + "/snapshot2"); + db.close(); + db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot1"); + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(db.get("key2".getBytes())).isNull(); + db.close(); + db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot2"); + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(new String(db.get("key2".getBytes()))). 
+ isEqualTo("value2"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (checkpoint != null) { + checkpoint.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failIfDbIsNull() { + Checkpoint.create(null); + } + + @Test(expected = IllegalArgumentException.class) + public void failIfDbNotInitialized() throws RocksDBException { + RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.dispose(); + Checkpoint.create(db); + } + + @Test(expected = RocksDBException.class) + public void failWithIllegalPath() throws RocksDBException { + RocksDB db = null; + Checkpoint checkpoint = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + checkpoint = Checkpoint.create(db); + checkpoint.createCheckpoint("/Z:///:\\C:\\TZ/-"); + } finally { + if (db != null) { + db.close(); + } + if (checkpoint != null) { + checkpoint.dispose(); + } + } + } +} diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc new file mode 100644 index 000000000..841144d81 --- /dev/null +++ b/java/rocksjni/checkpoint.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Checkpoint methods from Java side. + +#include +#include +#include +#include + +#include "include/org_rocksdb_Checkpoint.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/checkpoint.h" +/* + * Class: org_rocksdb_Checkpoint + * Method: newCheckpoint + * Signature: (J)J + */ +jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* env, + jclass jclazz, jlong jdb_handle) { + auto db = reinterpret_cast(jdb_handle); + rocksdb::Checkpoint* checkpoint; + rocksdb::Checkpoint::Create(db, &checkpoint); + return reinterpret_cast(checkpoint); +} + +/* + * Class: org_rocksdb_Checkpoint + * Method: dispose + * Signature: (J)V + */ +void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto checkpoint = reinterpret_cast(jhandle); + assert(checkpoint); + delete checkpoint; +} + +/* + * Class: org_rocksdb_Checkpoint + * Method: createCheckpoint + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_Checkpoint_createCheckpoint( + JNIEnv* env, jobject jobj, jlong jcheckpoint_handle, + jstring jcheckpoint_path) { + auto checkpoint = reinterpret_cast( + jcheckpoint_handle); + const char* checkpoint_path = env->GetStringUTFChars( + jcheckpoint_path, 0); + rocksdb::Status s = checkpoint->CreateCheckpoint( + checkpoint_path); + env->ReleaseStringUTFChars(jcheckpoint_path, checkpoint_path); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} \ No newline at end of file From a280af2a5767b9257c4b4ec44eb27b8c0ff843ca Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 15 Nov 2014 12:37:51 +0100 Subject: [PATCH 564/829] [RocksJava] Sigsegv fix for MergerOperatorByName --- java/Makefile | 4 +- java/org/rocksdb/ColumnFamilyOptions.java | 2 + java/org/rocksdb/Options.java | 2 + java/org/rocksdb/test/MergeTest.java | 47 +++++++++++++++++++++++ java/rocksjni.pom | 2 +- java/rocksjni/options.cc | 20 ++++++---- 6 files changed, 67 insertions(+), 10 deletions(-) diff --git a/java/Makefile 
b/java/Makefile index 7a0edb3cf..e2ec5e06d 100644 --- a/java/Makefile +++ b/java/Makefile @@ -56,6 +56,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.ComparatorTest\ org.rocksdb.test.DBOptionsTest\ org.rocksdb.test.DirectComparatorTest\ + org.rocksdb.test.EnvironmentTest\ org.rocksdb.test.FilterTest\ org.rocksdb.test.FlushTest\ org.rocksdb.test.InfoLogLevelTest\ @@ -70,6 +71,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.RocksDBTest\ org.rocksdb.test.RocksEnvTest\ org.rocksdb.test.RocksIteratorTest\ + org.rocksdb.test.SizeUnitTest\ org.rocksdb.test.SnapshotTest\ org.rocksdb.test.StatisticsCollectorTest\ org.rocksdb.test.WriteBatchHandlerTest\ @@ -130,7 +132,7 @@ resolve_test_deps: test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar test: java resolve_test_deps - java -ea -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) + java -ea -Xcheck:jni -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/org/rocksdb/ColumnFamilyOptions.java index 3d3b236a2..86e42bf7d 100644 --- a/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/org/rocksdb/ColumnFamilyOptions.java @@ -127,6 +127,8 @@ public class ColumnFamilyOptions extends RocksObject @Override public ColumnFamilyOptions setMergeOperatorName(String name) { + assert (isInitialized()); + assert (name != null); setMergeOperatorName(nativeHandle_, name); return this; } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 55f3defd2..68a11e633 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -165,6 +165,8 @@ public class Options extends RocksObject @Override public Options setMergeOperatorName(String name) { + assert (isInitialized()); + assert (name != null); setMergeOperatorName(nativeHandle_, name); return this; } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 3ebd55975..3dacb3923 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -8,6 +8,7 @@ package org.rocksdb.test; import java.util.List; import java.util.ArrayList; +import org.junit.Assert; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -252,4 +253,50 @@ public class MergeTest { } } } + + @Test + public void emptyStringInSetMergeOperatorByName() { + Options opt = null; + ColumnFamilyOptions cOpt = null; + try { + opt = new Options(); + cOpt = new ColumnFamilyOptions(); + opt.setMergeOperatorName(""); + cOpt.setMergeOperatorName(""); + } finally { + if (opt != null) { + opt.dispose(); + } + if (cOpt != null) { + cOpt.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void nullStringInSetMergeOperatorByNameOptions() { + Options opt = null; + try { + opt = new Options(); + opt.setMergeOperatorName(null); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void + nullStringInSetMergeOperatorByNameColumnFamilyOptions() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setMergeOperatorName(null); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } } diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 
e18a7734d..69e124c48 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -118,7 +118,7 @@ maven-surefire-plugin 2.17 - ${argLine} + ${argLine} -Xcheck:jni diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 82fb1fd1b..339a3b095 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -146,10 +146,12 @@ void Java_org_rocksdb_Options_setComparatorHandle__JJ( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_Options_setMergeOperatorName( - JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { - const char* op_name = env->GetStringUTFChars(name, 0); - reinterpret_cast(jhandle)->merge_operator = - rocksdb::MergeOperators::CreateFromStringId(op_name); + JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) { + auto options = reinterpret_cast(jhandle); + const char* op_name = env->GetStringUTFChars(jop_name, 0); + options->merge_operator = rocksdb::MergeOperators::CreateFromStringId( + op_name); + env->ReleaseStringUTFChars(jop_name, op_name); } /* @@ -1884,10 +1886,12 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( - JNIEnv* env, jobject jobj, jlong jhandle, jstring name) { - const char* op_name = env->GetStringUTFChars(name, 0); - reinterpret_cast(jhandle)->merge_operator = - rocksdb::MergeOperators::CreateFromStringId(op_name); + JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) { + auto options = reinterpret_cast(jhandle); + const char* op_name = env->GetStringUTFChars(jop_name, 0); + options->merge_operator = rocksdb::MergeOperators::CreateFromStringId( + op_name); + env->ReleaseStringUTFChars(jop_name, op_name); } /* From 94f70a86b9dc9b1a8beade56faa12d2c695cb81c Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 16 Nov 2014 18:23:06 +0100 Subject: [PATCH 565/829] [RocksJava] Incoroporated changes for D29013 --- java/org/rocksdb/ColumnFamilyOptions.java | 5 ++++- java/org/rocksdb/Options.java | 5 ++++- java/org/rocksdb/test/MergeTest.java | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/org/rocksdb/ColumnFamilyOptions.java index 86e42bf7d..898a6cb45 100644 --- a/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/org/rocksdb/ColumnFamilyOptions.java @@ -128,7 +128,10 @@ public class ColumnFamilyOptions extends RocksObject @Override public ColumnFamilyOptions setMergeOperatorName(String name) { assert (isInitialized()); - assert (name != null); + if (name == null) { + throw new IllegalArgumentException( + "Merge operator name must not be null."); + } setMergeOperatorName(nativeHandle_, name); return this; } diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 68a11e633..7781b80a6 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -166,7 +166,10 @@ public class Options extends RocksObject @Override public Options setMergeOperatorName(String name) { assert (isInitialized()); - assert (name != null); + if (name == null) { + throw new IllegalArgumentException( + "Merge operator name must not be null."); + } setMergeOperatorName(nativeHandle_, name); return this; } diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index 3dacb3923..f90b0b0c1 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -273,7 +273,7 @@ public class MergeTest { } } - @Test(expected = AssertionError.class) + @Test(expected = 
IllegalArgumentException.class) public void nullStringInSetMergeOperatorByNameOptions() { Options opt = null; try { @@ -286,7 +286,7 @@ public class MergeTest { } } - @Test(expected = AssertionError.class) + @Test(expected = IllegalArgumentException.class) public void nullStringInSetMergeOperatorByNameColumnFamilyOptions() { ColumnFamilyOptions opt = null; From f193deea31d4509e2d20d3544d1f08d9ddf7c61a Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 23:08:53 +0100 Subject: [PATCH 566/829] [RocksJava] Addressed comments in D28971 --- java/Makefile | 2 +- java/org/rocksdb/Checkpoint.java | 21 ++++++++++++++------- java/org/rocksdb/test/CheckPointTest.java | 4 ++-- java/rocksjni/checkpoint.cc | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/java/Makefile b/java/Makefile index 1fbb40c85..a48dd0843 100644 --- a/java/Makefile +++ b/java/Makefile @@ -51,7 +51,7 @@ endif JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.BackupableDBTest\ org.rocksdb.test.BlockBasedTableConfigTest\ - org.rocksdb.test.CheckpointTest\ + org.rocksdb.test.CheckPointTest\ org.rocksdb.test.ColumnFamilyOptionsTest\ org.rocksdb.test.ColumnFamilyTest\ org.rocksdb.test.ComparatorOptionsTest\ diff --git a/java/org/rocksdb/Checkpoint.java b/java/org/rocksdb/Checkpoint.java index 0830f2fb1..2525bb08b 100644 --- a/java/org/rocksdb/Checkpoint.java +++ b/java/org/rocksdb/Checkpoint.java @@ -17,15 +17,21 @@ public class Checkpoint extends RocksObject { * * @param db {@link RocksDB} instance. * @return a Checkpoint instance. + * + * @throws java.lang.IllegalArgumentException if {@link RocksDB} + * instance is null. + * @throws java.lang.IllegalStateException if {@link RocksDB} + * instance is not initialized. */ public static Checkpoint create(RocksDB db) { - if (db == null || !db.isInitialized()) { + if (db == null) { throw new IllegalArgumentException( - "RocksDB instance needs to be initialized."); + "RocksDB instance shall not be null."); + } else if (!db.isInitialized()) { + throw new IllegalStateException( + "RocksDB instance must be initialized."); } - Checkpoint checkpoint = new Checkpoint( - newCheckpoint(db.nativeHandle_)); - checkpoint.db_ = db; + Checkpoint checkpoint = new Checkpoint(db); return checkpoint; } @@ -50,9 +56,10 @@ public class Checkpoint extends RocksObject { disposeInternal(nativeHandle_); } - private Checkpoint(long handle) { + private Checkpoint(RocksDB db) { super(); - nativeHandle_ = handle; + nativeHandle_ = newCheckpoint(db.nativeHandle_); + db_ = db; } RocksDB db_; diff --git a/java/org/rocksdb/test/CheckPointTest.java b/java/org/rocksdb/test/CheckPointTest.java index 63e996622..3891e062e 100644 --- a/java/org/rocksdb/test/CheckPointTest.java +++ b/java/org/rocksdb/test/CheckPointTest.java @@ -12,7 +12,7 @@ import org.rocksdb.RocksDBException; import static org.assertj.core.api.Assertions.assertThat; -public class CheckpointTest { +public class CheckPointTest { @ClassRule public static final RocksMemoryResource rocksMemoryResource = @@ -74,7 +74,7 @@ public class CheckpointTest { Checkpoint.create(null); } - @Test(expected = IllegalArgumentException.class) + @Test(expected = IllegalStateException.class) public void failIfDbNotInitialized() throws RocksDBException { RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); db.dispose(); diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc index 841144d81..72a40be00 100644 --- a/java/rocksjni/checkpoint.cc +++ b/java/rocksjni/checkpoint.cc @@ -58,4 +58,4 @@ void 
Java_org_rocksdb_Checkpoint_createCheckpoint( if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } -} \ No newline at end of file +} From 805bac6d25f548c8864f4e70f8136802c0c3da28 Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Tue, 25 Nov 2014 23:07:40 +0000 Subject: [PATCH 567/829] Add test for upper bounds on iterators using C interface. --- db/c_test.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/db/c_test.c b/db/c_test.c index ed9a62a9d..ba239a43d 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -823,6 +823,66 @@ int main(int argc, char** argv) { rocksdb_cuckoo_options_destroy(cuckoo_options); } + StartPhase("iterate_upper_bound"); + { + // Create new empty database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor(options, NULL); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err); + rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "g1", "0"); + + rocksdb_iter_destroy(iter); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + // iterate_upper_bound points beyond the last expected entry + rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + // should stop here... + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_destroy(iter); + } + } + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); From 9d5019327bfd10e10cc0d0d27119d5c82e0fa98c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 25 Nov 2014 23:28:36 -0800 Subject: [PATCH 568/829] Replace log2 by implementing Log2 in options_builder Summary: log2 function is only used in options_builder, and this function is not available under certain platform such as android. This patch implements Log2 by log(n) / log(2). Test Plan: make --- util/options_builder.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/options_builder.cc b/util/options_builder.cc index a92a5e86e..01774b608 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -11,6 +11,10 @@ namespace rocksdb { namespace { +double Log2(double n) { + return log(n) / log(2); +} + // For now, always use 1-0 as level bytes multiplier. 
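As a standalone aside (hypothetical names, not part of the patch): the helper works because log2(n) equals log(n) / log(2) for any positive n, so builds whose toolchain lacks log2() get the same level estimate. A minimal sketch in the spirit of the surrounding options_builder code:

// Standalone sketch (hypothetical names, not RocksDB code) of the identity
// the patch relies on: log2(n) == log(n) / log(2). Useful on platforms such
// as older Android toolchains where log2() itself is unavailable.
#include <cassert>
#include <cmath>
#include <cstdio>

namespace {

double PortableLog2(double n) {
  // Natural-log quotient; equivalent to log2(n) for n > 0.
  return std::log(n) / std::log(2.0);
}

}  // namespace

int main() {
  // Example in the spirit of options_builder: how many write-buffer-sized
  // doublings does it take to reach a database roughly 1000x the write buffer?
  const double size_ratio = 1000.0;  // target_db_size / write_buffer_size
  const int expected_max_files =
      static_cast<int>(std::ceil(PortableLog2(size_ratio)));
  std::printf("log2(%.0f) ~= %.3f, ceil = %d\n",
              size_ratio, PortableLog2(size_ratio), expected_max_files);
  assert(expected_max_files == 10);  // log2(1000) is about 9.97
  return 0;
}
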
const int kBytesForLevelMultiplier = 10; const size_t kBytesForOneMb = 1024 * 1024; @@ -28,7 +32,7 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, ::log(target_db_size / write_buffer_size) / ::log(kBytesForLevelMultiplier))); int expected_max_files_universal = - static_cast(ceil(log2(target_db_size / write_buffer_size))); + static_cast(ceil(Log2(target_db_size / write_buffer_size))); const int kEstimatedLevel0FilesInLevelStyle = 2; // Estimate write amplification: From a97314219e3a3a11b1b9e30d05e3923b2bf519a6 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 25 Nov 2014 23:39:52 -0800 Subject: [PATCH 569/829] Fix compile error in ROCKSDB_LITE --- util/env_posix.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/util/env_posix.cc b/util/env_posix.cc index a850ed130..039e79c4a 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1667,14 +1667,18 @@ class PosixEnv : public Env { BGThreadMetadata* meta = reinterpret_cast(arg); size_t thread_id = meta->thread_id_; ThreadPool* tp = meta->thread_pool_; +#if ROCKSDB_USING_THREAD_STATUS // for thread-status thread_local_status.SetThreadType( (tp->GetThreadPriority() == Env::Priority::HIGH ? ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY : ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY)); +#endif delete meta; tp->BGThread(thread_id); +#if ROCKSDB_USING_THREAD_STATUS thread_local_status.UnregisterThread(); +#endif return nullptr; } From 26109d487a2e7b6ce20cb5510cc6364d8685f441 Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Tue, 25 Nov 2014 23:08:59 +0000 Subject: [PATCH 570/829] Store upper bound `Slice` with the same lifetime as the `ReadOptions` so that we can provide a pointer to it. --- db/c.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/db/c.cc b/db/c.cc index b5bd38daf..857f4e654 100644 --- a/db/c.cc +++ b/db/c.cc @@ -80,7 +80,10 @@ struct rocksdb_writebatch_t { WriteBatch rep; }; struct rocksdb_snapshot_t { const Snapshot* rep; }; struct rocksdb_flushoptions_t { FlushOptions rep; }; struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; }; -struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_readoptions_t { + ReadOptions rep; + Slice upper_bound; // stack variable to set pointer to in ReadOptions +}; struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_options_t { Options rep; }; struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; }; @@ -1891,8 +1894,14 @@ void rocksdb_readoptions_set_snapshot( void rocksdb_readoptions_set_iterate_upper_bound( rocksdb_readoptions_t* opt, const char* key, size_t keylen) { - Slice prefix = Slice(key, keylen); - opt->rep.iterate_upper_bound = &prefix; + if (key == nullptr) { + opt->upper_bound = Slice(); + opt->rep.iterate_upper_bound = nullptr; + + } else { + opt->upper_bound = Slice(key, keylen); + opt->rep.iterate_upper_bound = &opt->upper_bound; + } } void rocksdb_readoptions_set_read_tier( From beb74c14ca55a9ffb339aa26b6a037bc34316ee9 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 26 Nov 2014 09:39:11 -0800 Subject: [PATCH 571/829] Fix travis-build error Summary: Fix travis-build error --- util/options_builder.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/options_builder.cc b/util/options_builder.cc index 01774b608..d5cf4b20a 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -11,8 +11,12 @@ namespace rocksdb { namespace { -double Log2(double n) { +inline double Log2(double n) { +#ifndef OS_ANDROID + 
return log2(n); +#else return log(n) / log(2); +#endif } // For now, always use 1-0 as level bytes multiplier. From 2a792cd30072eeee678ad9aa93ed4d51dae7ac62 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Wed, 26 Nov 2014 18:07:06 +0000 Subject: [PATCH 572/829] There will also be a librocksdbjni-osx.jnilib.dSYM folder on MacOSX builds to be deleted --- java/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/Makefile b/java/Makefile index 7a0edb3cf..15dd13980 100644 --- a/java/Makefile +++ b/java/Makefile @@ -91,7 +91,7 @@ clean: rm -rf javadoc/* rm -rf test-libs/ rm -rf target - rm -f librocksdbjni* + rm -rf librocksdbjni* rm -f rocksdbjni* From ff0cb90d1c574aa01cda4d65303d4f243f2d7292 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Wed, 26 Nov 2014 18:08:09 +0000 Subject: [PATCH 573/829] Do not delete Java Fatal Error Log, developers may still want these for reference --- java/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/java/Makefile b/java/Makefile index 15dd13980..3a2ef8c74 100644 --- a/java/Makefile +++ b/java/Makefile @@ -86,7 +86,6 @@ JAVA_TESTCLASSPATH = $(ROCKSDB_JAR):$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_M clean: -find . -name "*.class" -exec rm {} \; - -find . -name "hs*.log" -exec rm {} \; rm -rf include/* rm -rf javadoc/* rm -rf test-libs/ From 73d72ed5c7b45342e28a47d9ccad3ba1a790c80c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 26 Nov 2014 11:37:59 -0800 Subject: [PATCH 574/829] Block ReadOnlyDB in ROCKSDB_LITE Summary: db_imp_readonly.o is one of the big obj file. If it's not a necessary feature, we should probably block it in ROCKSDB_LITE. 1322704 Nov 24 16:55 db/db_impl_readonly.o Test Plan: make shared_lib -j32 make ROCKSDB_LITE shared_lib -j32 Reviewers: ljin, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29583 --- db/db_impl_readonly.cc | 20 ++++++++++++++++++-- db/db_impl_readonly.h | 5 +++++ include/rocksdb/db.h | 6 ++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 298944f62..8b0beb7e0 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
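An illustrative caller-side sketch (the path and key below are placeholders, not taken from this patch): with read-only opens compiled out of lite builds, a caller can detect the Status::NotSupported result this change introduces and fall back to a regular open:

// Hedged usage sketch, assuming a database already exists at the placeholder
// path. Shows how code might cope when OpenForReadOnly() is unavailable.
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  rocksdb::Status s =
      rocksdb::DB::OpenForReadOnly(options, "/tmp/example_db", &db);
  if (s.IsNotSupported()) {
    // ROCKSDB_LITE build: read-only open is compiled out, so fall back to a
    // normal read-write open (or report the error, depending on the caller).
    s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
  }
  if (!s.ok()) {
    return 1;
  }
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "example_key", &value);  // reads only
  delete db;
  return (s.ok() || s.IsNotFound()) ? 0 : 1;
}
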
+ #include "db/db_impl_readonly.h" #include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" @@ -13,6 +14,8 @@ namespace rocksdb { +#ifndef ROCKSDB_LITE + DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { @@ -97,12 +100,10 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, // Try to first open DB as fully compacted DB Status s; -#ifndef ROCKSDB_LITE s = CompactedDBImpl::Open(options, dbname, dbptr); if (s.ok()) { return s; } -#endif DBOptions db_options(options); ColumnFamilyOptions cf_options(options); @@ -167,5 +168,20 @@ Status DB::OpenForReadOnly( return s; } +#else // !ROCKSDB_LITE + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool error_if_log_file_exist) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_log_file_exist) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index d84b23f18..25fcb4350 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -4,6 +4,9 @@ // of patent rights can be found in the PATENTS file in the same directory. #pragma once + +#ifndef ROCKSDB_LITE + #include "db/db_impl.h" #include #include @@ -100,3 +103,5 @@ class DBImplReadOnly : public DBImpl { void operator=(const DBImplReadOnly&); }; } + +#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 326989418..72878ff57 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -101,6 +101,9 @@ class DB { // that modify data, like put/delete, will return error. // If the db is opened in read only mode, then no compactions // will happen. + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, bool error_if_log_file_exist = false); @@ -110,6 +113,9 @@ class DB { // database that should be opened. However, you always need to specify default // column family. The default column family name is 'default' and it's stored // in rocksdb::kDefaultColumnFamilyName + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. static Status OpenForReadOnly( const DBOptions& db_options, const std::string& name, const std::vector& column_families, From 67cb7ca758b9b632ca6956dc26d2ef20d1ee627b Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 26 Nov 2014 20:51:50 +0100 Subject: [PATCH 575/829] [RocksJava] Fixed MacOS build of RocksJava There were still some precision loss problems remainging in RocksJava. This pull request resolve these. 
--- java/rocksjni/backupablejni.cc | 7 +++++-- java/rocksjni/restorejni.cc | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 8007e2ce0..d26e46e88 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -112,8 +112,11 @@ jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( } // Store ints in java array jintArray ret_backup_ids; - ret_backup_ids = env->NewIntArray(kIdSize); - env->SetIntArrayRegion(ret_backup_ids, 0, kIdSize, int_backup_ids); + // Its ok to loose precision here (64->32) + jsize ret_backup_ids_size = static_cast(kIdSize); + ret_backup_ids = env->NewIntArray(ret_backup_ids_size); + env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, + int_backup_ids); return ret_backup_ids; } diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 4fe747e10..a2341632b 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -166,8 +166,11 @@ jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( } // Store ints in java array jintArray ret_backup_ids; - ret_backup_ids = env->NewIntArray(kIdSize); - env->SetIntArrayRegion(ret_backup_ids, 0, kIdSize, int_backup_ids); + // Its ok to loose precision here (64->32) + jsize ret_backup_ids_size = static_cast(kIdSize); + ret_backup_ids = env->NewIntArray(ret_backup_ids_size); + env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, + int_backup_ids); return ret_backup_ids; } From bcf9086899065d46ef3efc738afeac18a130f6a3 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 26 Nov 2014 15:45:11 -0800 Subject: [PATCH 576/829] Block Universal and FIFO compactions in ROCKSDB_LITE Summary: Block Universal and FIFO compactions in ROCKSDB_LITE Test Plan: make shared_lib -j32 make OPT=-DROCKSDB_LITE shared_lib Reviewers: ljin, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29589 --- db/column_family.cc | 10 ++++--- db/compaction_picker.cc | 6 +++- db/compaction_picker.h | 60 ++++++++++++++++++++------------------- include/rocksdb/options.h | 23 ++++++++++----- util/options_builder.cc | 10 +++++++ 5 files changed, 68 insertions(+), 41 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 8456ed9ca..84fd1ac4b 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -253,12 +253,13 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options->env, this)); table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); - if (ioptions_.compaction_style == kCompactionStyleUniversal) { - compaction_picker_.reset( - new UniversalCompactionPicker(ioptions_, &internal_comparator_)); - } else if (ioptions_.compaction_style == kCompactionStyleLevel) { + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); +#ifndef ROCKSDB_LITE + } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset( + new UniversalCompactionPicker(ioptions_, &internal_comparator_)); } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { compaction_picker_.reset( new FIFOCompactionPicker(ioptions_, &internal_comparator_)); @@ -269,6 +270,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, "Column family %s does not use any background compaction. 
" "Compactions can only be done via CompactFiles\n", GetName().c_str()); +#endif // !ROCKSDB_LITE } else { Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log, "Unable to recognize the specified compaction style %d. " diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 9ea4c187a..213daefc1 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -387,6 +387,7 @@ Compaction* CompactionPicker::CompactRange( begin = nullptr; end = nullptr; } + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; @@ -676,7 +677,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::OK(); } -#endif // ROCKSDB_LITE +#endif // !ROCKSDB_LITE bool LevelCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) const { @@ -842,6 +843,7 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( return c; } +#ifndef ROCKSDB_LITE bool UniversalCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage) const { const int kLevel0 = 0; @@ -1325,4 +1327,6 @@ Compaction* FIFOCompactionPicker::CompactRange( return c; } +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h index f5bb2f256..cfed5109d 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -169,6 +169,36 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; + + // Returns current_num_levels - 2, meaning the last level cannot be + // compaction input level. + virtual int MaxInputLevel(int current_num_levels) const override { + return current_num_levels - 2; + } + + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; + + private: + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. + // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. + Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int level, + double score); +}; + +#ifndef ROCKSDB_LITE class UniversalCompactionPicker : public CompactionPicker { public: UniversalCompactionPicker(const ImmutableCFOptions& ioptions, @@ -210,35 +240,6 @@ class UniversalCompactionPicker : public CompactionPicker { uint64_t file_size); }; -class LevelCompactionPicker : public CompactionPicker { - public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - // Returns current_num_levels - 2, meaning the last level cannot be - // compaction input level. - virtual int MaxInputLevel(int current_num_levels) const override { - return current_num_levels - 2; - } - - virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const - override; - - private: - // For the specfied level, pick a compaction. - // Returns nullptr if there is no compaction to be done. 
- // If level is 0 and there is already a compaction on that level, this - // function will return nullptr. - Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int level, - double score); -}; - class FIFOCompactionPicker : public CompactionPicker { public: FIFOCompactionPicker(const ImmutableCFOptions& ioptions, @@ -306,6 +307,7 @@ class NullCompactionPicker : public CompactionPicker { return false; } }; +#endif // !ROCKSDB_LITE // Utility function extern uint64_t TotalCompensatedFileSize(const std::vector& files); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 4e20e618f..91c6604ae 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -56,11 +56,18 @@ enum CompressionType : char { }; enum CompactionStyle : char { - kCompactionStyleLevel = 0x0, // level based compaction style - kCompactionStyleUniversal = 0x1, // Universal compaction style - kCompactionStyleFIFO = 0x2, // FIFO compaction style - kCompactionStyleNone = 0x3, // Disable background compaction. Compaction - // jobs are submitted via CompactFiles() + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + // Not supported in ROCKSDB_LITE. + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + // Not supported in ROCKSDB_LITE + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + // Not supported in ROCKSDB_LITE + kCompactionStyleNone = 0x3, }; struct CompactionOptionsFIFO { @@ -101,9 +108,10 @@ struct Options; struct ColumnFamilyOptions { // Some functions that make it easier to optimize RocksDB -#ifndef ROCKSDB_LITE // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls + // + // Not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeForPointLookup( uint64_t block_cache_size_mb); @@ -121,11 +129,12 @@ struct ColumnFamilyOptions { // biggest performance gains. // Note: we might use more memory than memtable_memory_budget during high // write rate period + // + // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); ColumnFamilyOptions* OptimizeUniversalStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); -#endif // ROCKSDB_LITE // ------------------- // Parameters that affect behavior diff --git a/util/options_builder.cc b/util/options_builder.cc index d5cf4b20a..d473ca943 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -28,6 +28,7 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, int read_amp_threshold, int write_amp_threshold, uint64_t target_db_size) { +#ifndef ROCKSDB_LITE // Estimate read amplification and write amplification of two compaction // styles. If there is hard limit to force a choice, make the choice. 
// Otherwise, calculate a score based on threshold and expected value of @@ -78,6 +79,9 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, } else { return kCompactionStyleUniversal; } +#else + return kCompactionStyleLevel; +#endif // !ROCKSDB_LITE } // Pick mem table size @@ -107,12 +111,14 @@ void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) { options->min_write_buffer_number_to_merge = 1; } +#ifndef ROCKSDB_LITE void OptimizeForUniversal(Options* options) { options->level0_file_num_compaction_trigger = 2; options->level0_slowdown_writes_trigger = 30; options->level0_stop_writes_trigger = 40; options->max_open_files = -1; } +#endif // Optimize parameters for level-based compaction void OptimizeForLevel(int read_amplification_threshold, @@ -192,9 +198,13 @@ Options GetOptions(size_t total_write_buffer_limit, options.compaction_style = PickCompactionStyle(write_buffer_size, read_amplification_threshold, write_amplification_threshold, target_db_size); +#ifndef ROCKSDB_LITE if (options.compaction_style == kCompactionStyleUniversal) { OptimizeForUniversal(&options); } else { +#else + { +#endif // !ROCKSDB_LITE OptimizeForLevel(read_amplification_threshold, write_amplification_threshold, target_db_size, &options); } From 0a9a7e753c499db59a62a3232ce2f275e2028339 Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Thu, 27 Nov 2014 13:49:19 -0800 Subject: [PATCH 577/829] added C version of simple_example --- examples/simple_example.c | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 examples/simple_example.c diff --git a/examples/simple_example.c b/examples/simple_example.c new file mode 100644 index 000000000..a64861346 --- /dev/null +++ b/examples/simple_example.c @@ -0,0 +1,42 @@ +#include +#include +#include + +#include "rocksdb/c.h" + +#include // sysconf() - get CPU count + +char DBPath[] = "/tmp/rocksdb_simple_example"; + +int main (int argc, char **argv) { + rocksdb_t *db; + rocksdb_options_t *options = rocksdb_options_create (); + // Optimize RocksDB. 
This is the easiest way to get RocksDB to perform well + int cpus = sysconf (_SC_NPROCESSORS_ONLN); // get number of online cores + rocksdb_options_increase_parallelism (options, cpus); + rocksdb_options_optimize_level_style_compaction (options, 0); + // create the DB if it's not already present + rocksdb_options_set_create_if_missing (options, 1); + + // open DB + char *err; + db = rocksdb_open (options, DBPath, &err); +// assert (!err); + + // Put key-value + rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create (); + const char key[] = "key"; + char *value = "value"; + rocksdb_put (db, writeoptions, key, strlen (key), value, strlen (value), &err); +// assert (!err); + // Get value + rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create (); + size_t len; + value = rocksdb_get (db, readoptions, key, strlen (key), &len, &err); +// assert (!err); + assert (strcmp (value, "value") == 0); + + rocksdb_close (db); + + return 0; +} From 9c34d5e36152f067c1f4068e3f9fdd8405de636e Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Thu, 27 Nov 2014 13:53:04 -0800 Subject: [PATCH 578/829] fix type in C simple example --- examples/simple_example.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_example.c b/examples/simple_example.c index a64861346..ecb4c8f37 100644 --- a/examples/simple_example.c +++ b/examples/simple_example.c @@ -6,7 +6,7 @@ #include // sysconf() - get CPU count -char DBPath[] = "/tmp/rocksdb_simple_example"; +const char DBPath[] = "/tmp/rocksdb_simple_example"; int main (int argc, char **argv) { rocksdb_t *db; From d7f5ccb0c27fa5cd869a6bf822d887a2dd956f69 Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Thu, 27 Nov 2014 15:06:12 -0800 Subject: [PATCH 579/829] add c example to makefile and fix "make clean" --- examples/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/Makefile b/examples/Makefile index ce43785e0..10d5fe190 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ include ../build_config.mk .PHONY: clean -all: simple_example column_families_example compact_files_example +all: simple_example column_families_example compact_files_example simple_example-c simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -13,5 +13,8 @@ column_families_example: column_families_example.cc compact_files_example: compact_files_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -clean: simple_example column_families_example compact_files_example - rm -rf ./simple_example ./column_families_example ./compact_files_example +simple_example-c: + $(CXX) $(CXXFLAGS) -xc $@.c -o$@ ../librocksdb.a -I../include -O2 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +clean: + rm -rf ./simple_example ./column_families_example ./compact_files_example ./simple_example-c From ac4ed1e305dda3d030bf384d8570957107303a9d Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Thu, 27 Nov 2014 15:20:55 -0800 Subject: [PATCH 580/829] fix examples/makefile for C example --- examples/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/Makefile b/examples/Makefile index 10d5fe190..abeee2488 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -13,8 +13,8 @@ column_families_example: column_families_example.cc compact_files_example: 
compact_files_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -simple_example-c: - $(CXX) $(CXXFLAGS) -xc $@.c -o$@ ../librocksdb.a -I../include -O2 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +simple_example-c: simple_example.c + $(CXX) -xc -I../include simple_example.c -L.. -lrocksdb -pthread -lsnappy -lbz2 -lz -lrt clean: rm -rf ./simple_example ./column_families_example ./compact_files_example ./simple_example-c From c6f31a289303794f990b78baac104c1b761ffb32 Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Sat, 29 Nov 2014 21:42:42 -0800 Subject: [PATCH 581/829] minor memory leak in C example --- examples/Makefile | 2 +- examples/simple_example.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/Makefile b/examples/Makefile index abeee2488..96c8bc3cf 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -14,7 +14,7 @@ compact_files_example: compact_files_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) simple_example-c: simple_example.c - $(CXX) -xc -I../include simple_example.c -L.. -lrocksdb -pthread -lsnappy -lbz2 -lz -lrt + $(CXX) -xc -I../include simple_example.c -o$@ -L.. -lrocksdb -pthread -lsnappy -lbz2 -lz -lrt clean: rm -rf ./simple_example ./column_families_example ./compact_files_example ./simple_example-c diff --git a/examples/simple_example.c b/examples/simple_example.c index ecb4c8f37..59848902a 100644 --- a/examples/simple_example.c +++ b/examples/simple_example.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "rocksdb/c.h" @@ -35,6 +36,7 @@ int main (int argc, char **argv) { value = rocksdb_get (db, readoptions, key, strlen (key), &len, &err); // assert (!err); assert (strcmp (value, "value") == 0); + free (value); rocksdb_close (db); From 91d89816391a341cd48ab932ccc864f38f7220c4 Mon Sep 17 00:00:00 2001 From: Stefan Eilemann Date: Wed, 26 Nov 2014 15:48:03 +0100 Subject: [PATCH 582/829] Tweak Makefile for building on BG/Q --- Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1862d2bf9..0b12d90b0 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,16 @@ #----------------------------------------------- +CFLAGS += ${EXTRA_CFLAGS} +CXXFLAGS += ${EXTRA_CXXFLAGS} +LDFLAGS += $(EXTRA_LDFLAGS) +MACHINE ?= $(shell uname -m) + ifneq ($(MAKECMDGOALS),dbg) -OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +OPT += -O2 -fno-omit-frame-pointer +ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer +OPT += -momit-leaf-frame-pointer +endif else # intentionally left blank endif From b426675061741fe0beccb029f19a43fff99dda98 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 1 Dec 2014 19:01:29 +0100 Subject: [PATCH 583/829] [RocksJava] MacOSX strip support --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0b12d90b0..513659900 100644 --- a/Makefile +++ b/Makefile @@ -609,7 +609,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a cd java;$(MAKE) javalib; rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a - cd java;strip $(ROCKSDBJNILIB) + cd java;strip -S -x $(ROCKSDBJNILIB) cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) cd java/javadoc;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org From b036804ac1e570969bd68dcb6e03332acd34f7b9 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 22 Nov 2014 20:59:00 +0100 Subject: [PATCH 584/829] RocksJava - FindBugs issues Addressed some FindBugs issues. --- java/RocksDBColumnFamilySample.java | 3 + java/RocksDBSample.java | 2 +- java/org/rocksdb/ColumnFamilyHandle.java | 2 +- java/org/rocksdb/RocksIterator.java | 2 +- java/org/rocksdb/benchmark/DbBenchmark.java | 180 +++++++++++--------- java/org/rocksdb/test/ColumnFamilyTest.java | 3 + java/org/rocksdb/test/InfoLogLevelTest.java | 9 + 7 files changed, 120 insertions(+), 81 deletions(-) diff --git a/java/RocksDBColumnFamilySample.java b/java/RocksDBColumnFamilySample.java index 200e53a1d..5515845cb 100644 --- a/java/RocksDBColumnFamilySample.java +++ b/java/RocksDBColumnFamilySample.java @@ -44,6 +44,9 @@ public class RocksDBColumnFamilySample { db.close(); db = null; } + if (options != null) { + options.dispose(); + } } // open DB with two column families diff --git a/java/RocksDBSample.java b/java/RocksDBSample.java index 302d4e04d..84cf6404f 100644 --- a/java/RocksDBSample.java +++ b/java/RocksDBSample.java @@ -119,7 +119,7 @@ public class RocksDBSample { byte[] value = db.get("hello".getBytes()); assert("world".equals(new String(value))); String str = db.getProperty("rocksdb.stats"); - assert(str != null && str != ""); + assert(str != null && !str.equals("")); } catch (RocksDBException e) { System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); assert(db == null); diff --git a/java/org/rocksdb/ColumnFamilyHandle.java b/java/org/rocksdb/ColumnFamilyHandle.java index ed8417728..835628702 100644 --- a/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/org/rocksdb/ColumnFamilyHandle.java @@ -40,5 +40,5 @@ public class ColumnFamilyHandle extends RocksObject { private native void disposeInternal(long handle); - private RocksDB rocksDB_; + private final RocksDB rocksDB_; } diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index b947b2c83..1abe7e704 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -162,5 +162,5 @@ public class RocksIterator extends RocksObject { private native void seek0(long handle, byte[] target, int targetLen); private native void status0(long handle); - RocksDB rocksDB_; + final RocksDB rocksDB_; } diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index d3d9f8c58..26b295f7b 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -459,16 +459,22 @@ public class DbBenchmark { compressionType_ = (String) flags.get(Flag.compression_type); compression_ = CompressionType.NONE; try { - if (compressionType_.equals("snappy")) { - System.loadLibrary("snappy"); - } else if (compressionType_.equals("zlib")) { - System.loadLibrary("z"); - } else if (compressionType_.equals("bzip2")) { - System.loadLibrary("bzip2"); - } else if (compressionType_.equals("lz4")) { - System.loadLibrary("lz4"); - } else if (compressionType_.equals("lz4hc")) { - System.loadLibrary("lz4hc"); + switch 
(compressionType_) { + case "snappy": + System.loadLibrary("snappy"); + break; + case "zlib": + System.loadLibrary("z"); + break; + case "bzip2": + System.loadLibrary("bzip2"); + break; + case "lz4": + System.loadLibrary("lz4"); + break; + case "lz4hc": + System.loadLibrary("lz4hc"); + break; } } catch (UnsatisfiedLinkError e) { System.err.format("Unable to load %s library:%s%n" + @@ -495,26 +501,32 @@ public class DbBenchmark { } else { options.setCreateIfMissing(false); } - if (memtable_.equals("skip_list")) { - options.setMemTableConfig(new SkipListMemTableConfig()); - } else if (memtable_.equals("vector")) { - options.setMemTableConfig(new VectorMemTableConfig()); - } else if (memtable_.equals("hash_linkedlist")) { - options.setMemTableConfig( - new HashLinkedListMemTableConfig() - .setBucketCount(hashBucketCount_)); - options.useFixedLengthPrefixExtractor(prefixSize_); - } else if (memtable_.equals("hash_skiplist") || - memtable_.equals("prefix_hash")) { - options.setMemTableConfig( - new HashSkipListMemTableConfig() - .setBucketCount(hashBucketCount_)); - options.useFixedLengthPrefixExtractor(prefixSize_); - } else { - System.err.format( - "unable to detect the specified memtable, " + - "use the default memtable factory %s%n", - options.memTableFactoryName()); + switch (memtable_) { + case "skip_list": + options.setMemTableConfig(new SkipListMemTableConfig()); + break; + case "vector": + options.setMemTableConfig(new VectorMemTableConfig()); + break; + case "hash_linkedlist": + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + break; + case "hash_skiplist": + case "prefix_hash": + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + break; + default: + System.err.format( + "unable to detect the specified memtable, " + + "use the default memtable factory %s%n", + options.memTableFactoryName()); + break; } if (usePlainTable_) { options.setTableFormatConfig( @@ -645,53 +657,65 @@ public class DbBenchmark { int currentTaskId = 0; boolean known = true; - if (benchmark.equals("fillseq")) { - tasks.add(new WriteSequentialTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("fillbatch")) { - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); - } else if (benchmark.equals("fillrandom")) { - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("filluniquerandom")) { - tasks.add(new WriteUniqueRandomTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("fillsync")) { - writeOpt.setSync(true); - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_ / 1000, - writeOpt, 1)); - } else if (benchmark.equals("readseq")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadSequentialTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readrandom")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readwhilewriting")) { - WriteTask writeTask = new WriteRandomTask( - -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_); - writeTask.stats_.setExcludeFromMerge(); - bgTasks.add(writeTask); - for (int 
t = 0; t < threadNum_; ++t) { - tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readhot")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100)); - } - } else if (benchmark.equals("delete")) { - destroyDb(); - open(options); - } else { - known = false; - System.err.println("Unknown benchmark: " + benchmark); + switch (benchmark) { + case "fillseq": + tasks.add(new WriteSequentialTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "fillbatch": + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); + break; + case "fillrandom": + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "filluniquerandom": + tasks.add(new WriteUniqueRandomTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "fillsync": + writeOpt.setSync(true); + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_ / 1000, num_ / 1000, + writeOpt, 1)); + break; + case "readseq": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadSequentialTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readrandom": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readwhilewriting": + WriteTask writeTask = new WriteRandomTask( + -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_); + writeTask.stats_.setExcludeFromMerge(); + bgTasks.add(writeTask); + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readhot": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100)); + } + break; + case "delete": + destroyDb(); + open(options); + break; + default: + known = false; + System.err.println("Unknown benchmark: " + benchmark); + break; } if (known) { ExecutorService executor = Executors.newCachedThreadPool(); @@ -800,7 +824,7 @@ public class DbBenchmark { System.out.printf( "%-16s : %11.5f micros/op; %6.1f MB/s;%s %d / %d task(s) finished.\n", - benchmark, (double) elapsedSeconds / stats.done_ * 1e6, + benchmark, elapsedSeconds / stats.done_ * 1e6, (stats.bytes_ / 1048576.0) / elapsedSeconds, extra, taskFinishedCount, concurrentThreads); } diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 92f977ce3..5b51ee718 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -171,6 +171,9 @@ public class ColumnFamilyTest { if (db != null) { db.close(); } + if (options != null) { + options.dispose(); + } } } diff --git a/java/org/rocksdb/test/InfoLogLevelTest.java b/java/org/rocksdb/test/InfoLogLevelTest.java index f96ca92b9..7a04160b4 100644 --- a/java/org/rocksdb/test/InfoLogLevelTest.java +++ b/java/org/rocksdb/test/InfoLogLevelTest.java @@ -57,6 +57,9 @@ public class InfoLogLevelTest { if (db != null) { db.close(); } + if (options != null) { + options.dispose(); + } } } @@ -84,6 +87,12 @@ public class InfoLogLevelTest { if (db != null) { db.close(); } + if (options != null) { + options.dispose(); + } + if (dbOptions != null) { + dbOptions.dispose(); + } } } From 
335e6ad5cd38e5a595f62840704f7ffa4195448e Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 20:52:46 +0100 Subject: [PATCH 585/829] [RocksJava] Remove obsolete dbFolder cleanup --- .../rocksdb/test/AbstractComparatorTest.java | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index e3e2f8849..6694b5be2 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -95,8 +95,6 @@ public abstract class AbstractComparatorTest { if (opt != null) { opt.dispose(); } - - removeDb(db_path); // cleanup after ourselves! } } @@ -127,43 +125,4 @@ public abstract class AbstractComparatorTest { return result; } - - /** - * Utility method for deleting database files - * - * @param db_path The path to the database to remove - * from the filesystem - */ - private static void removeDb(final Path db_path) throws IOException { - Files.walkFileTree(db_path, new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(final Path file, final BasicFileAttributes attrs) - throws IOException { - Files.delete(file); - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult visitFileFailed(final Path file, IOException exc) - throws IOException { - // try to delete the file anyway, even if its attributes - // could not be read, since delete-only access is - // theoretically possible - Files.delete(file); - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult postVisitDirectory(final Path dir, IOException exc) - throws IOException { - if (exc == null) { - Files.delete(dir); - return FileVisitResult.CONTINUE; - } else { - // directory iteration failed; propagate exception - throw exc; - } - } - }); - } } From e002a6122fdcd2508a7fa14e1ef82659cebc1019 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 21:32:19 +0100 Subject: [PATCH 586/829] [RocksJava] Comparator tests for CF - Added AbstractComparatorTest. - Fixed a bug in the JNI Part about Java comparators --- .../rocksdb/test/AbstractComparatorTest.java | 99 +++++++++++++++++-- java/org/rocksdb/test/ComparatorTest.java | 28 +++++- .../rocksdb/test/DirectComparatorTest.java | 2 +- java/rocksjni/options.cc | 4 +- 4 files changed, 123 insertions(+), 10 deletions(-) diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index 6694b5be2..04abeb34d 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -9,7 +9,8 @@ import org.rocksdb.*; import java.io.IOException; import java.nio.file.*; -import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.List; import java.util.Random; import static org.assertj.core.api.Assertions.assertThat; @@ -40,7 +41,7 @@ public abstract class AbstractComparatorTest { * * @throws java.io.IOException if IO error happens. 
*/ - public void testRoundtrip(final Path db_path) throws IOException { + public void testRoundtrip(final Path db_path) throws IOException, RocksDBException { Options opt = null; RocksDB db = null; @@ -65,7 +66,6 @@ public abstract class AbstractComparatorTest { } db.close(); - // re-open db and read from start to end // integer keys should be in ascending // order as defined by SimpleIntComparator @@ -84,9 +84,6 @@ public abstract class AbstractComparatorTest { assertThat(count).isEqualTo(ITERATIONS); - } catch (final RocksDBException e) { - System.err.format("[ERROR]: %s%n", e); - e.printStackTrace(); } finally { if (db != null) { db.close(); @@ -98,6 +95,96 @@ public abstract class AbstractComparatorTest { } } + /** + * Test which stores random keys into a column family + * in the database + * using an @see getAscendingIntKeyComparator + * it then checks that these keys are read back in + * ascending order + * + * @param db_path A path where we can store database + * files temporarily + * + * @throws java.io.IOException if IO error happens. + */ + public void testRoundtripCf(final Path db_path) throws IOException, + RocksDBException { + + DBOptions opt = null; + RocksDB db = null; + List cfDescriptors = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf", + new ColumnFamilyOptions().setComparator( + getAscendingIntKeyComparator()))); + List cfHandles = new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if (i > 0 && db.get(cfHandles.get(1), key) != null) { // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(cfHandles.get(1), key, "value".getBytes()); + } + } + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + cfHandles.clear(); + db.close(); + + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + final RocksIterator it = db.newIterator(cfHandles.get(1)); + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assertThat(thisKey).isGreaterThan(lastKey); + lastKey = thisKey; + count++; + } + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + cfHandles.clear(); + db.close(); + assertThat(count).isEqualTo(ITERATIONS); + + } finally { + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + + if (db != null) { + db.close(); + } + + if (opt != null) { + opt.dispose(); + } + } + } + /** * Compares integer keys * so that they are in ascending order diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index 299d8f62d..290ff21f7 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -26,7 +26,7 @@ public 
class ComparatorTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void javaComparator() throws IOException { + public void javaComparator() throws IOException, RocksDBException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override @@ -51,6 +51,32 @@ public class ComparatorTest { dbFolder.getRoot().getAbsolutePath())); } + @Test + public void javaComparatorCf() throws IOException, RocksDBException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new Comparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compareIntKeys(a.data(), b.data()); + } + }; + } + }; + + // test the round-tripability of keys written and read with the Comparator + comparatorTest.testRoundtripCf(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); + } + @Test public void builtinForwardComparator() throws RocksDBException { diff --git a/java/org/rocksdb/test/DirectComparatorTest.java b/java/org/rocksdb/test/DirectComparatorTest.java index f09d94843..328ea0089 100644 --- a/java/org/rocksdb/test/DirectComparatorTest.java +++ b/java/org/rocksdb/test/DirectComparatorTest.java @@ -23,7 +23,7 @@ public class DirectComparatorTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void directComparator() throws IOException { + public void directComparator() throws IOException, RocksDBException { final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { @Override diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 339a3b095..d139b1a57 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1874,9 +1874,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( * Method: setComparatorHandle * Signature: (JJ)V */ -void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJ( JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { - reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jopt_handle)->comparator = reinterpret_cast(jcomparator_handle); } From b7f9e644cc0050b1e9cb7f9d63d565e8255a4a02 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 22:13:23 +0100 Subject: [PATCH 587/829] [RocksJava] Quality improvements Summary: - Addressed some FindBugs issues. - Remove obsolete dbFolder cleanup - Comparator tests for CF - Added AbstractComparatorTest. 
- Fixed a bug in the JNI Part about Java comparators - Minor test improvements Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom package Reviewers: adamretter, yhchiang, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29571 --- java/org/rocksdb/test/BackupableDBTest.java | 16 +- .../test/BlockBasedTableConfigTest.java | 3 + java/org/rocksdb/test/ComparatorTest.java | 3 + java/org/rocksdb/test/InfoLogLevelTest.java | 11 + .../rocksdb/test/PlainTableConfigTest.java | 4 + .../rocksdb/test/WriteBatchHandlerTest.java | 248 +++++++++--------- 6 files changed, 161 insertions(+), 124 deletions(-) diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java index 3da519418..2ac2abfa1 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/org/rocksdb/test/BackupableDBTest.java @@ -215,14 +215,26 @@ public class BackupableDBTest { bdb.createNewBackup(true); bdb.createNewBackup(true); bdb.createNewBackup(true); - verifyNumberOfValidBackups(bdb, 4); + List infos = verifyNumberOfValidBackups(bdb, 4); + assertThat(infos.get(1).size()). + isEqualTo(infos.get(2).size()); + assertThat(infos.get(1).numberFiles()). + isEqualTo(infos.get(2).numberFiles()); + long maxTimeBeforePurge = Long.MIN_VALUE; + for (BackupInfo backupInfo : infos) { + if (maxTimeBeforePurge < backupInfo.timestamp()) { + maxTimeBeforePurge = backupInfo.timestamp(); + } + } // init RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); // the same number of backups must // exist using RestoreBackupableDB. verifyNumberOfValidBackups(rdb, 4); rdb.purgeOldBackups(1); - verifyNumberOfValidBackups(rdb, 1); + infos = verifyNumberOfValidBackups(rdb, 1); + assertThat(infos.get(0).timestamp()). + isEqualTo(maxTimeBeforePurge); } finally { if (bdb != null) { bdb.close(); diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 241429542..5e0b96f29 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -84,6 +84,9 @@ public class BlockBasedTableConfigTest { @Test public void checksumType() { BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(ChecksumType.values().length).isEqualTo(3); + assertThat(ChecksumType.valueOf("kxxHash")). + isEqualTo(ChecksumType.kxxHash); blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); assertThat(blockBasedTableConfig.checksumType().equals( diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/org/rocksdb/test/ComparatorTest.java index 290ff21f7..e1bba6a7f 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/org/rocksdb/test/ComparatorTest.java @@ -221,5 +221,8 @@ public class ComparatorTest { assertThat( BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR.ordinal()) .isEqualTo(1); + assertThat(BuiltinComparator.values().length).isEqualTo(2); + assertThat(BuiltinComparator.valueOf("BYTEWISE_COMPARATOR")). 
+ isEqualTo(BuiltinComparator.BYTEWISE_COMPARATOR); } } diff --git a/java/org/rocksdb/test/InfoLogLevelTest.java b/java/org/rocksdb/test/InfoLogLevelTest.java index 7a04160b4..82bf485de 100644 --- a/java/org/rocksdb/test/InfoLogLevelTest.java +++ b/java/org/rocksdb/test/InfoLogLevelTest.java @@ -96,6 +96,17 @@ public class InfoLogLevelTest { } } + @Test(expected = IllegalArgumentException.class) + public void failIfIllegalByteValueProvided() { + InfoLogLevel.getInfoLogLevel((byte)-1); + } + + @Test + public void valueOf() { + assertThat(InfoLogLevel.valueOf("DEBUG_LEVEL")). + isEqualTo(InfoLogLevel.DEBUG_LEVEL); + } + /** * Read LOG file contents into String. * diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index 72347e7d4..a533141ea 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -63,6 +63,10 @@ public class PlainTableConfigTest { public void encodingType() { PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setEncodingType(EncodingType.kPrefix); + assertThat(EncodingType.valueOf("kPrefix")).isEqualTo( + EncodingType.kPrefix); + assertThat(EncodingType.values().length). + isEqualTo(2); assertThat(plainTableConfig.encodingType()).isEqualTo( EncodingType.kPrefix); } diff --git a/java/org/rocksdb/test/WriteBatchHandlerTest.java b/java/org/rocksdb/test/WriteBatchHandlerTest.java index 5a330e409..ca26c9275 100644 --- a/java/org/rocksdb/test/WriteBatchHandlerTest.java +++ b/java/org/rocksdb/test/WriteBatchHandlerTest.java @@ -5,7 +5,6 @@ package org.rocksdb.test; -import org.rocksdb.RocksDB; import org.rocksdb.RocksDBException; import org.rocksdb.WriteBatch; @@ -13,9 +12,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; + import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.WriteOptions; import static org.assertj.core.api.Assertions.assertThat; @@ -27,143 +26,148 @@ public class WriteBatchHandlerTest { @Test public void writeBatchHandler() throws IOException, RocksDBException { - - // setup test data - final List>> testEvents = new ArrayList<>(); - testEvents.add(new Tuple<>(Action.DELETE, - new Tuple("k0".getBytes(), null))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k1".getBytes(), "v1".getBytes()))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k2".getBytes(), "v2".getBytes()))); - testEvents.add(new Tuple<>(Action.PUT, - new Tuple<>("k3".getBytes(), "v3".getBytes()))); - testEvents.add(new Tuple<>(Action.LOG, - new Tuple(null, "log1".getBytes()))); - testEvents.add(new Tuple<>(Action.MERGE, - new Tuple<>("k2".getBytes(), "v22".getBytes()))); - testEvents.add(new Tuple<>(Action.DELETE, - new Tuple("k3".getBytes(), null))); - - // load test data to the write batch - final WriteBatch batch = new WriteBatch(); - for(final Tuple> testEvent : testEvents) { - final Tuple data = testEvent.value; - switch(testEvent.key) { - - case PUT: - batch.put(data.key, data.value); - break; - - case MERGE: - batch.merge(data.key, data.value); - break; - - case DELETE: - batch.remove(data.key); - break; - - case LOG: - batch.putLogData(data.value); - break; - } + WriteBatch batch = null; + CapturingWriteBatchHandler handler = null; + try { + // setup test data + final List>> testEvents = new ArrayList<>(); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k0".getBytes(), null))); + testEvents.add(new Tuple<>(Action.PUT, + new 
Tuple<>("k1".getBytes(), "v1".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k2".getBytes(), "v2".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k3".getBytes(), "v3".getBytes()))); + testEvents.add(new Tuple<>(Action.LOG, + new Tuple(null, "log1".getBytes()))); + testEvents.add(new Tuple<>(Action.MERGE, + new Tuple<>("k2".getBytes(), "v22".getBytes()))); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k3".getBytes(), null))); + + // load test data to the write batch + batch = new WriteBatch(); + for (final Tuple> testEvent : testEvents) { + final Tuple data = testEvent.value; + switch (testEvent.key) { + + case PUT: + batch.put(data.key, data.value); + break; + + case MERGE: + batch.merge(data.key, data.value); + break; + + case DELETE: + batch.remove(data.key); + break; + + case LOG: + batch.putLogData(data.value); + break; } - - // attempt to read test data back from the WriteBatch by iterating with a handler - final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler(); - batch.iterate(handler); - - // compare the results to the test data - final List>> actualEvents = handler.getEvents(); - assertThat(testEvents.size()).isSameAs(actualEvents.size()); - - for(int i = 0; i < testEvents.size(); i++) { - assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); - } - - System.out.println("Passed WriteBatchHandler Test"); + } + + // attempt to read test data back from the WriteBatch by iterating with a handler + handler = new CapturingWriteBatchHandler(); + batch.iterate(handler); + + // compare the results to the test data + final List>> actualEvents = handler.getEvents(); + assertThat(testEvents.size()).isSameAs(actualEvents.size()); + + for (int i = 0; i < testEvents.size(); i++) { + assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); + } + } finally { + if (handler != null) { + handler.dispose(); + } + if (batch != null) { + batch.dispose(); + } } + } - private static boolean equals(final Tuple> expected, - final Tuple> actual) { - if(!expected.key.equals(actual.key)) { - return false; - } + private static boolean equals(final Tuple> expected, + final Tuple> actual) { + if (!expected.key.equals(actual.key)) { + return false; + } - final Tuple expectedData = expected.value; - final Tuple actualData = actual.value; + final Tuple expectedData = expected.value; + final Tuple actualData = actual.value; - if(equals(expectedData.key, actualData.key)) { - return equals(expectedData.value, actualData.value); - } else { - return false; - } - } + return equals(expectedData.key, actualData.key) + && equals(expectedData.value, actualData.value); + } - private static boolean equals(byte[] expected, byte[] actual) { - if(expected != null) { - return Arrays.equals(expected, actual); - } else { - return actual == null; - } + private static boolean equals(byte[] expected, byte[] actual) { + if (expected != null) { + return Arrays.equals(expected, actual); + } else { + return actual == null; } + } - private static class Tuple { - public final K key; - public final V value; + private static class Tuple { + public final K key; + public final V value; - public Tuple(final K key, final V value) { - this.key = key; - this.value = value; - } + public Tuple(final K key, final V value) { + this.key = key; + this.value = value; } + } + + /** + * Enumeration of Write Batch + * event actions + */ + private enum Action { + PUT, + MERGE, + DELETE, + LOG + } + + /** + * A simple WriteBatch Handler which adds a record + * 
of each event that it receives to a list + */ + private static class CapturingWriteBatchHandler extends WriteBatch.Handler { + + private final List>> events = new ArrayList<>(); /** - * Enumeration of Write Batch - * event actions + * Returns a copy of the current events list + * + * @return a list of the events which have happened upto now */ - private enum Action { - PUT, - MERGE, - DELETE, - LOG + public List>> getEvents() { + return new ArrayList<>(events); } - /** - * A simple WriteBatch Handler which adds a record - * of each event that it receives to a list - */ - private static class CapturingWriteBatchHandler extends WriteBatch.Handler { - - private final List>> events = new ArrayList<>(); - - /** - * Returns a copy of the current events list - * - * @return a list of the events which have happened upto now - */ - public List>> getEvents() { - return new ArrayList<>(events); - } - - @Override - public void put(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); - } + @Override + public void put(final byte[] key, final byte[] value) { + events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); + } - @Override - public void merge(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value))); - } + @Override + public void merge(final byte[] key, final byte[] value) { + events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value))); + } - @Override - public void delete(final byte[] key) { - events.add(new Tuple<>(Action.DELETE, new Tuple(key, null))); - } + @Override + public void delete(final byte[] key) { + events.add(new Tuple<>(Action.DELETE, new Tuple(key, null))); + } - @Override - public void logData(final byte[] blob) { - events.add(new Tuple<>(Action.LOG, new Tuple(null, blob))); - } + @Override + public void logData(final byte[] blob) { + events.add(new Tuple<>(Action.LOG, new Tuple(null, blob))); } + } } From a15169f2e9f4729c73d207862cb0c79f1e7d322c Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 25 Nov 2014 22:30:35 +0100 Subject: [PATCH 588/829] Fixed a Lint problem --- java/org/rocksdb/test/AbstractComparatorTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index 04abeb34d..9d1f2fc64 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -135,7 +135,8 @@ public abstract class AbstractComparatorTest { final Random random = new Random(); for (int i = 0; i < ITERATIONS; i++) { final byte key[] = intToByte(random.nextInt()); - if (i > 0 && db.get(cfHandles.get(1), key) != null) { // does key already exist (avoid duplicates) + if (i > 0 && db.get(cfHandles.get(1), key) != null) { + // does key already exist (avoid duplicates) i--; // generate a different key } else { db.put(cfHandles.get(1), key, "value".getBytes()); From 37d73d597e2613af925f3f531eac1d2477e7d432 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 2 Dec 2014 13:53:39 -0500 Subject: [PATCH 589/829] Fix linters Summary: Two fixes: 1. if cpplint is not present on the system, don't return a confusing error in the linter 2. 
Add include_alpha, which means our includes should be sorted lexicographically Test Plan: Tried unsorting our includes, lint complained Reviewers: rven, ljin, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D28845 --- db/column_family.cc | 6 +++--- linters/cpp_linter/FbcodeCppLinter.php | 5 ++++- linters/cpp_linter/cpplint.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 84fd1ac4b..7ba5ad763 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -19,12 +19,12 @@ #include #include +#include "db/compaction_picker.h" #include "db/db_impl.h" -#include "db/job_context.h" -#include "db/version_set.h" #include "db/internal_stats.h" -#include "db/compaction_picker.h" +#include "db/job_context.h" #include "db/table_properties_collector.h" +#include "db/version_set.h" #include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" diff --git a/linters/cpp_linter/FbcodeCppLinter.php b/linters/cpp_linter/FbcodeCppLinter.php index e62d3bbe1..c7b4935e7 100644 --- a/linters/cpp_linter/FbcodeCppLinter.php +++ b/linters/cpp_linter/FbcodeCppLinter.php @@ -31,7 +31,7 @@ class FbcodeCppLinter extends ArcanistLinter { $this->getEngine()->getFilePathOnDisk($p)); } else { $futures[$p] = new ExecFuture("%s %s 2>&1", - self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p)); + $CPP_LINT, $this->getEngine()->getFilePathOnDisk($p)); } } @@ -68,6 +68,9 @@ class FbcodeCppLinter extends ArcanistLinter { } private function getCppLintOutput($path) { + if (!array_key_exists($path, $this->rawLintOutput)) { + return array(); + } list($output) = $this->rawLintOutput[$path]; $msgs = array(); diff --git a/linters/cpp_linter/cpplint.py b/linters/cpp_linter/cpplint.py index d264b00da..d6201945a 100755 --- a/linters/cpp_linter/cpplint.py +++ b/linters/cpp_linter/cpplint.py @@ -213,7 +213,7 @@ _ERROR_CATEGORIES = [ # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). # All entries here should start with a '-' or '+', as in the --filter= flag. -_DEFAULT_FILTERS = ['-build/include_alpha'] +_DEFAULT_FILTERS = [] # We used to check for high-bit characters, but after much discussion we # decided those were OK, as long as they were in UTF-8 and didn't represent From 3e684aa68557f2589327cb7f5e3ce4aec140cfc5 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 2 Dec 2014 14:29:00 +0100 Subject: [PATCH 590/829] Integrated changes from D29571 --- java/org/rocksdb/CompressionType.java | 54 +++++++++++++++---- java/org/rocksdb/RocksDB.java | 16 +++--- java/org/rocksdb/benchmark/DbBenchmark.java | 35 ++++-------- .../rocksdb/test/CompressionOptionsTest.java | 22 ++++++++ java/org/rocksdb/test/MixedOptionsTest.java | 1 - .../rocksdb/test/PlainTableConfigTest.java | 4 -- 6 files changed, 85 insertions(+), 47 deletions(-) create mode 100644 java/org/rocksdb/test/CompressionOptionsTest.java diff --git a/java/org/rocksdb/CompressionType.java b/java/org/rocksdb/CompressionType.java index 9c158ccf4..c718d26a9 100644 --- a/java/org/rocksdb/CompressionType.java +++ b/java/org/rocksdb/CompressionType.java @@ -14,25 +14,59 @@ package org.rocksdb; * compression method (if any) is used to compress a block.

            */ public enum CompressionType { - NO_COMPRESSION((byte) 0), - SNAPPY_COMPRESSION((byte) 1), - ZLIB_COMPRESSION((byte) 2), - BZLIB2_COMPRESSION((byte) 3), - LZ4_COMPRESSION((byte) 4), - LZ4HC_COMPRESSION((byte) 5); - private final byte value_; + NO_COMPRESSION((byte) 0, null), + SNAPPY_COMPRESSION((byte) 1, "snappy"), + ZLIB_COMPRESSION((byte) 2, "z"), + BZLIB2_COMPRESSION((byte) 3, "bzip2"), + LZ4_COMPRESSION((byte) 4, "lz4"), + LZ4HC_COMPRESSION((byte) 5, "lz4hc"); - private CompressionType(byte value) { - value_ = value; + /** + *

            Get the CompressionType enumeration value by + * passing the library name to this method.

            + * + *

            If library cannot be found the enumeration + * value {@code NO_COMPRESSION} will be returned.

            + * + * @return CompressionType instance. + */ + public static CompressionType getCompressionType(String libraryName) { + if (libraryName != null) { + for (CompressionType compressionType : CompressionType.values()) { + if (compressionType.getLibraryName() != null && + compressionType.getLibraryName().equals(libraryName)) { + return compressionType; + } + } + } + return CompressionType.NO_COMPRESSION; } /** - * Returns the byte value of the enumerations value + *

            Returns the byte value of the enumerations value.

            * * @return byte representation */ public byte getValue() { return value_; } + + /** + *

            Returns the library name of the compression type + * identified by the enumeration value.

            + * + * @return library name + */ + public String getLibraryName() { + return libraryName_; + } + + private CompressionType(byte value, final String libraryName) { + value_ = value; + libraryName_ = libraryName; + } + + private final byte value_; + private final String libraryName_; } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index bb88710ed..3d420adea 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -18,8 +18,6 @@ import org.rocksdb.util.Environment; public class RocksDB extends RocksObject { public static final String DEFAULT_COLUMN_FAMILY = "default"; public static final int NOT_FOUND = -1; - private static final String[] compressionLibs_ = { - "snappy", "z", "bzip2", "lz4", "lz4hc"}; static { RocksDB.loadLibrary(); @@ -35,9 +33,11 @@ public class RocksDB extends RocksObject { public static synchronized void loadLibrary() { String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); // loading possibly necessary libraries. - for (String lib : compressionLibs_) { + for (CompressionType compressionType : CompressionType.values()) { try { - System.loadLibrary(lib); + if (compressionType.getLibraryName() != null) { + System.loadLibrary(compressionType.getLibraryName()); + } } catch (UnsatisfiedLinkError e) { // since it may be optional, we ignore its loading failure here. } @@ -60,10 +60,14 @@ public class RocksDB extends RocksObject { * of a library. */ public static synchronized void loadLibrary(List paths) { - for (String lib : compressionLibs_) { + for (CompressionType compressionType : CompressionType.values()) { + if (compressionType.equals(CompressionType.NO_COMPRESSION)) { + continue; + } for (String path : paths) { try { - System.load(path + "/" + Environment.getSharedLibraryName(lib)); + System.load(path + "/" + Environment.getSharedLibraryName( + compressionType.getLibraryName())); break; } catch (UnsatisfiedLinkError e) { // since they are optional, we ignore loading fails. 
diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/org/rocksdb/benchmark/DbBenchmark.java index 26b295f7b..64fc5f0a7 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/org/rocksdb/benchmark/DbBenchmark.java @@ -163,15 +163,6 @@ public class DbBenchmark { EXISTING } - enum CompressionType { - NONE, - SNAPPY, - ZLIB, - BZIP2, - LZ4, - LZ4HC - } - static { RocksDB.loadLibrary(); } @@ -457,24 +448,16 @@ public class DbBenchmark { // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size)); // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix)); compressionType_ = (String) flags.get(Flag.compression_type); - compression_ = CompressionType.NONE; + compression_ = CompressionType.NO_COMPRESSION; try { - switch (compressionType_) { - case "snappy": - System.loadLibrary("snappy"); - break; - case "zlib": - System.loadLibrary("z"); - break; - case "bzip2": - System.loadLibrary("bzip2"); - break; - case "lz4": - System.loadLibrary("lz4"); - break; - case "lz4hc": - System.loadLibrary("lz4hc"); - break; + if (compressionType_!=null) { + final CompressionType compressionType = + CompressionType.getCompressionType(compressionType_); + if (compressionType != null && + compressionType != CompressionType.NO_COMPRESSION) { + System.loadLibrary(compressionType.getLibraryName()); + } + } } catch (UnsatisfiedLinkError e) { System.err.format("Unable to load %s library:%s%n" + diff --git a/java/org/rocksdb/test/CompressionOptionsTest.java b/java/org/rocksdb/test/CompressionOptionsTest.java new file mode 100644 index 000000000..f8aff9268 --- /dev/null +++ b/java/org/rocksdb/test/CompressionOptionsTest.java @@ -0,0 +1,22 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.Test; +import org.rocksdb.CompressionType; + + +public class CompressionOptionsTest +{ + @Test + public void getCompressionType() { + for (CompressionType compressionType : CompressionType.values()) { + String libraryName = compressionType.getLibraryName(); + compressionType.equals(CompressionType.getCompressionType( + libraryName)); + } + } +} diff --git a/java/org/rocksdb/test/MixedOptionsTest.java b/java/org/rocksdb/test/MixedOptionsTest.java index 0f15e668c..528bea2e3 100644 --- a/java/org/rocksdb/test/MixedOptionsTest.java +++ b/java/org/rocksdb/test/MixedOptionsTest.java @@ -53,6 +53,5 @@ public class MixedOptionsTest { options.optimizeUniversalStyleCompaction(400); options.optimizeForPointLookup(1024); options.prepareForBulkLoad(); - System.out.println("Mixed options test passed"); } } diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/org/rocksdb/test/PlainTableConfigTest.java index a533141ea..72347e7d4 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/org/rocksdb/test/PlainTableConfigTest.java @@ -63,10 +63,6 @@ public class PlainTableConfigTest { public void encodingType() { PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setEncodingType(EncodingType.kPrefix); - assertThat(EncodingType.valueOf("kPrefix")).isEqualTo( - EncodingType.kPrefix); - assertThat(EncodingType.values().length). 
- isEqualTo(2); assertThat(plainTableConfig.encodingType()).isEqualTo( EncodingType.kPrefix); } From a14b7873ee85f12150e3ef544cd535c322c864f9 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Tue, 2 Dec 2014 12:09:20 -0800 Subject: [PATCH 591/829] Enforce write buffer memory limit across column families Summary: Introduces a new class for managing write buffer memory across column families. We supplement ColumnFamilyOptions::write_buffer_size with ColumnFamilyOptions::write_buffer, a shared pointer to a WriteBuffer instance that enforces memory limits before flushing out to disk. Test Plan: Added SharedWriteBuffer unit test to db_test.cc Reviewers: sdong, rven, ljin, igor Reviewed By: igor Subscribers: tnovak, yhchiang, dhruba, xjin, MarkCallaghan, yoshinorim Differential Revision: https://reviews.facebook.net/D22581 --- HISTORY.md | 1 + db/c.cc | 5 ++ db/column_family.cc | 24 ++++++++-- db/column_family.h | 9 +++- db/compaction_job_test.cc | 6 ++- db/db_bench.cc | 4 ++ db/db_impl.cc | 25 ++++++++-- db/db_impl.h | 3 ++ db/db_test.cc | 86 +++++++++++++++++++++++++++++++++-- db/flush_job_test.cc | 10 ++-- db/log_and_apply_bench.cc | 4 +- db/memtable.cc | 12 +++-- db/memtable.h | 14 ++++-- db/memtable_allocator.cc | 52 +++++++++++++++++++++ db/memtable_allocator.h | 47 +++++++++++++++++++ db/repair.cc | 4 +- db/skiplist.h | 24 +++++----- db/version_set.cc | 8 +++- db/version_set.h | 3 +- db/wal_manager_test.cc | 6 ++- db/write_batch_test.cc | 4 +- db/writebuffer.h | 44 ++++++++++++++++++ include/rocksdb/memtablerep.h | 22 +++++---- include/rocksdb/options.h | 15 ++++++ table/bloom_block.h | 7 +-- table/table_test.cc | 31 ++++++++----- tools/db_stress.cc | 5 ++ util/allocator.h | 32 +++++++++++++ util/arena.h | 14 +++--- util/dynamic_bloom.cc | 13 +++--- util/dynamic_bloom.h | 11 +++-- util/dynamic_bloom_test.cc | 1 + util/hash_cuckoo_rep.cc | 22 +++++---- util/hash_cuckoo_rep.h | 2 +- util/hash_linklist_rep.cc | 44 +++++++++--------- util/hash_linklist_rep.h | 2 +- util/hash_skiplist_rep.cc | 29 ++++++------ util/hash_skiplist_rep.h | 2 +- util/ldb_cmd.cc | 19 +++++++- util/ldb_cmd.h | 1 + util/ldb_tool.cc | 2 + util/options.cc | 5 ++ util/options_helper.cc | 2 + util/skiplistrep.cc | 11 +++-- util/vectorrep.cc | 12 +++-- 45 files changed, 551 insertions(+), 148 deletions(-) create mode 100644 db/memtable_allocator.cc create mode 100644 db/memtable_allocator.h create mode 100644 db/writebuffer.h create mode 100644 util/allocator.h diff --git a/HISTORY.md b/HISTORY.md index 93170fa6f..f2b5bf873 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,7 @@ database which is an image of the existing database. *New API LinkFile added to Env. If you implement your own Env class, an implementation of the API LinkFile will have to be provided. 
+* MemTableRep takes MemTableAllocator instead of Arena ## 3.8.0 (11/14/2014) diff --git a/db/c.cc b/db/c.cc index 857f4e654..76a949cd1 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1264,6 +1264,11 @@ void rocksdb_options_set_info_log_level( opt->rep.info_log_level = static_cast(v); } +void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, + size_t s) { + opt->rep.db_write_buffer_size = s; +} + void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } diff --git a/db/column_family.cc b/db/column_family.cc index 7ba5ad763..f07c741a4 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -21,6 +21,9 @@ #include "db/compaction_picker.h" #include "db/db_impl.h" +#include "db/job_context.h" +#include "db/version_set.h" +#include "db/writebuffer.h" #include "db/internal_stats.h" #include "db/job_context.h" #include "db/table_properties_collector.h" @@ -223,6 +226,7 @@ void SuperVersionUnrefHandle(void* ptr) { ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, + WriteBuffer* write_buffer, const ColumnFamilyOptions& cf_options, const DBOptions* db_options, const EnvOptions& env_options, @@ -237,6 +241,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), ioptions_(options_), mutable_cf_options_(options_, ioptions_), + write_buffer_(write_buffer), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -413,13 +418,19 @@ void ColumnFamilyData::SetCurrent(Version* current_version) { current_ = current_version; } -void ColumnFamilyData::CreateNewMemtable( +MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options) { assert(current_ != nullptr); + return new MemTable(internal_comparator_, ioptions_, + mutable_cf_options, write_buffer_); +} + +void ColumnFamilyData::CreateNewMemtable( + const MutableCFOptions& mutable_cf_options) { if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, ioptions_, mutable_cf_options); + SetMemtable(ConstructNewMemtable(mutable_cf_options)); mem_->Ref(); } @@ -600,9 +611,10 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, + WriteBuffer* write_buffer, WriteController* write_controller) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, + dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), db_options, env_options, nullptr)), default_cfd_cache_(nullptr), @@ -610,6 +622,7 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, db_options_(db_options), env_options_(env_options), table_cache_(table_cache), + write_buffer_(write_buffer), write_controller_(write_controller), spin_lock_(ATOMIC_FLAG_INIT) { // initialize linked list @@ -674,8 +687,9 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = - new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, env_options_, this); + new ColumnFamilyData(id, name, dummy_versions, table_cache_, + write_buffer_, options, db_options_, + env_options_, this); Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); diff --git 
a/db/column_family.h b/db/column_family.h index c6d49e71b..51ccd99ac 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -201,8 +201,9 @@ class ColumnFamilyData { MemTable* mem() { return mem_; } Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } - void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); + MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options); + void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void CreateNewMemtable(const MutableCFOptions& mutable_cf_options); TableCache* table_cache() const { return table_cache_.get(); } @@ -264,6 +265,7 @@ class ColumnFamilyData { friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, + WriteBuffer* write_buffer, const ColumnFamilyOptions& options, const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set); @@ -294,6 +296,8 @@ class ColumnFamilyData { std::unique_ptr internal_stats_; + WriteBuffer* write_buffer_; + MemTable* mem_; MemTableList imm_; SuperVersion* super_version_; @@ -366,7 +370,7 @@ class ColumnFamilySet { ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, - WriteController* write_controller); + WriteBuffer* write_buffer, WriteController* write_controller); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -421,6 +425,7 @@ class ColumnFamilySet { const DBOptions* const db_options_; const EnvOptions env_options_; Cache* table_cache_; + WriteBuffer* write_buffer_; WriteController* write_controller_; std::atomic_flag spin_lock_; }; diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 712471657..1db802813 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -9,6 +9,7 @@ #include "db/compaction_job.h" #include "db/column_family.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/options.h" #include "rocksdb/db.h" @@ -26,8 +27,10 @@ class CompactionJobTest { dbname_(test::TmpDir() + "/compaction_job_test"), mutable_cf_options_(Options(), ImmutableCFOptions(Options())), table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_controller_)), + table_cache_.get(), &write_buffer_, + &write_controller_)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); @@ -125,6 +128,7 @@ class CompactionJobTest { WriteController write_controller_; DBOptions db_options_; ColumnFamilyOptions cf_options_; + WriteBuffer write_buffer_; std::unique_ptr versions_; port::Mutex mutex_; std::atomic shutting_down_; diff --git a/db/db_bench.cc b/db/db_bench.cc index 6e5b63f24..c7fd0365c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -198,6 +198,9 @@ DEFINE_bool(enable_numa, false, "CPU and memory of same node. 
Use \"$numactl --hardware\" command " "to see NUMA memory architecture."); +DEFINE_int64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size, + "Number of bytes to buffer in all memtables before compacting"); + DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); @@ -1834,6 +1837,7 @@ class Benchmark { Options options; options.create_if_missing = !FLAGS_use_existing_db; options.create_missing_column_families = FLAGS_num_column_families > 1; + options.db_write_buffer_size = FLAGS_db_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = diff --git a/db/db_impl.cc b/db/db_impl.cc index 99a386e76..bdc0030ae 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -44,6 +44,7 @@ #include "db/forward_iterator.h" #include "db/transaction_log_impl.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -201,6 +202,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) default_cf_handle_(nullptr), total_log_size_(0), max_total_in_memory_state_(0), + write_buffer_(options.db_write_buffer_size), tmp_batch_(), bg_schedule_needed_(false), bg_compaction_scheduled_(0), @@ -231,7 +233,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) db_options_.table_cache_remove_scan_count_limit); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_controller_)); + table_cache_.get(), &write_buffer_, + &write_controller_)); column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( versions_->GetColumnFamilySet(), &flush_scheduler_)); @@ -2823,6 +2826,23 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } } MaybeScheduleFlushOrCompaction(); + } else if (UNLIKELY(write_buffer_.ShouldFlush())) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Flushing all column families. 
Write buffer is using %" PRIu64 + " bytes out of a total of %" PRIu64 ".", + write_buffer_.memory_usage(), write_buffer_.buffer_size()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->mem()->IsEmpty()) { + status = SetNewMemtableAndNewLogFile(cfd, &context); + if (!status.ok()) { + break; + } + cfd->imm()->FlushRequested(); + } + } + MaybeScheduleFlushOrCompaction(); } if (UNLIKELY(status.ok() && !bg_error_.ok())) { @@ -3030,8 +3050,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(), - mutable_cf_options); + new_mem = cfd->ConstructNewMemtable(mutable_cf_options); new_superversion = new SuperVersion(); } } diff --git a/db/db_impl.h b/db/db_impl.h index 1217610b5..c2c3969c1 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -24,6 +24,7 @@ #include "db/column_family.h" #include "db/version_edit.h" #include "db/wal_manager.h" +#include "db/writebuffer.h" #include "memtable_list.h" #include "port/port.h" #include "rocksdb/db.h" @@ -436,6 +437,8 @@ class DBImpl : public DB { std::unique_ptr db_directory_; + WriteBuffer write_buffer_; + WriteThread write_thread_; WriteBatch tmp_batch_; diff --git a/db/db_test.cc b/db/db_test.cc index de7132e58..ccc7597a2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3445,7 +3445,7 @@ class ChangeFilterFactory : public CompactionFilterFactory { // TODO(kailiu) The tests on UniversalCompaction has some issues: // 1. A lot of magic numbers ("11" or "12"). -// 2. Made assumption on the memtable flush conidtions, which may change from +// 2. Made assumption on the memtable flush conditions, which may change from // time to time. TEST(DBTest, UniversalCompactionTrigger) { Options options; @@ -3521,7 +3521,7 @@ TEST(DBTest, UniversalCompactionTrigger) { } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. - // After comapction, we should have 2 files, with size 4, 2.4. + // After compaction, we should have 2 files, with size 4, 2.4. ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2); for (int i = 1; i < options.num_levels ; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -3549,7 +3549,7 @@ TEST(DBTest, UniversalCompactionTrigger) { } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. - // After comapction, we should have 3 files, with size 4, 2.4, 2. + // After compaction, we should have 3 files, with size 4, 2.4, 2. 
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); for (int i = 1; i < options.num_levels ; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -6802,6 +6802,86 @@ TEST(DBTest, RecoverCheckFileAmount) { } } +TEST(DBTest, SharedWriteBuffer) { + Options options; + options.db_write_buffer_size = 100000; // this is the real limit + options.write_buffer_size = 500000; // this is never hit + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Trigger a flush on every CF + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(1), DummyString(90000))); + ASSERT_OK(Put(2, Key(2), DummyString(20000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + + // Flush 'dobrynia' and 'nikitich' + ASSERT_OK(Put(2, Key(2), DummyString(50000))); + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(3), DummyString(20000))); + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + // Make 'dobrynia' and 'nikitich' both take up 40% of space + // When 'pikachu' puts us over 100%, all 3 flush. + ASSERT_OK(Put(2, Key(2), DummyString(40000))); + ASSERT_OK(Put(1, Key(2), DummyString(20000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + } + + // Some remaining writes so 'default' and 'nikitich' flush on closure. 
+ ASSERT_OK(Put(3, Key(1), DummyString(1))); + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(4)); + } +} + TEST(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index aee3fd1a8..7d779b58f 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -9,6 +9,7 @@ #include "db/flush_job.h" #include "db/column_family.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "util/testharness.h" #include "util/testutil.h" @@ -25,8 +26,10 @@ class FlushJobTest { : env_(Env::Default()), dbname_(test::TmpDir() + "/flush_job_test"), table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_controller_)), + table_cache_.get(), &write_buffer_, + &write_controller_)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { ASSERT_OK(env_->CreateDirIfMissing(dbname_)); @@ -69,6 +72,7 @@ class FlushJobTest { std::shared_ptr table_cache_; WriteController write_controller_; DBOptions db_options_; + WriteBuffer write_buffer_; ColumnFamilyOptions cf_options_; std::unique_ptr versions_; port::Mutex mutex_; @@ -91,9 +95,7 @@ TEST(FlushJobTest, Empty) { TEST(FlushJobTest, NonEmpty) { JobContext job_context; auto cfd = versions_->GetColumnFamilySet()->GetDefault(); - - auto new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions()); + auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions()); new_mem->Ref(); std::map inserted_keys; for (int i = 1; i < 10000; ++i) { diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index 417a2a8d7..b55ec0539 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -15,6 +15,7 @@ #include "util/benchharness.h" #include "db/version_set.h" #include "db/write_controller.h" +#include "db/writebuffer.h" #include "util/mutexlock.h" namespace rocksdb { @@ -52,9 +53,10 @@ void BM_LogAndApply(int iters, int num_base_files) { // Notice we are using the default options not through SanitizeOptions(). // We might want to initialize some options manually if needed. options.db_paths.emplace_back(dbname, 0); + WriteBuffer wb(options.db_write_buffer_size); // The parameter of table cache is passed in as null, so any file I/O // operation is likely to fail. 
- vset = new VersionSet(dbname, &options, sopt, nullptr, &wc); + vset = new VersionSet(dbname, &options, sopt, nullptr, &wb, &wc); std::vector dummy; dummy.push_back(ColumnFamilyDescriptor()); ASSERT_OK(vset->Recover(dummy)); diff --git a/db/memtable.cc b/db/memtable.cc index 98212a61b..6dcacc421 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -15,6 +15,7 @@ #include "db/dbformat.h" #include "db/merge_context.h" +#include "db/writebuffer.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -52,14 +53,17 @@ MemTableOptions::MemTableOptions( MemTable::MemTable(const InternalKeyComparator& cmp, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options) + const MutableCFOptions& mutable_cf_options, + WriteBuffer* write_buffer) : comparator_(cmp), moptions_(ioptions, mutable_cf_options), refs_(0), kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), arena_(moptions_.arena_block_size), + allocator_(&arena_, write_buffer), table_(ioptions.memtable_factory->CreateMemTableRep( - comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)), + comparator_, &allocator_, ioptions.prefix_extractor, + ioptions.info_log)), num_entries_(0), flush_in_progress_(false), flush_completed_(false), @@ -76,7 +80,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, assert(!should_flush_); if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( - &arena_, + &allocator_, moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality, moptions_.memtable_prefix_bloom_probes, nullptr, moptions_.memtable_prefix_bloom_huge_page_tlb_size, @@ -179,7 +183,7 @@ Slice MemTableRep::UserKey(const char* key) const { } KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { - *buf = arena_->Allocate(len); + *buf = allocator_->Allocate(len); return static_cast(*buf); } diff --git a/db/memtable.h b/db/memtable.h index 96af1e90a..0c1f0de1a 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,16 +19,17 @@ #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" #include "rocksdb/immutable_options.h" +#include "db/memtable_allocator.h" #include "util/arena.h" #include "util/dynamic_bloom.h" #include "util/mutable_cf_options.h" namespace rocksdb { -class Arena; class Mutex; class MemTableIterator; class MergeContext; +class WriteBuffer; struct MemTableOptions { explicit MemTableOptions( @@ -67,7 +68,8 @@ class MemTable { // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); + const MutableCFOptions& mutable_cf_options, + WriteBuffer* write_buffer); ~MemTable(); @@ -183,7 +185,10 @@ class MemTable { void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } // Notify the underlying storage that no more items will be added - void MarkImmutable() { table_->MarkReadOnly(); } + void MarkImmutable() { + table_->MarkReadOnly(); + allocator_.DoneAllocating(); + } // return true if the current MemTableRep supports merge operator. 
bool IsMergeOperatorSupported() const { @@ -200,8 +205,6 @@ class MemTable { return comparator_.comparator; } - const Arena& TEST_GetArena() const { return arena_; } - const MemTableOptions* GetMemTableOptions() const { return &moptions_; } private: @@ -217,6 +220,7 @@ class MemTable { int refs_; const size_t kArenaBlockSize; Arena arena_; + MemTableAllocator allocator_; unique_ptr table_; uint64_t num_entries_; diff --git a/db/memtable_allocator.cc b/db/memtable_allocator.cc new file mode 100644 index 000000000..d3ecea2fd --- /dev/null +++ b/db/memtable_allocator.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include "db/memtable_allocator.h" +#include "db/writebuffer.h" +#include "util/arena.h" + +namespace rocksdb { + +MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer) + : arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) { +} + +MemTableAllocator::~MemTableAllocator() { + DoneAllocating(); +} + +char* MemTableAllocator::Allocate(size_t bytes) { + assert(write_buffer_ != nullptr); + bytes_allocated_ += bytes; + write_buffer_->ReserveMem(bytes); + return arena_->Allocate(bytes); +} + +char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size, + Logger* logger) { + assert(write_buffer_ != nullptr); + bytes_allocated_ += bytes; + write_buffer_->ReserveMem(bytes); + return arena_->AllocateAligned(bytes, huge_page_size, logger); +} + +void MemTableAllocator::DoneAllocating() { + if (write_buffer_ != nullptr) { + write_buffer_->FreeMem(bytes_allocated_); + write_buffer_ = nullptr; + } +} + +size_t MemTableAllocator::BlockSize() const { + return arena_->BlockSize(); +} + +} // namespace rocksdb diff --git a/db/memtable_allocator.h b/db/memtable_allocator.h new file mode 100644 index 000000000..fa8ee1287 --- /dev/null +++ b/db/memtable_allocator.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This is used by the MemTable to allocate write buffer memory. It connects +// to WriteBuffer so we can track and enforce overall write buffer limits. 
+ +#pragma once +#include "util/allocator.h" + +namespace rocksdb { + +class Arena; +class Logger; +class WriteBuffer; + +class MemTableAllocator : public Allocator { + public: + explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer); + ~MemTableAllocator(); + + // Allocator interface + char* Allocate(size_t bytes) override; + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override; + size_t BlockSize() const override; + + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + private: + Arena* arena_; + WriteBuffer* write_buffer_; + size_t bytes_allocated_; + + // No copying allowed + MemTableAllocator(const MemTableAllocator&); + void operator=(const MemTableAllocator&); +}; + +} // namespace rocksdb diff --git a/db/repair.cc b/db/repair.cc index 8fa312638..3b5952dd0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -45,6 +45,7 @@ #include "db/memtable.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -220,8 +221,9 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; + WriteBuffer wb(options_.db_write_buffer_size); MemTable* mem = new MemTable(icmp_, ioptions_, - MutableCFOptions(options_, ioptions_)); + MutableCFOptions(options_, ioptions_), &wb); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem); mem->Ref(); int counter = 0; diff --git a/db/skiplist.h b/db/skiplist.h index 4ee4ed714..c1e375007 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -34,9 +34,8 @@ #include #include #include -#include "util/arena.h" #include "port/port.h" -#include "util/arena.h" +#include "util/allocator.h" #include "util/random.h" namespace rocksdb { @@ -48,9 +47,9 @@ class SkipList { public: // Create a new SkipList object that will use "cmp" for comparing keys, - // and will allocate memory using "*arena". Objects allocated in the arena - // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena, + // and will allocate memory using "*allocator". Objects allocated in the + // allocator must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); // Insert key into the list. 
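The MemTableAllocator added above sits between a memtable's Arena and the shared WriteBuffer: every Allocate/AllocateAligned call charges the WriteBuffer through ReserveMem(), and DoneAllocating() hands the reservation back once the memtable is marked immutable. The following is a minimal sketch of how these pieces compose; it is not part of the patch and assumes it is compiled inside the RocksDB tree against the headers introduced here.

#include <cstdio>

#include "db/memtable_allocator.h"
#include "db/writebuffer.h"
#include "util/arena.h"

int main() {
  // Shared budget for all memtables; 1 MB in this sketch.
  rocksdb::WriteBuffer write_buffer(1 << 20);

  // Each memtable keeps its own Arena but charges the shared WriteBuffer
  // through a MemTableAllocator.
  rocksdb::Arena arena;
  rocksdb::MemTableAllocator allocator(&arena, &write_buffer);

  char* buf = allocator.Allocate(512 * 1024);  // reserved in write_buffer
  (void)buf;

  std::printf("used %zu of %zu bytes, should flush: %d\n",
              write_buffer.memory_usage(), write_buffer.buffer_size(),
              static_cast<int>(write_buffer.ShouldFlush()));

  // Marking the memtable immutable (or destroying the allocator) releases
  // the reservation back to the shared budget.
  allocator.DoneAllocating();
  return 0;
}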
@@ -110,7 +109,7 @@ class SkipList { // Immutable after construction Comparator const compare_; - Arena* const arena_; // Arena used for allocations of nodes + Allocator* const allocator_; // Allocator used for allocations of nodes Node* const head_; @@ -196,7 +195,7 @@ struct SkipList::Node { template typename SkipList::Node* SkipList::NewNode(const Key& key, int height) { - char* mem = arena_->AllocateAligned( + char* mem = allocator_->AllocateAligned( sizeof(Node) + sizeof(std::atomic) * (height - 1)); return new (mem) Node(key); } @@ -356,23 +355,24 @@ typename SkipList::Node* SkipList::FindLast() } template -SkipList::SkipList(const Comparator cmp, Arena* arena, +SkipList::SkipList(const Comparator cmp, Allocator* allocator, int32_t max_height, int32_t branching_factor) : kMaxHeight_(max_height), kBranching_(branching_factor), compare_(cmp), - arena_(arena), + allocator_(allocator), head_(NewNode(0 /* any key will do */, max_height)), max_height_(1), prev_height_(1), rnd_(0xdeadbeef) { assert(kMaxHeight_ > 0); assert(kBranching_ > 0); - // Allocate the prev_ Node* array, directly from the passed-in arena. + // Allocate the prev_ Node* array, directly from the passed-in allocator. // prev_ does not need to be freed, as its life cycle is tied up with - // the arena as a whole. - prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + // the allocator as a whole. + prev_ = reinterpret_cast( + allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; diff --git a/db/version_set.cc b/db/version_set.cc index f71ffce95..f138c8232 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -31,6 +31,7 @@ #include "db/table_cache.h" #include "db/compaction.h" #include "db/version_builder.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "table/table_reader.h" @@ -1490,9 +1491,11 @@ struct VersionSet::ManifestWriter { VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& storage_options, Cache* table_cache, + WriteBuffer* write_buffer, WriteController* write_controller) : column_family_set_(new ColumnFamilySet( - dbname, db_options, storage_options, table_cache, write_controller)), + dbname, db_options, storage_options, table_cache, + write_buffer, write_controller)), env_(db_options->env), dbname_(dbname), db_options_(db_options), @@ -2215,7 +2218,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); WriteController wc; - VersionSet versions(dbname, options, env_options, tc.get(), &wc); + WriteBuffer wb(options->db_write_buffer_size); + VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc); Status status; std::vector dummy; diff --git a/db/version_set.h b/db/version_set.h index 04ad37773..6e645680b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -50,6 +50,7 @@ class LookupKey; class MemTable; class Version; class VersionSet; +class WriteBuffer; class MergeContext; class ColumnFamilyData; class ColumnFamilySet; @@ -475,7 +476,7 @@ class VersionSet { public: VersionSet(const std::string& dbname, const DBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, - WriteController* write_controller); + WriteBuffer* write_buffer, WriteController* write_controller); ~VersionSet(); // Apply *edit to the current version to form a new 
descriptor that diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 5c12586c8..bc12012ba 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -13,6 +13,7 @@ #include "db/log_writer.h" #include "db/column_family.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "util/testharness.h" #include "util/testutil.h" #include "table/mock_table.h" @@ -28,6 +29,7 @@ class WalManagerTest { : env_(Env::Default()), dbname_(test::TmpDir() + "/wal_manager_test"), table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), current_log_number_(0) { DestroyDB(dbname_, Options()); } @@ -40,7 +42,8 @@ class WalManagerTest { db_options_.wal_dir = dbname_; versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, - table_cache_.get(), &write_controller_)); + table_cache_.get(), &write_buffer_, + &write_controller_)); wal_manager_.reset(new WalManager(db_options_, env_options_)); } @@ -93,6 +96,7 @@ class WalManagerTest { EnvOptions env_options_; std::shared_ptr table_cache_; DBOptions db_options_; + WriteBuffer write_buffer_; std::unique_ptr versions_; std::unique_ptr wal_manager_; diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index dbf65b6e9..e28d02aef 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -13,6 +13,7 @@ #include "db/memtable.h" #include "db/column_family.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" @@ -28,8 +29,9 @@ static std::string PrintContents(WriteBatch* b) { Options options; options.memtable_factory = factory; ImmutableCFOptions ioptions(options); + WriteBuffer wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, - MutableCFOptions(options, ioptions)); + MutableCFOptions(options, ioptions), &wb); mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem); diff --git a/db/writebuffer.h b/db/writebuffer.h new file mode 100644 index 000000000..7047a9244 --- /dev/null +++ b/db/writebuffer.h @@ -0,0 +1,44 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBuffer is for managing memory allocation for one or more MemTables. 
+ +#pragma once + +namespace rocksdb { + +class WriteBuffer { + public: + explicit WriteBuffer(size_t _buffer_size) + : buffer_size_(_buffer_size), memory_used_(0) {} + + ~WriteBuffer() {} + + size_t memory_usage() const { return memory_used_; } + size_t buffer_size() const { return buffer_size_; } + + // Should only be called from write thread + bool ShouldFlush() const { + return buffer_size() > 0 && memory_usage() >= buffer_size(); + } + + // Should only be called from write thread + void ReserveMem(size_t mem) { memory_used_ += mem; } + void FreeMem(size_t mem) { memory_used_ -= mem; } + + private: + const size_t buffer_size_; + size_t memory_used_; + + // No copying allowed + WriteBuffer(const WriteBuffer&); + void operator=(const WriteBuffer&); +}; + +} // namespace rocksdb diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 8c2d7201b..97141cc73 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -14,8 +14,8 @@ // (4) Items are never deleted. // The liberal use of assertions is encouraged to enforce (1). // -// The factory will be passed an Arena object when a new MemTableRep is -// requested. The API for this object is in rocksdb/arena.h. +// The factory will be passed an MemTableAllocator object when a new MemTableRep +// is requested. // // Users can implement their own memtable representations. We include three // types built in: @@ -41,6 +41,7 @@ namespace rocksdb { class Arena; +class MemTableAllocator; class LookupKey; class Slice; class SliceTransform; @@ -65,7 +66,7 @@ class MemTableRep { virtual ~KeyComparator() { } }; - explicit MemTableRep(Arena* arena) : arena_(arena) {} + explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {} // Allocate a buf of len size for storing key. The idea is that a specific // memtable representation knows its underlying data structure better. By @@ -101,7 +102,7 @@ class MemTableRep { bool (*callback_func)(void* arg, const char* entry)); // Report an approximation of how much memory has been used other than memory - // that was allocated through the arena. + // that was allocated through the allocator. virtual size_t ApproximateMemoryUsage() = 0; virtual ~MemTableRep() { } @@ -150,7 +151,7 @@ class MemTableRep { // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. - // arena: If not null, the arena needs to be used to allocate the Iterator. + // arena: If not null, the arena is used to allocate the Iterator. // When destroying the iterator, the caller will not call "delete" // but Iterator::~Iterator() directly. The destructor needs to destroy // all the states but those allocated in arena. @@ -171,7 +172,7 @@ class MemTableRep { // user key. 
virtual Slice UserKey(const char* key) const; - Arena* arena_; + MemTableAllocator* allocator_; }; // This is the base class for all factories that are used by RocksDB to create @@ -180,7 +181,8 @@ class MemTableRepFactory { public: virtual ~MemTableRepFactory() {} virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) = 0; virtual const char* Name() const = 0; }; @@ -197,7 +199,8 @@ class SkipListFactory : public MemTableRepFactory { explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } @@ -220,7 +223,8 @@ class VectorRepFactory : public MemTableRepFactory { public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "VectorRepFactory"; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 91c6604ae..09b72ca6b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -205,6 +205,9 @@ struct ColumnFamilyOptions { // Also, a larger write buffer will result in a longer recovery time // the next time the database is opened. // + // Note that write_buffer_size is enforced per column family. + // See db_write_buffer_size for sharing memory across column families. + // // Default: 4MB // // Dynamically changeable through SetOptions() API @@ -859,6 +862,18 @@ struct DBOptions { // Default: true bool advise_random_on_open; + // Amount of data to build up in memtables across all column + // families before writing to disk. + // + // This is distinct from write_buffer_size, which enforces a limit + // for a single memtable. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: 0 (disabled) + size_t db_write_buffer_size; + // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. 
// Default: NORMAL diff --git a/table/bloom_block.h b/table/bloom_block.h index 7ef5d14b6..5b60d2bca 100644 --- a/table/bloom_block.h +++ b/table/bloom_block.h @@ -18,9 +18,10 @@ class BloomBlockBuilder { explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes, nullptr) {} - void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality, - size_t huge_page_tlb_size, Logger* logger) { - bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size, + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger); } diff --git a/table/table_test.cc b/table/table_test.cc index 9e5f8a49e..3d603bf31 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -20,6 +20,7 @@ #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -427,15 +428,15 @@ uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: - explicit MemTableConstructor(const Comparator* cmp) + explicit MemTableConstructor(const Comparator* cmp, WriteBuffer* wb) : Constructor(cmp), internal_comparator_(cmp), + write_buffer_(wb), table_factory_(new SkipListFactory) { - Options options; - options.memtable_factory = table_factory_; - ImmutableCFOptions ioptions(options); + options_.memtable_factory = table_factory_; + ImmutableCFOptions ioptions(options_); memtable_ = new MemTable(internal_comparator_, ioptions, - MutableCFOptions(options, ioptions)); + MutableCFOptions(options_, ioptions), wb); memtable_->Ref(); } ~MemTableConstructor() { @@ -446,11 +447,10 @@ class MemTableConstructor: public Constructor { const InternalKeyComparator& internal_comparator, const KVMap& kv_map) { delete memtable_->Unref(); - Options options; - options.memtable_factory = table_factory_; - ImmutableCFOptions mem_ioptions(options); + ImmutableCFOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, - MutableCFOptions(options, mem_ioptions)); + MutableCFOptions(options_, mem_ioptions), + write_buffer_); memtable_->Ref(); int seq = 1; for (const auto kv : kv_map) { @@ -471,6 +471,8 @@ class MemTableConstructor: public Constructor { private: mutable Arena arena_; InternalKeyComparator internal_comparator_; + Options options_; + WriteBuffer* write_buffer_; MemTable* memtable_; std::shared_ptr table_factory_; }; @@ -696,7 +698,9 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : ioptions_(options_), constructor_(nullptr) {} + Harness() + : ioptions_(options_), constructor_(nullptr), + write_buffer_(options_.db_write_buffer_size) {} void Init(const TestArgs& args) { delete constructor_; @@ -773,7 +777,8 @@ class Harness { table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new MemTableConstructor(options_.comparator); + constructor_ = new MemTableConstructor(options_.comparator, + &write_buffer_); break; case DB_TEST: table_options_.block_size = 256; @@ -981,6 +986,7 @@ class Harness { ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; + WriteBuffer write_buffer_; bool support_prev_; bool only_support_prefix_seek_; shared_ptr internal_comparator_; @@ -1870,8 +1876,9 @@ TEST(MemTableTest, Simple) { 
Options options; options.memtable_factory = table_factory; ImmutableCFOptions ioptions(options); + WriteBuffer wb(options.db_write_buffer_size); MemTable* memtable = new MemTable(cmp, ioptions, - MutableCFOptions(options, ioptions)); + MutableCFOptions(options, ioptions), &wb); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 9aad6efb9..c63d82413 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -114,6 +114,9 @@ DEFINE_bool(verbose, false, "Verbose"); DEFINE_bool(progress_reports, true, "If true, db_stress will report number of finished operations"); +DEFINE_uint64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size, + "Number of bytes to buffer in all memtables before compacting"); + DEFINE_int32(write_buffer_size, static_cast(rocksdb::Options().write_buffer_size), "Number of bytes to buffer in memtable before compacting"); @@ -1682,6 +1685,7 @@ class StressTest { fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "DB-write-buffer-size: %lu\n", FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); fprintf(stdout, "Iterations : %lu\n", @@ -1753,6 +1757,7 @@ class StressTest { block_based_options.filter_policy = filter_policy_; options_.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); + options_.db_write_buffer_size = FLAGS_db_write_buffer_size; options_.write_buffer_size = FLAGS_write_buffer_size; options_.max_write_buffer_number = FLAGS_max_write_buffer_number; options_.min_write_buffer_number_to_merge = diff --git a/util/allocator.h b/util/allocator.h new file mode 100644 index 000000000..58bf0da31 --- /dev/null +++ b/util/allocator.h @@ -0,0 +1,32 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Abstract interface for allocating memory in blocks. This memory is freed +// when the allocator object is destroyed. See the Arena class for more info. + +#pragma once +#include +#include + +namespace rocksdb { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +} // namespace rocksdb diff --git a/util/arena.h b/util/arena.h index 4764c1568..644a12947 100644 --- a/util/arena.h +++ b/util/arena.h @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Arena is an implementation of Arena class. For a request of small size, +// Arena is an implementation of Allocator class. For a request of small size, // it allocates a block with pre-defined block size. For a request of big // size, it uses malloc to directly get the requested size. 
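With util/allocator.h in place, Arena becomes just one implementation of the Allocator interface, so components such as SkipList, DynamicBloom and the memtable reps can be written against Allocator* and work equally with a plain Arena or a MemTableAllocator. The class below is a hypothetical forwarding allocator, shown only to illustrate the interface; it is not part of the patch.

#include "util/allocator.h"
#include "util/arena.h"

namespace rocksdb {

// Hypothetical Allocator that forwards to an Arena while tracking how many
// bytes have been requested through it.
class CountingAllocator : public Allocator {
 public:
  explicit CountingAllocator(Arena* arena) : arena_(arena), bytes_(0) {}

  virtual char* Allocate(size_t bytes) override {
    bytes_ += bytes;
    return arena_->Allocate(bytes);
  }

  virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0,
                                Logger* logger = nullptr) override {
    bytes_ += bytes;
    return arena_->AllocateAligned(bytes, huge_page_size, logger);
  }

  virtual size_t BlockSize() const override { return arena_->BlockSize(); }

  size_t bytes_requested() const { return bytes_; }

 private:
  Arena* arena_;
  size_t bytes_;
};

}  // namespace rocksdb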
@@ -17,15 +17,13 @@ #include #include #include -#include "util/arena.h" +#include "util/allocator.h" namespace rocksdb { -class Logger; - const size_t kInlineSize = 2048; -class Arena { +class Arena : public Allocator { public: // No copying allowed Arena(const Arena&) = delete; @@ -41,7 +39,7 @@ class Arena { explicit Arena(size_t block_size = kMinBlockSize, size_t huge_page_size = 0); ~Arena(); - char* Allocate(size_t bytes); + char* Allocate(size_t bytes) override; // huge_page_size: if >0, will try to allocate from huage page TLB. // The argument will be the size of the page size for huge page TLB. Bytes @@ -56,7 +54,7 @@ class Arena { // huge_page_tlb_size > 0, we highly recommend a logger is passed in. // Otherwise, the error message will be printed out to stderr directly. char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, - Logger* logger = nullptr); + Logger* logger = nullptr) override; // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future @@ -74,7 +72,7 @@ class Arena { // same size of that allocation. size_t IrregularBlockNum() const { return irregular_block_num; } - size_t BlockSize() const { return kBlockSize; } + size_t BlockSize() const override { return kBlockSize; } private: char inline_block_[kInlineSize]; diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 73c2c9436..ffe8157cc 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -9,6 +9,7 @@ #include "port/port.h" #include "rocksdb/slice.h" +#include "util/allocator.h" #include "util/hash.h" namespace rocksdb { @@ -29,13 +30,13 @@ uint32_t GetTotalBitsForLocality(uint32_t total_bits) { } } -DynamicBloom::DynamicBloom(Arena* arena, uint32_t total_bits, uint32_t locality, - uint32_t num_probes, +DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits, + uint32_t locality, uint32_t num_probes, uint32_t (*hash_func)(const Slice& key), size_t huge_page_tlb_size, Logger* logger) : DynamicBloom(num_probes, hash_func) { - SetTotalBits(arena, total_bits, locality, huge_page_tlb_size, logger); + SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger); } DynamicBloom::DynamicBloom(uint32_t num_probes, @@ -52,7 +53,7 @@ void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits, kNumBlocks = num_blocks; } -void DynamicBloom::SetTotalBits(Arena* arena, +void DynamicBloom::SetTotalBits(Allocator* allocator, uint32_t total_bits, uint32_t locality, size_t huge_page_tlb_size, Logger* logger) { @@ -67,9 +68,9 @@ void DynamicBloom::SetTotalBits(Arena* arena, if (kNumBlocks > 0) { sz += CACHE_LINE_SIZE - 1; } - assert(arena); + assert(allocator); raw_ = reinterpret_cast( - arena->AllocateAligned(sz, huge_page_tlb_size, logger)); + allocator->AllocateAligned(sz, huge_page_tlb_size, logger)); memset(raw_, 0, sz); if (kNumBlocks > 0 && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { data_ = raw_ + CACHE_LINE_SIZE - diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index b3b402c4f..a6e4d7367 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -9,7 +9,6 @@ #include "rocksdb/slice.h" -#include "util/arena.h" #include "port/port_posix.h" #include @@ -18,11 +17,12 @@ namespace rocksdb { class Slice; +class Allocator; class Logger; class DynamicBloom { public: - // arena: pass arena to bloom filter, hence trace the usage of memory + // allocator: pass allocator to bloom filter, hence trace the usage of memory // total_bits: fixed total bits for the bloom // 
num_probes: number of hash probes for a single key // locality: If positive, optimize for cache line locality, 0 otherwise. @@ -32,7 +32,7 @@ class DynamicBloom { // it to be allocated, like: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt - explicit DynamicBloom(Arena* arena, + explicit DynamicBloom(Allocator* allocator, uint32_t total_bits, uint32_t locality = 0, uint32_t num_probes = 6, uint32_t (*hash_func)(const Slice& key) = nullptr, @@ -42,8 +42,9 @@ class DynamicBloom { explicit DynamicBloom(uint32_t num_probes = 6, uint32_t (*hash_func)(const Slice& key) = nullptr); - void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality, - size_t huge_page_tlb_size, Logger* logger); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); ~DynamicBloom() {} diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index a3d6e0fc7..a8b1c529b 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -21,6 +21,7 @@ int main() { #include "dynamic_bloom.h" #include "port/port.h" +#include "util/arena.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index 6a67fab44..3ac5ba746 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -52,25 +52,26 @@ struct CuckooStep { class HashCuckooRep : public MemTableRep { public: explicit HashCuckooRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const size_t bucket_count, + MemTableAllocator* allocator, + const size_t bucket_count, const unsigned int hash_func_count) - : MemTableRep(arena), + : MemTableRep(allocator), compare_(compare), - arena_(arena), + allocator_(allocator), bucket_count_(bucket_count), cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth), occupied_count_(0), hash_function_count_(hash_func_count), backup_table_(nullptr) { char* mem = reinterpret_cast( - arena_->Allocate(sizeof(std::atomic) * bucket_count_)); + allocator_->Allocate(sizeof(std::atomic) * bucket_count_)); cuckoo_array_ = new (mem) std::atomic[bucket_count_]; for (unsigned int bid = 0; bid < bucket_count_; ++bid) { cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed); } cuckoo_path_ = reinterpret_cast( - arena_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); + allocator_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); is_nearly_full_ = false; } @@ -181,8 +182,8 @@ class HashCuckooRep : public MemTableRep { private: const MemTableRep::KeyComparator& compare_; - // the pointer to Arena to allocate memory, immutable after construction. - Arena* const arena_; + // the pointer to Allocator to allocate memory, immutable after construction. + MemTableAllocator* const allocator_; // the number of hash bucket in the hash table. const size_t bucket_count_; // the maxinum depth of the cuckoo path. 
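The same signature change ripples out to any MemTableRepFactory implemented outside the tree: CreateMemTableRep now receives a MemTableAllocator* rather than an Arena*. A hedged sketch of an adapted factory (the factory name is made up, and it simply delegates to the built-in SkipListFactory for brevity):

#include "rocksdb/memtablerep.h"

// Hypothetical out-of-tree factory updated for the new signature: the rep is
// created with a MemTableAllocator* instead of an Arena*, so its memory goes
// through the memtable allocator. Delegates to the built-in skip-list rep.
class MyRepFactory : public rocksdb::MemTableRepFactory {
 public:
  virtual rocksdb::MemTableRep* CreateMemTableRep(
      const rocksdb::MemTableRep::KeyComparator& compare,
      rocksdb::MemTableAllocator* allocator,
      const rocksdb::SliceTransform* transform,
      rocksdb::Logger* logger) override {
    return fallback_.CreateMemTableRep(compare, allocator, transform, logger);
  }

  virtual const char* Name() const override { return "MyRepFactory"; }

 private:
  rocksdb::SkipListFactory fallback_;
};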
@@ -321,7 +322,7 @@ void HashCuckooRep::Insert(KeyHandle handle) { if (backup_table_.get() == nullptr) { VectorRepFactory factory(10); backup_table_.reset( - factory.CreateMemTableRep(compare_, arena_, nullptr, nullptr)); + factory.CreateMemTableRep(compare_, allocator_, nullptr, nullptr)); is_nearly_full_ = true; } backup_table_->Insert(key); @@ -601,7 +602,7 @@ void HashCuckooRep::Iterator::SeekToLast() { } // anom namespace MemTableRep* HashCuckooRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { // The estimated average fullness. The write performance of any close hash // degrades as the fullness of the mem-table increases. Setting kFullness @@ -620,7 +621,8 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep( if (hash_function_count > kMaxHashCount) { hash_function_count = kMaxHashCount; } - return new HashCuckooRep(compare, arena, bucket_count, hash_function_count); + return new HashCuckooRep(compare, allocator, bucket_count, + hash_function_count); } MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size, diff --git a/util/hash_cuckoo_rep.h b/util/hash_cuckoo_rep.h index 669b6b7d4..9f374a978 100644 --- a/util/hash_cuckoo_rep.h +++ b/util/hash_cuckoo_rep.h @@ -28,7 +28,7 @@ class HashCuckooRepFactory : public MemTableRepFactory { virtual ~HashCuckooRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { return "HashCuckooRepFactory"; } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 4573d8340..d8e6da6aa 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -45,10 +45,10 @@ struct SkipListBucketHeader { MemtableSkipList skip_list; explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp, - Arena* arena, uint32_t count) + MemTableAllocator* allocator, uint32_t count) : Counting_header(this, // Pointing to itself to indicate header type. count), - skip_list(cmp, arena) {} + skip_list(cmp, allocator) {} }; struct Node { @@ -143,10 +143,11 @@ struct Node { // which can be significant decrease of memory utilization. 
class HashLinkListRep : public MemTableRep { public: - HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size, - uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, - Logger* logger, int bucket_entries_logging_threshold, + HashLinkListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, const SliceTransform* transform, + size_t bucket_size, uint32_t threshold_use_skiplist, + size_t huge_page_tlb_size, Logger* logger, + int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash); virtual KeyHandle Allocate(const size_t len, char** buf) override; @@ -166,7 +167,7 @@ class HashLinkListRep : public MemTableRep { virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; virtual MemTableRep::Iterator* GetDynamicPrefixIterator( - Arena* arena = nullptr) override; + Arena* arena = nullptr) override; private: friend class DynamicIterator; @@ -233,8 +234,8 @@ class HashLinkListRep : public MemTableRep { class FullListIterator : public MemTableRep::Iterator { public: - explicit FullListIterator(MemtableSkipList* list, Arena* arena) - : iter_(list), full_list_(list), arena_(arena) {} + explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) + : iter_(list), full_list_(list), allocator_(allocator) {} virtual ~FullListIterator() { } @@ -288,7 +289,7 @@ class HashLinkListRep : public MemTableRep { MemtableSkipList::Iterator iter_; // To destruct with the iterator. std::unique_ptr full_list_; - std::unique_ptr arena_; + std::unique_ptr allocator_; std::string tmp_; // For passing to EncodeKey }; @@ -453,13 +454,14 @@ class HashLinkListRep : public MemTableRep { }; HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, + MemTableAllocator* allocator, + const SliceTransform* transform, size_t bucket_size, uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger, int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) - : MemTableRep(arena), + : MemTableRep(allocator), bucket_size_(bucket_size), // Threshold to use skip list doesn't make sense if less than 3, so we // force it to be minimum of 3 to simplify implementation. @@ -469,7 +471,7 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, logger_(logger), bucket_entries_logging_threshold_(bucket_entries_logging_threshold), if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { - char* mem = arena_->AllocateAligned(sizeof(Pointer) * bucket_size, + char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, huge_page_tlb_size, logger); buckets_ = new (mem) Pointer[bucket_size]; @@ -483,7 +485,7 @@ HashLinkListRep::~HashLinkListRep() { } KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { - char* mem = arena_->AllocateAligned(sizeof(Node) + len); + char* mem = allocator_->AllocateAligned(sizeof(Node) + len); Node* x = new (mem) Node(); *buf = x->key; return static_cast(x); @@ -559,7 +561,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { // the new node. Otherwise, we might need to change next pointer of first. // In that case, a reader might sees the next pointer is NULL and wrongly // think the node is a bucket header. 
- auto* mem = arena_->AllocateAligned(sizeof(BucketHeader)); + auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader)); header = new (mem) BucketHeader(first, 1); bucket.store(header, std::memory_order_release); } else { @@ -591,9 +593,9 @@ void HashLinkListRep::Insert(KeyHandle handle) { LinkListIterator bucket_iter( this, reinterpret_cast( first_next_pointer->load(std::memory_order_relaxed))); - auto mem = arena_->AllocateAligned(sizeof(SkipListBucketHeader)); + auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader)); SkipListBucketHeader* new_skip_list_header = new (mem) - SkipListBucketHeader(compare_, arena_, header->num_entries + 1); + SkipListBucketHeader(compare_, allocator_, header->num_entries + 1); auto& skip_list = new_skip_list_header->skip_list; // Add all current entries to the skip list @@ -669,7 +671,7 @@ bool HashLinkListRep::Contains(const char* key) const { } size_t HashLinkListRep::ApproximateMemoryUsage() { - // Memory is always allocated from the arena. + // Memory is always allocated from the allocator. return 0; } @@ -700,7 +702,7 @@ void HashLinkListRep::Get(const LookupKey& k, void* callback_args, MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { // allocate a new arena of similar size to the one currently in use - Arena* new_arena = new Arena(arena_->BlockSize()); + Arena* new_arena = new Arena(allocator_->BlockSize()); auto list = new MemtableSkipList(compare_, new_arena); HistogramImpl keys_per_bucket_hist; @@ -784,9 +786,9 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, } // anon namespace MemTableRep* HashLinkListRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { - return new HashLinkListRep(compare, arena, transform, bucket_count_, + return new HashLinkListRep(compare, allocator, transform, bucket_count_, threshold_use_skiplist_, huge_page_tlb_size_, logger, bucket_entries_logging_threshold_, if_log_bucket_dist_when_flash_); diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h index 0df35b545..629272394 100644 --- a/util/hash_linklist_rep.h +++ b/util/hash_linklist_rep.h @@ -29,7 +29,7 @@ class HashLinkListRepFactory : public MemTableRepFactory { virtual ~HashLinkListRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 1393a917e..4fb226811 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -23,9 +23,10 @@ namespace { class HashSkipListRep : public MemTableRep { public: - HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size, - int32_t skiplist_height, int32_t skiplist_branching_factor); + HashSkipListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, const SliceTransform* transform, + size_t bucket_size, int32_t skiplist_height, + int32_t skiplist_branching_factor); virtual void Insert(KeyHandle handle) override; @@ -62,7 +63,7 @@ class HashSkipListRep : public MemTableRep { const MemTableRep::KeyComparator& compare_; // immutable after construction - Arena* const arena_; + MemTableAllocator* 
const allocator_; inline size_t GetHash(const Slice& slice) const { return MurmurHash(slice.data(), static_cast(slice.size()), 0) % @@ -221,17 +222,19 @@ class HashSkipListRep : public MemTableRep { }; HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, + MemTableAllocator* allocator, + const SliceTransform* transform, size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor) - : MemTableRep(arena), + : MemTableRep(allocator), bucket_size_(bucket_size), skiplist_height_(skiplist_height), skiplist_branching_factor_(skiplist_branching_factor), transform_(transform), compare_(compare), - arena_(arena) { - auto mem = arena->AllocateAligned(sizeof(std::atomic) * bucket_size); + allocator_(allocator) { + auto mem = allocator->AllocateAligned( + sizeof(std::atomic) * bucket_size); buckets_ = new (mem) std::atomic[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { @@ -247,8 +250,8 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( size_t hash = GetHash(transformed); auto bucket = GetBucket(hash); if (bucket == nullptr) { - auto addr = arena_->AllocateAligned(sizeof(Bucket)); - bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, + auto addr = allocator_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, allocator_, skiplist_height_, skiplist_branching_factor_); buckets_[hash].store(bucket, std::memory_order_release); } @@ -291,7 +294,7 @@ void HashSkipListRep::Get(const LookupKey& k, void* callback_args, MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) { // allocate a new arena of similar size to the one currently in use - Arena* new_arena = new Arena(arena_->BlockSize()); + Arena* new_arena = new Arena(allocator_->BlockSize()); auto list = new Bucket(compare_, new_arena); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -322,9 +325,9 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) { } // anon namespace MemTableRep* HashSkipListRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { - return new HashSkipListRep(compare, arena, transform, bucket_count_, + return new HashSkipListRep(compare, allocator, transform, bucket_count_, skiplist_height_, skiplist_branching_factor_); } diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h index 6fec60a47..15d0fc77f 100644 --- a/util/hash_skiplist_rep.h +++ b/util/hash_skiplist_rep.h @@ -26,7 +26,7 @@ class HashSkipListRepFactory : public MemTableRepFactory { virtual ~HashSkipListRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 4f925c7c3..8a8fa7a2e 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -10,6 +10,7 @@ #include "db/db_impl.h" #include "db/log_reader.h" #include "db/filename.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" @@ -44,6 +45,7 @@ const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; const string 
LDBCommand::ARG_BLOCK_SIZE = "block_size"; const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size"; const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; const string LDBCommand::ARG_FILE_SIZE = "file_size"; const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; @@ -276,6 +278,17 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } } + int db_write_buffer_size; + if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE, + db_write_buffer_size, exec_state_)) { + if (db_write_buffer_size >= 0) { + opt.db_write_buffer_size = db_write_buffer_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_DB_WRITE_BUFFER_SIZE + + " must be >= 0."); + } + } + int write_buffer_size; if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, exec_state_)) { @@ -584,7 +597,8 @@ void ManifestDumpCommand::DoCommand() { // SanitizeOptions(), we need to initialize it manually. options.db_paths.emplace_back("dummy", 0); WriteController wc; - VersionSet versions(dbname, &options, sopt, tc.get(), &wc); + WriteBuffer wb(options.db_write_buffer_size); + VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc); Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); if (!s.ok()) { printf("Error in processing file %s %s\n", manifestfile.c_str(), @@ -1111,7 +1125,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); WriteController wc; - VersionSet versions(db_path_, &opt, soptions, tc.get(), &wc); + WriteBuffer wb(opt.db_write_buffer_size); + VersionSet versions(db_path_, &opt, soptions, tc.get(), &wb, &wc); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 7436cc368..fd4d4d4b9 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -53,6 +53,7 @@ public: static const string ARG_COMPRESSION_TYPE; static const string ARG_BLOCK_SIZE; static const string ARG_AUTO_COMPACTION; + static const string ARG_DB_WRITE_BUFFER_SIZE; static const string ARG_WRITE_BUFFER_SIZE; static const string ARG_FILE_SIZE; static const string ARG_CREATE_IF_MISSING; diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index bb6c8ffca..fe2d7d538 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -53,6 +53,8 @@ public: ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + "=\n"); ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=\n"); + ret.append(" --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE + + "=\n"); ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + "=\n"); ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=\n"); diff --git a/util/options.cc b/util/options.cc index c6b883779..085df053d 100644 --- a/util/options.cc +++ b/util/options.cc @@ -17,6 +17,7 @@ #include #include +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" @@ -230,6 +231,7 @@ DBOptions::DBOptions() skip_log_error_on_recovery(false), stats_dump_period_sec(3600), advise_random_on_open(true), + db_write_buffer_size(0), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), bytes_per_sync(0), @@ -273,6 +275,7 @@ DBOptions::DBOptions(const Options& options) skip_log_error_on_recovery(options.skip_log_error_on_recovery), stats_dump_period_sec(options.stats_dump_period_sec), 
advise_random_on_open(options.advise_random_on_open), + db_write_buffer_size(options.db_write_buffer_size), access_hint_on_compaction_start(options.access_hint_on_compaction_start), use_adaptive_mutex(options.use_adaptive_mutex), bytes_per_sync(options.bytes_per_sync), @@ -336,6 +339,8 @@ void DBOptions::Dump(Logger* log) const { stats_dump_period_sec); Log(log, " Options.advise_random_on_open: %d", advise_random_on_open); + Log(log, " Options.db_write_buffer_size: %zd", + db_write_buffer_size); Log(log, " Options.access_hint_on_compaction_start: %s", access_hints[access_hint_on_compaction_start]); Log(log, " Options.use_adaptive_mutex: %d", diff --git a/util/options_helper.cc b/util/options_helper.cc index bea7f1a9d..c2bd3cb83 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -437,6 +437,8 @@ bool GetDBOptionsFromMap( new_options->stats_dump_period_sec = ParseUint32(o.second); } else if (o.first == "advise_random_on_open") { new_options->advise_random_on_open = ParseBoolean(o.first, o.second); + } else if (o.first == "db_write_buffer_size") { + new_options->db_write_buffer_size = ParseUint64(o.second); } else if (o.first == "use_adaptive_mutex") { new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); } else if (o.first == "bytes_per_sync") { diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index 1322f6c9a..ee57372fa 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -18,9 +18,10 @@ class SkipListRep : public MemTableRep { friend class LookaheadIterator; public: - explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, + explicit SkipListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, const SliceTransform* transform, const size_t lookahead) - : MemTableRep(arena), skip_list_(compare, arena), cmp_(compare), + : MemTableRep(allocator), skip_list_(compare, allocator), cmp_(compare), transform_(transform), lookahead_(lookahead) { } @@ -36,7 +37,7 @@ public: } virtual size_t ApproximateMemoryUsage() override { - // All memory is allocated through arena; nothing to report here + // All memory is allocated through allocator; nothing to report here return 0; } @@ -224,9 +225,9 @@ public: } MemTableRep* SkipListFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { - return new SkipListRep(compare, arena, transform, lookahead_); + return new SkipListRep(compare, allocator, transform, lookahead_); } } // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index e61b8ad08..ee38bc304 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -25,7 +25,8 @@ using namespace stl_wrappers; class VectorRep : public MemTableRep { public: - VectorRep(const KeyComparator& compare, Arena* arena, size_t count); + VectorRep(const KeyComparator& compare, MemTableAllocator* allocator, + size_t count); // Insert key into the collection. 
(The caller will pack key and value into a // single buffer and pass that in as the parameter to Insert) @@ -131,8 +132,9 @@ size_t VectorRep::ApproximateMemoryUsage() { ); } -VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) - : MemTableRep(arena), +VectorRep::VectorRep(const KeyComparator& compare, MemTableAllocator* allocator, + size_t count) + : MemTableRep(allocator), bucket_(new Bucket()), immutable_(false), sorted_(false), @@ -282,9 +284,9 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { } // anon namespace MemTableRep* VectorRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform*, Logger* logger) { - return new VectorRep(compare, arena, count_); + return new VectorRep(compare, allocator, count_); } } // namespace rocksdb #endif // ROCKSDB_LITE From 32a0a038447f50a012ae7ac2daad106ba4f1db5c Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Wed, 3 Dec 2014 18:28:39 -0800 Subject: [PATCH 592/829] Add Moved(GB) to Compaction IO stats Summary: Adds counter for bytes moved (files pushed down a level rather than compacted) to compaction IO stats as Moved(GB). From the output removed these infrequently used columns: RW-Amp, Rn(cnt), Rnp1(cnt), Wnp1(cnt), Wnew(cnt). Example old output: Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 0/0 0 0.0 0.0 0.0 0.0 2130.8 2130.8 0.0 0.0 0.0 109.1 0 0 0 0 20002 25068 0.798 28.75 182059 0.16 0 0 L1 142/0 509 1.0 4618.5 2036.5 2582.0 4602.1 2020.2 4.5 2.3 88.5 88.1 24220 701246 1215528 514282 53466 4229 12.643 0.00 0 0.002032745988 300688729 Example new output: Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- L0 7/0 13 1.8 0.0 0.0 0.0 0.6 0.6 0.0 0.0 0.0 14.7 44 353 0.124 0.03 626 0.05 0 0 L1 9/0 16 1.6 0.0 0.0 0.0 0.0 0.0 0.6 0.0 0.0 0.0 0 0 0.000 0.00 0 0.00 0 0 Task ID: # Blame Rev: Test Plan: make check, run db_bench --fillseq --stats_per_interval --stats_interval and look at output Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29787 --- db/db_impl.cc | 2 ++ db/internal_stats.cc | 39 ++++++++++++--------------------------- db/internal_stats.h | 14 ++++++++++++++ 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index bdc0030ae..ce2466a2e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2070,6 +2070,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, *c->mutable_cf_options()); VersionStorageInfo::LevelSummaryStorage tmp; + c->column_family_data()->internal_stats()->IncBytesMoved( + c->level() + 
1, f->fd.GetFileSize()); LogToBuffer(log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 6344be56d..c729ef8d6 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -31,18 +31,18 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { buf, len, "\n** Compaction Stats [%s] **\n" "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " - "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) " - "Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) " + "Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) " + "Comp(sec) Comp(cnt) Avg(sec) " "Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n" "--------------------------------------------------------------------" "--------------------------------------------------------------------" - "--------------------------------------------------------------------\n", + "----------------------------------------------------------\n", cf_name.c_str()); } void PrintLevelStats(char* buf, size_t len, const std::string& name, int num_files, int being_compacted, double total_file_size, double score, - double rw_amp, double w_amp, double stall_us, uint64_t stalls, + double w_amp, double stall_us, uint64_t stalls, const InternalStats::CompactionStats& stats) { uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1; uint64_t bytes_new = stats.bytes_written - stats.bytes_readnp1; @@ -55,15 +55,11 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%8.1f " /* Rnp1(GB) */ "%9.1f " /* Write(GB) */ "%8.1f " /* Wnew(GB) */ - "%6.1f " /* RW-Amp */ + "%9.1f " /* Moved(GB) */ "%5.1f " /* W-Amp */ "%8.1f " /* Rd(MB/s) */ "%8.1f " /* Wr(MB/s) */ - "%8d " /* Rn(cnt) */ - "%9d " /* Rnp1(cnt) */ - "%9d " /* Wnp1(cnt) */ - "%9d " /* Wnew(cnt) */ - "%10.0f " /* Comp(sec) */ + "%9.0f " /* Comp(sec) */ "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ "%10.2f " /* Stall(sec) */ @@ -76,10 +72,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, name.c_str(), num_files, being_compacted, total_file_size / kMB, score, bytes_read / kGB, stats.bytes_readn / kGB, stats.bytes_readnp1 / kGB, stats.bytes_written / kGB, - bytes_new / kGB, rw_amp, w_amp, bytes_read / kMB / elapsed, - stats.bytes_written / kMB / elapsed, stats.files_in_leveln, - stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.files_out_levelnp1 - stats.files_in_levelnp1, + bytes_new / kGB, stats.bytes_moved / kGB, + w_amp, bytes_read / kMB / elapsed, + stats.bytes_written / kMB / elapsed, stats.micros / 1000000.0, stats.count, stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, stall_us / 1000000.0, stalls, @@ -428,29 +423,22 @@ void InternalStats::DumpCFStats(std::string* value) { total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level]; total_slowdown_hard += stall_leveln_slowdown_hard_[level]; total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level]; - int64_t bytes_read = comp_stats_[level].bytes_readn + - comp_stats_[level].bytes_readnp1; - double rw_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 - : (comp_stats_[level].bytes_written + bytes_read) / - static_cast(comp_stats_[level].bytes_readn); double w_amp = (comp_stats_[level].bytes_readn == 0) ? 
0.0 : comp_stats_[level].bytes_written / static_cast(comp_stats_[level].bytes_readn); PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files, files_being_compacted[level], vstorage->NumLevelBytes(level), compaction_score[level], - rw_amp, w_amp, stall_us, stalls, comp_stats_[level]); + w_amp, stall_us, stalls, comp_stats_[level]); value->append(buf); } } uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED]; // Cumulative summary - double rw_amp = (stats_sum.bytes_written + stats_sum.bytes_readn + - stats_sum.bytes_readnp1) / static_cast(curr_ingest + 1); double w_amp = stats_sum.bytes_written / static_cast(curr_ingest + 1); // Stats summary across levels PrintLevelStats(buf, sizeof(buf), "Sum", total_files, - total_files_being_compacted, total_file_size, 0, rw_amp, w_amp, + total_files_being_compacted, total_file_size, 0, w_amp, total_stall_us, total_stall_count, stats_sum); value->append(buf); // Interval summary @@ -458,12 +446,9 @@ void InternalStats::DumpCFStats(std::string* value) { curr_ingest - cf_stats_snapshot_.ingest_bytes + 1; CompactionStats interval_stats(stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); - rw_amp = (interval_stats.bytes_written + - interval_stats.bytes_readn + interval_stats.bytes_readnp1) / - static_cast(interval_ingest); w_amp = interval_stats.bytes_written / static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, - rw_amp, w_amp, total_stall_us - cf_stats_snapshot_.stall_us, + w_amp, total_stall_us - cf_stats_snapshot_.stall_us, total_stall_count - cf_stats_snapshot_.stall_count, interval_stats); value->append(buf); diff --git a/db/internal_stats.h b/db/internal_stats.h index c50809d31..0c98ebcea 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -119,6 +119,9 @@ class InternalStats { // Total bytes written during compaction between levels N and N+1 uint64_t bytes_written; + // Total bytes moved to this level + uint64_t bytes_moved; + // Files read from level N during compaction between levels N and N+1 int files_in_leveln; @@ -143,6 +146,7 @@ class InternalStats { bytes_readn(0), bytes_readnp1(0), bytes_written(0), + bytes_moved(0), files_in_leveln(0), files_in_levelnp1(0), files_out_levelnp1(0), @@ -155,6 +159,7 @@ class InternalStats { bytes_readn(c.bytes_readn), bytes_readnp1(c.bytes_readnp1), bytes_written(c.bytes_written), + bytes_moved(c.bytes_moved), files_in_leveln(c.files_in_leveln), files_in_levelnp1(c.files_in_levelnp1), files_out_levelnp1(c.files_out_levelnp1), @@ -167,6 +172,7 @@ class InternalStats { this->bytes_readn += c.bytes_readn; this->bytes_readnp1 += c.bytes_readnp1; this->bytes_written += c.bytes_written; + this->bytes_moved += c.bytes_moved; this->files_in_leveln += c.files_in_leveln; this->files_in_levelnp1 += c.files_in_levelnp1; this->files_out_levelnp1 += c.files_out_levelnp1; @@ -180,6 +186,7 @@ class InternalStats { this->bytes_readn -= c.bytes_readn; this->bytes_readnp1 -= c.bytes_readnp1; this->bytes_written -= c.bytes_written; + this->bytes_moved -= c.bytes_moved; this->files_in_leveln -= c.files_in_leveln; this->files_in_levelnp1 -= c.files_in_levelnp1; this->files_out_levelnp1 -= c.files_out_levelnp1; @@ -193,6 +200,10 @@ class InternalStats { comp_stats_[level].Add(stats); } + void IncBytesMoved(int level, uint64_t amount) { + comp_stats_[level].bytes_moved += amount; + } + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) { if (soft) { stall_leveln_slowdown_soft_[level] += micros; @@ -329,6 +340,7 @@ class InternalStats { uint64_t 
bytes_readn; uint64_t bytes_readnp1; uint64_t bytes_written; + uint64_t bytes_moved; int files_in_leveln; int files_in_levelnp1; int files_out_levelnp1; @@ -347,6 +359,8 @@ class InternalStats { void AddCompactionStats(int level, const CompactionStats& stats) {} + void IncBytesMoved(int level, uint64_t amount) {} + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {} void AddCFStats(InternalCFStatsType type, uint64_t value) {} From 815f638cd0e7bd2eafda81a0fe2bc105df19a1ce Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 3 Dec 2014 19:06:08 -0800 Subject: [PATCH 593/829] Fix java build --- java/rocksjni/write_batch.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 8adcfdc0f..dbf2e25e2 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -19,6 +19,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/status.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" @@ -337,11 +338,12 @@ jbyteArray Java_org_rocksdb_test_WriteBatchTest_getContents( rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); auto factory = std::make_shared(); rocksdb::Options options; + rocksdb::WriteBuffer wb(options.db_write_buffer_size); options.memtable_factory = factory; rocksdb::MemTable* mem = new rocksdb::MemTable( cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MutableCFOptions(options, - rocksdb::ImmutableCFOptions(options))); + rocksdb::MutableCFOptions(options, rocksdb::ImmutableCFOptions(options)), + &wb); mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); From c0dee851c32f7afebb202856a8da36aeb87ee694 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Thu, 4 Dec 2014 10:34:06 -0800 Subject: [PATCH 594/829] Improve formatting, add missing newlines Summary: Improve formatting Task ID: # Blame Rev: Test Plan: make Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29829 --- include/rocksdb/options.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 09b72ca6b..054d2c3e1 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -322,6 +322,7 @@ struct ColumnFamilyOptions { // // Dynamically changeable through SetOptions() API uint64_t target_file_size_base; + // By default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. // @@ -341,6 +342,7 @@ struct ColumnFamilyOptions { // // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base; + // Default: 10. // // Dynamically changeable through SetOptions() API From 5f719d72027a7e9a9bcd2ae4582fa611bd76cb20 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 11:11:11 -0800 Subject: [PATCH 595/829] Replace exception by setting valid_ = false in DBIter::MergeValuesNewToOld() Summary: Replace exception by setting valid_ = false in DBIter::MergeValuesNewToOld(). Test Plan: Not sure if I am right at this, but it seems we currently don't have a good way to test that code path as it requires dynamically set merge_operator = nullptr at the time while Merge() is calling. 
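For callers, the observable difference is that the failure now surfaces through the iterator's status rather than an exception. A small usage sketch (the helper function is illustrative; db is assumed to be an open rocksdb::DB):

#include <cstdio>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"

// Sketch: an iterator over a DB whose merge operator is missing now simply
// becomes !Valid() with a non-OK status instead of throwing.
void ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // use it->key() / it->value()
  }
  if (!it->status().ok()) {
    // e.g. "Invalid argument: user_merge_operator_ must be set."
    fprintf(stderr, "iteration failed: %s\n",
            it->status().ToString().c_str());
  }
}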
Reviewers: igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29811 --- db/db_iter.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/db_iter.cc b/db/db_iter.cc index 78decd8b1..1b5bf860e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -281,8 +281,9 @@ void DBIter::MergeValuesNewToOld() { if (!user_merge_operator_) { Log(InfoLogLevel::ERROR_LEVEL, logger_, "Options::merge_operator is null."); - throw std::logic_error("DBIter::MergeValuesNewToOld() with" - " Options::merge_operator null"); + status_ = Status::InvalidArgument("user_merge_operator_ must be set."); + valid_ = false; + return; } // Start the merge process by pushing the first operand From 97c19408825cac2de54f3681af44e8a3bff4739c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 11:19:12 -0800 Subject: [PATCH 596/829] Fix compile warning in db_stress.cc on Mac Summary: Fix the following compile warning in db_stress.cc on Mac tools/db_stress.cc:1688:52: error: format specifies type 'unsigned long' but the argument has type '::google::uint64' (aka 'unsigned long long') [-Werror,-Wformat] fprintf(stdout, "DB-write-buffer-size: %lu\n", FLAGS_db_write_buffer_size); ~~~ ^~~~~~~~~~~~~~~~~~~~~~~~~~ %llu Test Plan: make --- tools/db_stress.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index c63d82413..de1ae8c7b 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1685,7 +1685,7 @@ class StressTest { fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); - fprintf(stdout, "DB-write-buffer-size: %lu\n", FLAGS_db_write_buffer_size); + fprintf(stdout, "DB-write-buffer-size: %llu\n", FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); fprintf(stdout, "Iterations : %lu\n", From 1a8f4821a76df0cd7e10bc28f0e9233b79263a8f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 11:41:56 -0800 Subject: [PATCH 597/829] Replace exception by assertion in autovector Summary: Replace exception by assertion in autovector Test Plan: autovector_test Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29847 --- util/autovector.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/util/autovector.h b/util/autovector.h index e143c46cb..9362536d3 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -201,16 +201,12 @@ class autovector { // will check boundry const_reference at(size_type n) const { - if (n >= size()) { - throw std::out_of_range("autovector: index out of range"); - } + assert(n < size()); return (*this)[n]; } reference at(size_type n) { - if (n >= size()) { - throw std::out_of_range("autovector: index out of range"); - } + assert(n < size()); return (*this)[n]; } From a5d4fc0a25b58283bcca6cbac7c47bd2b357356d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 11:59:29 -0800 Subject: [PATCH 598/829] Fix compile warning in db_stress Summary: Fix compile warning in db_stress Test Plan: make db_stress --- tools/db_stress.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index de1ae8c7b..8b5b934a2 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -28,6 +28,8 @@ int main() { } #else 
+#define __STDC_FORMAT_MACROS +#include #include #include #include @@ -1685,7 +1687,8 @@ class StressTest { fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); - fprintf(stdout, "DB-write-buffer-size: %llu\n", FLAGS_db_write_buffer_size); + fprintf(stdout, "DB-write-buffer-size: %" PRIu64 "\n", + FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); fprintf(stdout, "Iterations : %lu\n", From a94d54aa4797136a8ee754bcd220095491e26a51 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 12:01:55 -0800 Subject: [PATCH 599/829] Remove the use of exception in WriteBatch::Handler Summary: Remove the use of exception in WriteBatch::Handler. Now the default implementations of Put, Merge, and Delete in WriteBatch::Handler are no-op. Test Plan: Add three test cases in write_batch_test ./write_batch_test Reviewers: sdong, igor Reviewed By: sdong, igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29835 --- db/write_batch.cc | 14 -------------- db/write_batch_test.cc | 33 +++++++++++++++++++++++++++++++++ include/rocksdb/write_batch.h | 11 +++++++---- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/db/write_batch.cc b/db/write_batch.cc index 3c773d24a..386e7ce1f 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -48,20 +48,6 @@ WriteBatch::~WriteBatch() { } WriteBatch::Handler::~Handler() { } -void WriteBatch::Handler::Put(const Slice& key, const Slice& value) { - // you need to either implement Put or PutCF - throw std::runtime_error("Handler::Put not implemented!"); -} - -void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { - throw std::runtime_error("Handler::Merge not implemented!"); -} - -void WriteBatch::Handler::Delete(const Slice& key) { - // you need to either implement Delete or DeleteCF - throw std::runtime_error("Handler::Delete not implemented!"); -} - void WriteBatch::Handler::LogData(const Slice& blob) { // If the user has not specified something to do with blobs, then we ignore // them. 
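With the throwing defaults gone, a Handler subclass only has to override the callbacks it cares about; records of other types are simply skipped. A minimal sketch in the spirit of the tests added below (class and function names are made up):

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"

// Illustrative handler that only looks at Put records; Merge and Delete now
// fall through to the no-op defaults instead of throwing.
class PutCounter : public rocksdb::WriteBatch::Handler {
 public:
  virtual void Put(const rocksdb::Slice& /*key*/,
                   const rocksdb::Slice& /*value*/) override {
    ++puts;
  }
  int puts = 0;
};

int CountPuts() {
  rocksdb::WriteBatch batch;
  batch.Put("k1", "v1");
  batch.Merge("k2", "v2");  // ignored by PutCounter
  batch.Delete("k3");       // ignored by PutCounter
  PutCounter counter;
  rocksdb::Status s = batch.Iterate(&counter);
  return s.ok() ? counter.puts : -1;  // returns 1
}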
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index e28d02aef..c51d1750f 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -187,6 +187,39 @@ namespace { }; } +TEST(WriteBatchTest, MergeNotImplemented) { + WriteBatch batch; + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST(WriteBatchTest, PutNotImplemented) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Put(k1, v1)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST(WriteBatchTest, DeleteNotImplemented) { + WriteBatch batch; + batch.Delete(Slice("k2")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Delete(k2)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + TEST(WriteBatchTest, Blob) { WriteBatch batch; batch.Put(Slice("k1"), Slice("v1")); diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index db440be02..462a54a59 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -105,10 +105,11 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and PutCF not implemented"); } - virtual void Put(const Slice& key, const Slice& value); + virtual void Put(const Slice& key, const Slice& value) {} + // Merge and LogData are not pure virtual. Otherwise, we would break // existing clients of Handler on a source code level. The default - // implementation of Merge simply throws a runtime exception. + // implementation of Merge does nothing. virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) { if (column_family_id == 0) { @@ -118,7 +119,8 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and MergeCF not implemented"); } - virtual void Merge(const Slice& key, const Slice& value); + virtual void Merge(const Slice& key, const Slice& value) {} + // The default implementation of LogData does nothing. virtual void LogData(const Slice& blob); virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { @@ -129,7 +131,8 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and DeleteCF not implemented"); } - virtual void Delete(const Slice& key); + virtual void Delete(const Slice& key) {} + // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. 
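The db_stress format fixes a few commits above boil down to one portable pattern: define __STDC_FORMAT_MACROS before including <inttypes.h> and print uint64_t values through the PRIu64 macro, because the correct length modifier for uint64_t (%lu vs %llu) differs between Linux and Mac. A self-contained sketch:

#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <cstdio>

// PRIu64 expands to the right length modifier for uint64_t on each platform,
// avoiding the -Wformat error seen on Mac with a hard-coded %lu.
void PrintDbWriteBufferSize(uint64_t db_write_buffer_size) {
  printf("DB-write-buffer-size: %" PRIu64 "\n", db_write_buffer_size);
}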
From c4a7423c1dc0799256cbe84443689c4ad27b184b Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 13:35:31 -0800 Subject: [PATCH 600/829] Replace runtime_error exception by abort() in thread_local Summary: Replace runtime_error exception by abort() in thread_local Test Plan: make dbg -j32 Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29853 --- util/thread_local.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/thread_local.cc b/util/thread_local.cc index bc8a4c7d2..0b6857fff 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -51,7 +51,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) { if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { - throw std::runtime_error("pthread_key_create failed"); + abort(); } head_.next = &head_; head_.prev = &head_; @@ -98,7 +98,7 @@ ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { inst->RemoveThreadData(tls_); } delete tls_; - throw std::runtime_error("pthread_setspecific failed"); + abort(); } } return tls_; From 8f4e1c1c9a9c7f433df9978fc73aeb8224a45263 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 4 Dec 2014 13:54:07 -0800 Subject: [PATCH 601/829] Remove the compability check on log2 OS_ANDROID as it's already blocked by ROCKSDB_LITE Summary: Remove the compability check on log2 OS_ANDROID as it's already blocked by ROCKSDB_LITE Test Plan: make OPT="-DROCKSDB_LITE -DOS_ANDROID" shared_lib -j32 make shared_lib -j32 --- util/options_builder.cc | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/util/options_builder.cc b/util/options_builder.cc index d473ca943..3ac3debd7 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -11,14 +11,6 @@ namespace rocksdb { namespace { -inline double Log2(double n) { -#ifndef OS_ANDROID - return log2(n); -#else - return log(n) / log(2); -#endif -} - // For now, always use 1-0 as level bytes multiplier. const int kBytesForLevelMultiplier = 10; const size_t kBytesForOneMb = 1024 * 1024; @@ -37,7 +29,7 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, ::log(target_db_size / write_buffer_size) / ::log(kBytesForLevelMultiplier))); int expected_max_files_universal = - static_cast(ceil(Log2(target_db_size / write_buffer_size))); + static_cast(ceil(log2(target_db_size / write_buffer_size))); const int kEstimatedLevel0FilesInLevelStyle = 2; // Estimate write amplification: From 9260e1ad743900e269ab20f27fb0af500a79ac37 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 5 Dec 2014 11:05:24 -0800 Subject: [PATCH 602/829] Bump version to 3.9 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index fde546c89..339933853 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 8 +#define ROCKSDB_MINOR 9 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From d84b2badebe7f1a5313d098f8f6acb02166aba06 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 5 Dec 2014 13:30:57 -0800 Subject: [PATCH 603/829] Replace exception by abort() in dummy HdfsEnv implementation. Summary: Replace exception by abort() in dummy HdfsEnv implementation. 
Test Plan: make dbg -j32 Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29895 --- hdfs/env_hdfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index 475ea7cab..67f5613de 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -238,7 +238,7 @@ class HdfsEnv : public Env { explicit HdfsEnv(const std::string& fsname) { fprintf(stderr, "You have not build rocksdb with HDFS support\n"); fprintf(stderr, "Please see hdfs/README for details\n"); - throw std::exception(); + abort(); } virtual ~HdfsEnv() { From 6436ba6b06cd4e09a3cd29f6f9f47bd242a6e935 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Fri, 5 Dec 2014 16:16:56 -0800 Subject: [PATCH 604/829] Provide mechanism to restart tests from previous error Summary: While running rocksdb tests, we sometimes encounter errors and the test run stops. We now provide a new make target call check_some which restarts the test run from a specific test and continues from there depending on the value of the environment variable ROCKSDBTESTS_START Test Plan: Run make check_some with different values of ROCKSDBTESTS_START. Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29913 --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 513659900..f22ac59c4 100644 --- a/Makefile +++ b/Makefile @@ -161,6 +161,8 @@ TESTS = \ compaction_job_test \ thread_list_test +SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) + TOOLS = \ sst_dump \ db_sanity_test \ @@ -247,6 +249,10 @@ check: $(TESTS) ldb for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done python tools/ldb_test.py +check_some: $(SUBSET) ldb + for t in $(SUBSET); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py + ldb_tests: ldb python tools/ldb_test.py From 1f04066cab7b6b542301f8686db7b9e2e8c12bc6 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 5 Dec 2014 16:12:10 -0800 Subject: [PATCH 605/829] Add DBProperty to return number of snapshots and time for oldest snapshot Summary: Add a counter in SnapshotList to show number of snapshots. Also a unix timestamp in every snapshot. Add two DB Properties to return number of snapshots and timestamp of the oldest one. Test Plan: Add unit test checking Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba, MarkCallaghan Differential Revision: https://reviews.facebook.net/D29919 --- db/db_impl.cc | 5 ++++- db/db_impl.h | 2 ++ db/db_test.cc | 42 +++++++++++++++++++++++++++++++++++++++++- db/internal_stats.cc | 10 ++++++++++ db/internal_stats.h | 2 ++ db/snapshot.h | 19 ++++++++++++++++++- 6 files changed, 77 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index ce2466a2e..a090deb76 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2718,10 +2718,13 @@ bool DBImpl::IsSnapshotSupported() const { } const Snapshot* DBImpl::GetSnapshot() { + int64_t unix_time = 0; + env_->GetCurrentTime(&unix_time); // Ignore error + MutexLock l(&mutex_); // returns null if the underlying memtable does not support snapshot. 
if (!IsSnapshotSupported()) return nullptr; - return snapshots_.New(versions_->LastSequence()); + return snapshots_.New(versions_->LastSequence(), unix_time); } void DBImpl::ReleaseSnapshot(const Snapshot* s) { diff --git a/db/db_impl.h b/db/db_impl.h index c2c3969c1..189937b0f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -248,6 +248,8 @@ class DBImpl : public DB { ColumnFamilyHandle* DefaultColumnFamily() const; + const SnapshotList& snapshots() const { return snapshots_; } + protected: Env* const env_; const std::string dbname_; diff --git a/db/db_test.cc b/db/db_test.cc index ccc7597a2..5d40a7b33 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -173,7 +173,9 @@ class SpecialEnv : public EnvWrapper { std::function* table_write_callback_; - explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301) { + int64_t addon_time_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301), addon_time_(0) { delay_sstable_sync_.store(false, std::memory_order_release); drop_writes_.store(false, std::memory_order_release); no_space_.store(false, std::memory_order_release); @@ -368,6 +370,14 @@ class SpecialEnv : public EnvWrapper { sleep_counter_.Increment(); target()->SleepForMicroseconds(micros); } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s = target()->GetCurrentTime(unix_time); + if (s.ok()) { + *unix_time += addon_time_; + } + return s; + } }; class DBTest { @@ -814,6 +824,19 @@ class DBTest { return result; } + uint64_t GetNumSnapshots() { + uint64_t int_num; + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num)); + return int_num; + } + + uint64_t GetTimeOldestSnapshots() { + uint64_t int_num; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num)); + return int_num; + } + // Return a string that contains all key,value pairs in order, // formatted like "(k1->v1)(k2->v2)". 
std::string Contents(int cf = 0) { @@ -5429,13 +5452,25 @@ TEST(DBTest, Snapshot) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(0, "foo", "0v1"); Put(1, "foo", "1v1"); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); Put(0, "foo", "0v2"); Put(1, "foo", "1v2"); + + env_->addon_time_++; + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); Put(0, "foo", "0v3"); Put(1, "foo", "1v3"); + const Snapshot* s3 = db_->GetSnapshot(); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); Put(0, "foo", "0v4"); Put(1, "foo", "1v4"); @@ -5449,6 +5484,8 @@ TEST(DBTest, Snapshot) { ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s3); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -5461,8 +5498,11 @@ TEST(DBTest, Snapshot) { ASSERT_EQ("1v2", Get(1, "foo", s2)); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); } while (ChangeOptions(kSkipHashCuckoo)); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c729ef8d6..c14a03c12 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -136,6 +136,10 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, return kEstimatedUsageByTableReaders; } else if (in == "is-file-deletions-enabled") { return kIsFileDeletionEnabled; + } else if (in == "num-snapshots") { + return kNumSnapshots; + } else if (in == "oldest-snapshot-time") { + return kOldestSnapshotTime; } return kUnknown; } @@ -263,6 +267,12 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, cfd_->imm()->current()->GetTotalNumEntries() + vstorage->GetEstimatedActiveKeys(); return true; + case kNumSnapshots: + *value = db->snapshots().count(); + return true; + case kOldestSnapshotTime: + *value = static_cast(db->snapshots().GetOldestSnapshotTime()); + return true; #ifndef ROCKSDB_LITE case kIsFileDeletionEnabled: *value = db->IsFileDeletionsEnabled(); diff --git a/db/internal_stats.h b/db/internal_stats.h index 0c98ebcea..96c13e03b 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -46,6 +46,8 @@ enum DBPropertyType : uint32_t { kEstimatedUsageByTableReaders, // Estimated memory by table readers. 
kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_, // 0 means file deletions enabled + kNumSnapshots, // Number of snapshots in the system + kOldestSnapshotTime, // Unix timestamp of the first snapshot }; extern DBPropertyType GetPropertyType(const Slice& property, diff --git a/db/snapshot.h b/db/snapshot.h index 51fa556c8..45c66eabc 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -28,6 +28,8 @@ class SnapshotImpl : public Snapshot { SnapshotImpl* next_; SnapshotList* list_; // just for sanity checks + + int64_t unix_time_; }; class SnapshotList { @@ -36,20 +38,23 @@ class SnapshotList { list_.prev_ = &list_; list_.next_ = &list_; list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + count_ = 0; } bool empty() const { return list_.next_ == &list_; } SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - const SnapshotImpl* New(SequenceNumber seq) { + const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) { SnapshotImpl* s = new SnapshotImpl; s->number_ = seq; + s->unix_time_ = unix_time; s->list_ = this; s->next_ = &list_; s->prev_ = list_.prev_; s->prev_->next_ = s; s->next_->prev_ = s; + count_++; return s; } @@ -57,6 +62,7 @@ class SnapshotList { assert(s->list_ == this); s->prev_->next_ = s->next_; s->next_->prev_ = s->prev_; + count_--; delete s; } @@ -78,9 +84,20 @@ class SnapshotList { return newest()->number_; } + int64_t GetOldestSnapshotTime() const { + if (empty()) { + return 0; + } else { + return oldest()->unix_time_; + } + } + + uint64_t count() const { return count_; } + private: // Dummy head of doubly-linked list of snapshots SnapshotImpl list_; + uint64_t count_; }; } // namespace rocksdb From 8c5781666ecca4a2b8a58e97ea7b7c4c0575aa0f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 5 Dec 2014 21:34:20 -0800 Subject: [PATCH 606/829] Add -fno-exceptions flag to ROCKSDB_LITE. Summary: Add -fno-exceptions flag to ROCKSDB_LITE. Test Plan: make OPT=-DROCKSDB_LITE shared_lib -j32 Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29901 --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index f22ac59c4..1ca41f8fe 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,12 @@ else OPT += -DNDEBUG endif +ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) + # found + CFLAGS += -fno-exceptions + CXXFLAGS += -fno-exceptions +endif + # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. ifdef COMPILE_WITH_ASAN # ASAN compile flags From 635c61fd3b97c02bf0704cf35afc3a2ddb179041 Mon Sep 17 00:00:00 2001 From: Leonidas Galanis Date: Mon, 8 Dec 2014 12:53:24 -0800 Subject: [PATCH 607/829] Fix problem with create_if_missing option when wal_dir is used Summary: When wal_dir is used, DestroyDB is not passed the wal_dir option and so we get a Corruption exception. Test Plan: Verified manually that the following command line works now: ./db_bench --db=/mnt/db/rocksdb ... --disable_wal=0 --wal_dir=/data/users/rocksdb/WAL... --benchmarks=filluniquerandom --use_existing_db=0... 
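In application code the same rule applies: DestroyDB should be given the identical Options (including wal_dir) that the database was opened with. A sketch with hypothetical paths:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch: DestroyDB only knows about a separate WAL directory if it receives
// the same options the DB was opened with; the paths below are made up.
rocksdb::Status RecreateDb() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.wal_dir = "/data/wal";  // WAL lives outside the DB directory
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/data/db", &db);
  if (!s.ok()) {
    return s;
  }
  delete db;
  // Reuse the same options; a default-constructed Options() would miss the
  // WAL files under wal_dir, which is what the fix above addresses.
  return rocksdb::DestroyDB("/data/db", options);
}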
Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29859 --- db/db_bench.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index c7fd0365c..34531cc3e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1367,11 +1367,13 @@ class Benchmark { } void Run() { + Options open_options; // keep options around to properly destroy db later + if (!SanityCheck()) { exit(1); } PrintHeader(); - Open(); + Open(&open_options); const char* benchmarks = FLAGS_benchmarks.c_str(); while (benchmarks != nullptr) { const char* sep = strchr(benchmarks, ','); @@ -1532,15 +1534,15 @@ class Benchmark { delete db_.db; db_.db = nullptr; db_.cfh.clear(); - DestroyDB(FLAGS_db, Options()); + DestroyDB(FLAGS_db, open_options); } for (size_t i = 0; i < multi_dbs_.size(); i++) { delete multi_dbs_[i].db; - DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options()); + DestroyDB(GetDbNameForMultiple(FLAGS_db, i), open_options); } multi_dbs_.clear(); } - Open(); + Open(&open_options); // use open_options for the last accessed } if (method != nullptr) { @@ -1832,9 +1834,11 @@ class Benchmark { } } - void Open() { + void Open(Options* opts) { + Options& options = *opts; + assert(db_.db == nullptr); - Options options; + options.create_if_missing = !FLAGS_use_existing_db; options.create_missing_column_families = FLAGS_num_column_families > 1; options.db_write_buffer_size = FLAGS_db_write_buffer_size; From 1b7fbb9e82e80294130fcc70fc4ff9e29e363bf7 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 8 Dec 2014 15:19:48 -0800 Subject: [PATCH 608/829] Update HISTORY.md for release 3.9 --- HISTORY.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f2b5bf873..ad626711f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,7 +1,11 @@ # Rocksdb Change Log -### Unreleased Features -* Add rocksdb::GetThreadList(), which returns the current status of all rocksdb-related threads. +### 3.9.0 (12/8/2014) + +### New Features +* Add rocksdb::GetThreadList(), which in the future will return the current status of all + rocksdb-related threads. We will have more code instruments in the following RocksDB + releases. ### Public API changes * New API to create a checkpoint added. Given a directory name, creates a new @@ -10,6 +14,9 @@ implementation of the API LinkFile will have to be provided. * MemTableRep takes MemTableAllocator instead of Arena +### Improvements +* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. + ## 3.8.0 (11/14/2014) ### Public API changes From 046ba7d47c30536b38e59de27ed6d7e8b16d4efb Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 8 Dec 2014 12:52:18 -0800 Subject: [PATCH 609/829] Fix calculation of max_total_wal_size in db_options_.max_total_wal_size == 0 case Summary: This is a regression bug introduced by https://reviews.facebook.net/D24729 . max_total_wal_size would be off the target it should be more and more in the case that the a user holds the current super version after flush or compaction. 
This patch fixes it Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: ljin, yoshinorim, MarkCallaghan, hermanlee4, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29961 --- db/column_family_test.cc | 24 ++++++++++++++++++++++-- db/db_impl.cc | 14 +++++++++----- db/db_impl.h | 4 ++++ include/rocksdb/options.h | 2 +- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 686bab20d..88479323c 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -529,8 +529,28 @@ TEST(ColumnFamilyTest, FlushTest) { ASSERT_OK(Put(1, "mirko", "v3")); ASSERT_OK(Put(0, "foo", "v2")); ASSERT_OK(Put(2, "fodor", "v5")); - for (int i = 0; i < 3; ++i) { - Flush(i); + + for (int j = 0; j < 2; j++) { + ReadOptions ro; + std::vector iterators; + // Hold super version. + if (j == 0) { + ASSERT_OK(db_->NewIterators(ro, handles_, &iterators)); + } + + for (int i = 0; i < 3; ++i) { + uint64_t max_total_in_memory_state = + dbfull()->TEST_max_total_in_memory_state(); + Flush(i); + ASSERT_EQ(dbfull()->TEST_max_total_in_memory_state(), + max_total_in_memory_state); + } + ASSERT_OK(Put(1, "foofoo", "bar")); + ASSERT_OK(Put(0, "foofoo", "bar")); + + for (auto* it : iterators) { + delete it; + } } Reopen(); diff --git a/db/db_impl.cc b/db/db_impl.cc index a090deb76..965154417 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2272,6 +2272,15 @@ SuperVersion* DBImpl::InstallSuperVersion( ColumnFamilyData* cfd, SuperVersion* new_sv, const MutableCFOptions& mutable_cf_options) { mutex_.AssertHeld(); + + // Update max_total_in_memory_state_ + size_t old_memtable_size = 0; + auto* old_sv = cfd->GetSuperVersion(); + if (old_sv) { + old_memtable_size = old_sv->mutable_cf_options.write_buffer_size * + old_sv->mutable_cf_options.max_write_buffer_number; + } + auto* old = cfd->InstallSuperVersion( new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options); @@ -2281,11 +2290,6 @@ SuperVersion* DBImpl::InstallSuperVersion( MaybeScheduleFlushOrCompaction(); // Update max_total_in_memory_state_ - size_t old_memtable_size = 0; - if (old) { - old_memtable_size = old->mutable_cf_options.write_buffer_size * - old->mutable_cf_options.max_write_buffer_number; - } max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size + mutable_cf_options.write_buffer_size * diff --git a/db/db_impl.h b/db/db_impl.h index 189937b0f..6577733b6 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -230,6 +230,10 @@ class DBImpl : public DB { // REQUIRES: mutex locked // pass the pointer that you got from TEST_BeginWrite() void TEST_EndWrite(void* w); + + uint64_t TEST_max_total_in_memory_state() { + return max_total_in_memory_state_; + } #endif // ROCKSDB_LITE // Returns the list of live files in 'live' and the list diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 054d2c3e1..298ec6aee 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -686,7 +686,7 @@ struct DBOptions { // column families whose memtables are backed by the oldest live WAL file // (i.e. the ones that are causing all the space amplification). 
If set to 0 // (default), we will dynamically choose the WAL size limit to be - // [sum of all write_buffer_size * max_write_buffer_number] * 2 + // [sum of all write_buffer_size * max_write_buffer_number] * 4 // Default: 0 uint64_t max_total_wal_size; From cb82d7b081105dc4d6277ac93b67ce76a5287283 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 9 Dec 2014 10:22:07 -0800 Subject: [PATCH 610/829] Fix #434 Summary: Why do we assert here? This doesn't seem like user friendly thing to do :) Test Plan: none Reviewers: sdong, yhchiang, rven Reviewed By: rven Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30027 --- util/env_posix.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 039e79c4a..30997a904 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1694,11 +1694,10 @@ class PosixEnv : public Env { } if (num > total_threads_limit_ || (num < total_threads_limit_ && allow_reduce)) { - total_threads_limit_ = num; + total_threads_limit_ = std::max(1, num); WakeUpAllThreads(); StartBGThreads(); } - assert(total_threads_limit_ > 0); PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } From e93f044d995447bcd053d74512700a6473edbac1 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Wed, 10 Dec 2014 13:04:58 -0800 Subject: [PATCH 611/829] add range scan test to benchmark script Summary: as title Test Plan: ran it Reviewers: yhchiang, igor, sdong, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D25563 --- tools/benchmark.sh | 40 +++++++++++++++++++++++++++++++++++----- tools/run_flash_bench.sh | 7 +++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 431999340..2ea300d32 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -28,6 +28,7 @@ fi num_read_threads=${NUM_READ_THREADS:-16} writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) +num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} # (only for rangescanwhilewriting) cache_size=$((1 * G)) duration=${DURATION:-0} @@ -56,8 +57,6 @@ const_params=" --target_file_size_base=$((128 * M)) \ --max_bytes_for_level_base=$((1 * G)) \ \ - --sync=0 \ - --disable_data_sync=1 \ --verify_checksum=1 \ --delete_obsolete_files_period_micros=$((60 * M)) \ --max_grandparent_overlap_factor=10 \ @@ -93,6 +92,7 @@ function run_bulkload { --use_existing_db=0 \ --num=$num_keys \ --disable_auto_compactions=1 \ + --sync=0 \ --disable_data_sync=1 \ --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log @@ -102,6 +102,7 @@ function run_bulkload { --use_existing_db=1 \ --num=$num_keys \ --disable_auto_compactions=1 \ + --sync=0 \ --disable_data_sync=1 \ --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" echo $cmd | tee $output_dir/benchmark_bulkload_compact.log @@ -113,6 +114,8 @@ function run_fillseq { cmd="./db_bench $params_w --benchmarks=fillseq \ --use_existing_db=0 \ --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ --threads=1 2>&1 | tee $output_dir/benchmark_fillseq.log" echo $cmd | tee $output_dir/benchmark_fillseq.log eval $cmd @@ -123,6 +126,8 @@ function run_overwrite { cmd="./db_bench $params_w --benchmarks=overwrite \ --use_existing_db=1 \ --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ --threads=1 2>&1 | tee $output_dir/benchmark_overwrite.log" echo $cmd | tee $output_dir/benchmark_overwrite.log 
eval $cmd @@ -133,6 +138,8 @@ function run_filluniquerandom { cmd="./db_bench $params_w --benchmarks=filluniquerandom \ --use_existing_db=0 \ --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ --threads=1 2>&1 | tee $output_dir/benchmark_filluniquerandom.log" echo $cmd | tee $output_dir/benchmark_filluniquerandom.log eval $cmd @@ -155,6 +162,8 @@ function run_readwhilewriting { cmd="./db_bench $params_r --benchmarks=readwhilewriting \ --use_existing_db=1 \ --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ --threads=$num_read_threads \ --writes_per_second=$writes_per_second \ 2>&1 | tee $output_dir/benchmark_readwhilewriting.log" @@ -162,13 +171,27 @@ function run_readwhilewriting { eval $cmd } +function run_rangescanwhilewriting { + echo "Range scan $num_keys random keys from database whiling writing.." + cmd="./db_bench $params_r --benchmarks=seekrandomwhilewriting \ + --use_existing_db=1 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=$num_read_threads \ + --writes_per_second=$writes_per_second \ + --seek_nexts=$num_nexts_per_seek \ + 2>&1 | tee $output_dir/benchmark_rangescanwhilewriting.log" + echo $cmd | tee $output_dir/benchmark_rangescanwhilewriting.log + eval $cmd +} + function now() { echo `date +"%s"` } report="$output_dir/report.txt" -# print start time echo "===== Benchmark =====" # Run!!! @@ -188,6 +211,8 @@ for job in ${jobs[@]}; do run_readrandom elif [ $job = readwhilewriting ]; then run_readwhilewriting + elif [ $job = rangescanwhilewriting ]; then + run_rangescanwhilewriting else echo "unknown job $job" exit @@ -195,12 +220,17 @@ for job in ${jobs[@]}; do end=$(now) echo "Complete $job in $((end-start)) seconds" | tee -a $report - if [[ $job = readrandom || $job = readwhilewriting ]]; then + if [[ $job = readrandom || $job = readwhilewriting || $job == rangescanwhilewriting ]]; then + lat=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $3}') qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log") p50=$(echo $line | awk '{print $7}') p99=$(echo $line | awk '{print $13}') - echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report + print_percentile=$(echo "$p50 != 0 || $p99 != 0" | bc); + if [ $print_percentile == "1" ]; then + echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report + fi echo "QPS = $qps ops/sec" | tee -a $report + echo "Avg Latency = $lat micros/op " | tee -a $report fi done diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh index affebe27c..2d2fd2ade 100755 --- a/tools/run_flash_bench.sh +++ b/tools/run_flash_bench.sh @@ -18,6 +18,7 @@ db_dir="/tmp/rocksdb/" wal_dir="/tmp/rocksdb/" output_dir="/tmp/output" + # Test 1: bulk load OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ ./benchmark.sh bulkload @@ -43,3 +44,9 @@ OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ ./benchmark.sh readwhilewriting + +# Test 6: random seek + next()'s while writing +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ + NUM_NEXTS_PER_SEEK=10 \ + ./benchmark.sh rangescanwhilewriting From 0ab0242f373608c304aead5a291335a784ae0176 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 9 Dec 
2014 20:33:43 -0800 Subject: [PATCH 612/829] VersionBuilder to use unordered set and map to store added and deleted files Summary: Set operations in VerisonBuilder is shown as a performance bottleneck of restarting DB when there are lots of files. Make both of added_files and deleted_files use unordered set or map. Only when adding the files, sort the added files. Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: hermanlee4, leveldb, dhruba, ljin Differential Revision: https://reviews.facebook.net/D30051 --- db/version_builder.cc | 70 ++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index a360ab02a..abb0d4b58 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include "db/dbformat.h" @@ -69,10 +71,10 @@ class VersionBuilder::Rep { } }; - typedef std::set FileSet; struct LevelState { - std::set deleted_files; - FileSet* added_files; + std::unordered_set deleted_files; + // Map from file number to file meta data. + std::unordered_map added_files; }; const EnvOptions& env_options_; @@ -93,25 +95,13 @@ class VersionBuilder::Rep { level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; level_nonzero_cmp_.internal_comparator = base_vstorage_->InternalComparator(); - - levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_vstorage_->num_levels(); level++) { - levels_[level].added_files = new FileSet(level_nonzero_cmp_); - } } ~Rep() { for (int level = 0; level < base_vstorage_->num_levels(); level++) { - const FileSet* added = levels_[level].added_files; - std::vector to_unref; - to_unref.reserve(added->size()); - for (FileSet::const_iterator it = added->begin(); it != added->end(); - ++it) { - to_unref.push_back(*it); - } - delete added; - for (uint32_t i = 0; i < to_unref.size(); i++) { - FileMetaData* f = to_unref[i]; + const auto& added = levels_[level].added_files; + for (auto& pair : added) { + FileMetaData* f = pair.second; f->refs--; if (f->refs <= 0) { if (f->table_reader_handle) { @@ -175,27 +165,20 @@ class VersionBuilder::Rep { // is possibly moved from lower level to higher level in current // version for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) { - const FileSet* added = levels_[l].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + auto& level_added = levels_[l].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; + break; } } // maybe this file was added in a previous edit that was Applied if (!found) { - const FileSet* added = levels_[level].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } + auto& level_added = levels_[level].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; } } if (!found) { @@ -224,8 +207,10 @@ class VersionBuilder::Rep { FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; + assert(levels_[level].added_files.find(f->fd.GetNumber()) == + levels_[level].added_files.end()); levels_[level].deleted_files.erase(f->fd.GetNumber()); 
- levels_[level].added_files->insert(f); + levels_[level].added_files[f->fd.GetNumber()] = f; } } @@ -241,8 +226,16 @@ class VersionBuilder::Rep { const auto& base_files = base_vstorage_->LevelFiles(level); auto base_iter = base_files.begin(); auto base_end = base_files.end(); - const auto& added_files = *levels_[level].added_files; - vstorage->Reserve(level, base_files.size() + added_files.size()); + const auto& unordered_added_files = levels_[level].added_files; + vstorage->Reserve(level, + base_files.size() + unordered_added_files.size()); + + // Sort added files for the level. + autovector added_files; + for (const auto& pair : unordered_added_files) { + added_files.push_back(pair.second); + } + std::sort(added_files.begin(), added_files.end(), cmp); for (const auto& added : added_files) { // Add all smaller files listed in base_ @@ -266,7 +259,8 @@ class VersionBuilder::Rep { void LoadTableHandlers() { assert(table_cache_ != nullptr); for (int level = 0; level < base_vstorage_->num_levels(); level++) { - for (auto& file_meta : *(levels_[level].added_files)) { + for (auto& file_meta_pair : levels_[level].added_files) { + auto* file_meta = file_meta_pair.second; assert(!file_meta->table_reader_handle); table_cache_->FindTable( env_options_, *(base_vstorage_->InternalComparator()), From ee95cae9a4718c87c6a64da2a136672daec89b42 Mon Sep 17 00:00:00 2001 From: Alexey Maykov Date: Tue, 21 Oct 2014 11:49:13 -0700 Subject: [PATCH 613/829] Modifed the LRU cache eviction code so that it doesn't evict blocks which have exteranl references Summary: Currently, blocks which have more than one reference (ie referenced by something other than cache itself) are evicted from cache. This doesn't make much sense: - blocks are still in RAM, so the RAM usage reported by the cache is incorrect - if the same block is needed by another iterator, it will be loaded and decompressed again This diff changes the reference counting scheme a bit. Previously, if the cache contained the block, this was accounted for in its refcount. After this change, the refcount is only used to track external references. There is a boolean flag which indicates whether or not the block is contained in the cache. This diff also changes how LRU list is used. Previously, both hashtable and the LRU list contained all blocks. After this change, the LRU list contains blocks with the refcount==0, ie those which can be evicted from the cache. Note that this change still allows for cache to grow beyond its capacity. This happens when all blocks are pinned (ie refcount>0). This is consistent with the current behavior. The cache's insert function never fails. I spent lots of time trying to make table_reader and other places work with the insert which might failed. It turned out to be pretty hard. It might really destabilize some customers, so finally, I decided against doing this. 
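In terms of the public Cache API the new behavior looks roughly like the sketch
below (modeled on the OverCapacity test added in this patch; the function name,
keys and the tiny capacity are illustrative only):

    #include <cassert>
    #include <string>
    #include "rocksdb/cache.h"

    namespace {
    void DeleteStringValue(const rocksdb::Slice& /*key*/, void* value) {
      delete static_cast<std::string*>(value);
    }
    }  // namespace

    void PinnedEntriesSurviveEviction() {
      auto cache = rocksdb::NewLRUCache(1 /* capacity */, 0 /* one shard */);

      rocksdb::Cache::Handle* h1 =
          cache->Insert("a", new std::string("va"), 1, &DeleteStringValue);

      // The cache is already full, but "a" is pinned by h1, so inserting "b"
      // cannot evict it; Insert() still succeeds and usage grows past capacity.
      rocksdb::Cache::Handle* h2 =
          cache->Insert("b", new std::string("vb"), 1, &DeleteStringValue);
      assert(cache->GetUsage() == 2);

      cache->Release(h1);  // the cache may now shed "a" to get back under capacity
      cache->Release(h2);
    }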
table_cache_remove_scan_count_limit option will be unneeded after this change, but I will remove it in the following diff, if this one gets approved Test Plan: Ran tests, made sure they pass Reviewers: sdong, ljin Differential Revision: https://reviews.facebook.net/D25503 --- HISTORY.md | 3 + util/cache.cc | 196 ++++++++++++++++++++++++++++++--------------- util/cache_test.cc | 110 ++++++++++--------------- 3 files changed, 175 insertions(+), 134 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ad626711f..bdb3325fe 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,8 @@ # Rocksdb Change Log +### Unreleased Features +* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted + ### 3.9.0 (12/8/2014) ### New Features diff --git a/util/cache.cc b/util/cache.cc index b1d8a19c3..d64ab00e2 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -26,8 +26,27 @@ namespace { // LRU cache implementation -// An entry is a variable length heap-allocated structure. Entries -// are kept in a circular doubly linked list ordered by access time. +// An entry is a variable length heap-allocated structure. +// Entries are referenced by cache and/or by any external entity. +// The cache keeps all its entries in table. Some elements +// are also stored on LRU list. +// +// LRUHandle can be in these states: +// 1. Referenced externally AND in hash table. +// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) +// 2. Not referenced externally and in hash table. In that case the entry is +// in the LRU and can be freed. (refs == 1 && in_cache == true) +// 3. Referenced externally and not in hash table. In that case the entry is +// in not on LRU and not in table. (refs >= 1 && in_cache == false) +// +// All newly created LRUHandles are in state 1. If you call LRUCache::Release +// on entry in state 1, it will go into state 2. To move from state 1 to +// state 3, either call LRUCache::Erase or LRUCache::Insert with the same key. +// To move from state 2 to state 1, use LRUCache::Lookup. +// Before destruction, make sure that no handles are in state 1. This means +// that any successful LRUCache::Lookup/LRUCache::Insert have a matching +// RUCache::Release (to move into state 2) or LRUCache::Erase (for state 3) + struct LRUHandle { void* value; void (*deleter)(const Slice&, void* value); @@ -36,7 +55,9 @@ struct LRUHandle { LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? 
size_t key_length; - uint32_t refs; + uint32_t refs; // a number of refs to this entry + // cache itself is counted as 1 + bool in_cache; // true, if this entry is referenced by the hash table uint32_t hash; // Hash of key(); used for fast sharding and comparisons char key_data[1]; // Beginning of key @@ -49,6 +70,12 @@ struct LRUHandle { return Slice(key_data, key_length); } } + + void Free() { + assert((refs == 1 && in_cache) || (refs == 0 && !in_cache)); + (*deleter)(key(), value); + free(this); + } }; // We provide our own simple hash table since it removes a whole bunch @@ -59,7 +86,28 @@ struct LRUHandle { class HandleTable { public: HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } - ~HandleTable() { delete[] list_; } + + template + void ApplyToAllCacheEntries(T func) { + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + auto n = h->next_hash; + assert(h->in_cache); + func(h); + h = n; + } + } + } + + ~HandleTable() { + ApplyToAllCacheEntries([](LRUHandle* h) { + if (h->refs == 1) { + h->Free(); + } + }); + delete[] list_; + } LRUHandle* Lookup(const Slice& key, uint32_t hash) { return *FindPointer(key, hash); @@ -173,8 +221,6 @@ class LRUCache { // Just reduce the reference count by 1. // Return true if last reference bool Unref(LRUHandle* e); - // Call deleter and free - void FreeEntry(LRUHandle* e); // Initialized before use. size_t capacity_; @@ -188,6 +234,7 @@ class LRUCache { // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. + // LRU contains items which can be evicted, ie reference only by cache LRUHandle lru_; HandleTable table_; @@ -200,16 +247,7 @@ LRUCache::LRUCache() lru_.prev = &lru_; } -LRUCache::~LRUCache() { - for (LRUHandle* e = lru_.next; e != &lru_; ) { - LRUHandle* next = e->next; - assert(e->refs == 1); // Error if caller has an unreleased handle - if (Unref(e)) { - FreeEntry(e); - } - e = next; - } -} +LRUCache::~LRUCache() {} bool LRUCache::Unref(LRUHandle* e) { assert(e->refs > 0); @@ -217,47 +255,48 @@ bool LRUCache::Unref(LRUHandle* e) { return e->refs == 0; } -void LRUCache::FreeEntry(LRUHandle* e) { - assert(e->refs == 0); - (*e->deleter)(e->key(), e->value); - free(e); -} +// Call deleter and free void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { if (thread_safe) { mutex_.Lock(); } - for (auto e = lru_.next; e != &lru_; e = e->next) { - callback(e->value, e->charge); - } + table_.ApplyToAllCacheEntries([callback](LRUHandle* h) { + callback(h->value, h->charge); + }); if (thread_safe) { mutex_.Unlock(); } } void LRUCache::LRU_Remove(LRUHandle* e) { + assert(e->next != nullptr); + assert(e->prev != nullptr); e->next->prev = e->prev; e->prev->next = e->next; - usage_ -= e->charge; + e->prev = e->next = nullptr; } void LRUCache::LRU_Append(LRUHandle* e) { // Make "e" newest entry by inserting just before lru_ + assert(e->next == nullptr); + assert(e->prev == nullptr); e->next = &lru_; e->prev = lru_.prev; e->prev->next = e; e->next->prev = e; - usage_ += e->charge; } Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { + assert(e->in_cache); + if (e->refs == 1) { + LRU_Remove(e); + } e->refs++; - LRU_Remove(e); - LRU_Append(e); } return reinterpret_cast(e); } @@ -268,9 +307,31 @@ void LRUCache::Release(Cache::Handle* handle) { { MutexLock l(&mutex_); last_reference = Unref(e); + if (last_reference) { + usage_ -= e->charge; + } + 
if (e->refs == 1 && e->in_cache) { + // The item is still in cache, and nobody else holds a reference to it + if (usage_ > capacity_) { + // the cache is full + // The LRU list must be empty since the cache is full + assert(lru_.next == &lru_); + // take this opportunity and remove the item + table_.Remove(e->key(), e->hash); + e->in_cache = false; + Unref(e); + usage_ -= e->charge; + last_reference = true; + } else { + // put the item on the list to be potentially freed + LRU_Append(e); + } + } } + + // free outside of mutex if (last_reference) { - FreeEntry(e); + e->Free(); } } @@ -278,8 +339,11 @@ Cache::Handle* LRUCache::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { - LRUHandle* e = reinterpret_cast( - malloc(sizeof(LRUHandle)-1 + key.size())); + // Allocate the memory here outside of the mutex + // If the cache is full, we'll have to release it + // It shouldn't happen very often though. + LRUHandle* e = + reinterpret_cast(malloc(sizeof(LRUHandle) - 1 + key.size())); autovector last_reference_list; e->value = value; @@ -288,47 +352,40 @@ Cache::Handle* LRUCache::Insert( e->key_length = key.size(); e->hash = hash; e->refs = 2; // One from LRUCache, one for the returned handle + e->next = e->prev = nullptr; + e->in_cache = true; memcpy(e->key_data, key.data(), key.size()); { MutexLock l(&mutex_); - LRU_Append(e); - - LRUHandle* old = table_.Insert(e); - if (old != nullptr) { - LRU_Remove(old); - if (Unref(old)) { - last_reference_list.push_back(old); - } - } - - if (remove_scan_count_limit_ > 0) { - // Try to free the space by evicting the entries that are only - // referenced by the cache first. - LRUHandle* cur = lru_.next; - for (unsigned int scanCount = 0; - usage_ > capacity_ && cur != &lru_ - && scanCount < remove_scan_count_limit_; scanCount++) { - LRUHandle* next = cur->next; - if (cur->refs <= 1) { - LRU_Remove(cur); - table_.Remove(cur->key(), cur->hash); - if (Unref(cur)) { - last_reference_list.push_back(cur); - } - } - cur = next; - } - } - // Free the space following strict LRU policy until enough space - // is freed. 
- while (usage_ > capacity_ && lru_.next != &lru_) { - old = lru_.next; + // is freed or the lru list is empty + while (usage_ + charge > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + assert(old->in_cache); + assert(old->refs == + 1); // LRU list contains elements which may be evicted LRU_Remove(old); table_.Remove(old->key(), old->hash); + old->in_cache = false; + Unref(old); + usage_ -= old->charge; + last_reference_list.push_back(old); + } + + // insert into the cache + // note that the cache might get larger than its capacity if not enough + // space was freed + LRUHandle* old = table_.Insert(e); + usage_ += e->charge; + if (old != nullptr) { + old->in_cache = false; if (Unref(old)) { + usage_ -= old->charge; + // old is on LRU because it's in cache and its reference count + // was just 1 (Unref returned 0) + LRU_Remove(old); last_reference_list.push_back(old); } } @@ -337,7 +394,7 @@ Cache::Handle* LRUCache::Insert( // we free the entries here outside of mutex for // performance reasons for (auto entry : last_reference_list) { - FreeEntry(entry); + entry->Free(); } return reinterpret_cast(e); @@ -350,14 +407,21 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - LRU_Remove(e); last_reference = Unref(e); + if (last_reference) { + usage_ -= e->charge; + } + if (last_reference && e->in_cache) { + LRU_Remove(e); + } + e->in_cache = false; } } + // mutex not held here // last_reference will only be true if e != nullptr if (last_reference) { - FreeEntry(e); + e->Free(); } } diff --git a/util/cache_test.cc b/util/cache_test.cc index 2fed4d867..e40317fd5 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -190,25 +190,30 @@ TEST(CacheTest, EntriesArePinned) { Insert(100, 101); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetUsage()); Insert(100, 102); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(2, cache_->GetUsage()); cache_->Release(h1); ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); ASSERT_EQ(101, deleted_values_[0]); + ASSERT_EQ(1, cache_->GetUsage()); Erase(100); ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1, cache_->GetUsage()); cache_->Release(h2); ASSERT_EQ(2U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[1]); ASSERT_EQ(102, deleted_values_[1]); + ASSERT_EQ(0, cache_->GetUsage()); } TEST(CacheTest, EvictionPolicy) { @@ -273,76 +278,28 @@ TEST(CacheTest, EvictionPolicyRef) { cache_->Release(h204); } -TEST(CacheTest, EvictionPolicyRef2) { - std::vector handles; - - Insert(100, 101); - // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize + 100; i++) { - Insert(1000 + i, 2000 + i); - if (i < kCacheSize ) { - handles.push_back(cache_->Lookup(EncodeKey(1000 + i))); - } - } - - // Make sure referenced keys are also possible to be deleted - // if there are not sufficient non-referenced keys - for (int i = 0; i < 5; i++) { - ASSERT_EQ(-1, Lookup(1000 + i)); - } +TEST(CacheTest, ErasedHandleState) { + // insert a key and get two handles + Insert(100, 1000); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(h1, h2); + ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000); + ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000); - for (int i = kCacheSize; i < 
kCacheSize + 100; i++) { - ASSERT_EQ(2000 + i, Lookup(1000 + i)); - } + // delete the key from the cache + Erase(100); + // can no longer find in the cache ASSERT_EQ(-1, Lookup(100)); - // Cleaning up all the handles - while (handles.size() > 0) { - cache_->Release(handles.back()); - handles.pop_back(); - } -} - -TEST(CacheTest, EvictionPolicyRefLargeScanLimit) { - std::vector handles2; - - // Cache2 has a cache RemoveScanCountLimit higher than cache size - // so it would trigger a boundary condition. - - // Populate the cache with 10 more keys than its size. - // Reference all keys except one close to the end. - for (int i = 0; i < kCacheSize2 + 10; i++) { - Insert2(1000 + i, 2000+i); - if (i != kCacheSize2 ) { - handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i))); - } - } - - // Make sure referenced keys are also possible to be deleted - // if there are not sufficient non-referenced keys - for (int i = 0; i < 3; i++) { - ASSERT_EQ(-1, Lookup2(1000 + i)); - } - // The non-referenced value is deleted even if it's accessed - // recently. - ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2)); - // Other values recently accessed are not deleted since they - // are referenced. - for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) { - if (i != kCacheSize2) { - ASSERT_EQ(2000 + i, Lookup2(1000 + i)); - } - } + // release one handle + cache_->Release(h1); + // still can't find in cache + ASSERT_EQ(-1, Lookup(100)); - // Cleaning up all the handles - while (handles2.size() > 0) { - cache2_->Release(handles2.back()); - handles2.pop_back(); - } + cache_->Release(h2); } - - TEST(CacheTest, HeavyEntries) { // Add a bunch of light and heavy entries and then count the combined // size of items still in the cache, which must be approximately the @@ -392,7 +349,7 @@ void deleter(const Slice& key, void* value) { } } // namespace -TEST(CacheTest, BadEviction) { +TEST(CacheTest, OverCapacity) { int n = 10; // a LRUCache with n entries and one shard only @@ -411,15 +368,32 @@ TEST(CacheTest, BadEviction) { std::string key = ToString(i+1); auto h = cache->Lookup(key); std::cout << key << (h?" found\n":" not found\n"); - // Only the first entry should be missing - ASSERT_TRUE(h || i == 0); + ASSERT_TRUE(h != nullptr); if (h) cache->Release(h); } + // the cache is over capacity since nothing could be evicted + ASSERT_EQ(n + 1, cache->GetUsage()); for (int i = 0; i < n+1; i++) { cache->Release(handles[i]); } - std::cout << "Poor entries\n"; + + // cache is under capacity now since elements were released + ASSERT_EQ(n, cache->GetUsage()); + + // element 0 is evicted and the rest is there + // This is consistent with the LRU policy since the element 0 + // was released first + for (int i = 0; i < n+1; i++) { + std::string key = ToString(i+1); + auto h = cache->Lookup(key); + if (h) { + ASSERT_NE(i, 0); + cache->Release(h); + } else { + ASSERT_EQ(i, 0); + } + } } namespace { From d7a486668cdf0378fbf58eeac2c39f5eae105af9 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 10 Dec 2014 18:39:09 -0800 Subject: [PATCH 614/829] Improve scalability of DB::GetSnapshot() Summary: Now DB::GetSnapshot() doesn't scale to more column families, as it needs to go through all the column families to find whether snapshot is supported. This patch optimizes it. Test Plan: Add unit tests to cover negative cases. 
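From the API side the negative case being covered looks roughly like this (a
sketch only; the cuckoo-backed column family comes from the new
MemtableNotSupportSnapshot test below, everything else is illustrative):

    #include <string>
    #include "rocksdb/db.h"

    void ReadAtSnapshotIfSupported(rocksdb::DB* db) {
      // GetSnapshot() returns nullptr when any column family's memtable
      // (e.g. one created with NewHashCuckooRepFactory) cannot support
      // snapshots, so callers have to check before using the result.
      const rocksdb::Snapshot* snap = db->GetSnapshot();
      if (snap == nullptr) {
        return;  // snapshots not supported for this DB
      }
      rocksdb::ReadOptions read_options;
      read_options.snapshot = snap;
      std::string value;
      db->Get(read_options, "foo", &value);  // sees data as of the snapshot
      db->ReleaseSnapshot(snap);
    }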
make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30093 --- db/column_family_test.cc | 21 ++++++++++++++++++++ db/db_impl.cc | 41 +++++++++++++++++++++++++++------------- db/db_impl.h | 9 ++------- db/db_test.cc | 8 ++++++-- 4 files changed, 57 insertions(+), 22 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 88479323c..209f7b528 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -731,6 +731,27 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) { Close(); } +TEST(ColumnFamilyTest, MemtableNotSupportSnapshot) { + Open(); + auto* s1 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s1 != nullptr); + dbfull()->ReleaseSnapshot(s1); + + // Add a column family that doesn't support snapshot + ColumnFamilyOptions first; + first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024)); + CreateColumnFamilies({"first"}, {first}); + auto* s2 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s2 == nullptr); + + // Add a column family that supports snapshot. Snapshot stays not supported. + ColumnFamilyOptions second; + CreateColumnFamilies({"second"}, {second}); + auto* s3 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s3 == nullptr); + Close(); +} + TEST(ColumnFamilyTest, DifferentMergeOperators) { Open(); CreateColumnFamilies({"first", "second"}); diff --git a/db/db_impl.cc b/db/db_impl.cc index 965154417..b4b423d9d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -202,6 +202,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) default_cf_handle_(nullptr), total_log_size_(0), max_total_in_memory_state_(0), + is_snapshot_supported_(true), write_buffer_(options.db_write_buffer_size), tmp_batch_(), bg_schedule_needed_(false), @@ -1305,8 +1306,8 @@ Status DBImpl::CompactFilesImpl( CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), env_options_, versions_.get(), &shutting_down_, &log_buffer, db_directory_.get(), stats_, - &snapshots_, IsSnapshotSupported(), table_cache_, - std::move(yield_callback)); + &snapshots_, is_snapshot_supported_, + table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); @@ -2090,7 +2091,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), env_options_, versions_.get(), &shutting_down_, log_buffer, db_directory_.get(), stats_, - &snapshots_, IsSnapshotSupported(), + &snapshots_, is_snapshot_supported_, table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); @@ -2494,6 +2495,11 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, assert(cfd != nullptr); delete InstallSuperVersion( cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Created column family [%s] (ID %u)", @@ -2520,6 +2526,8 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { return Status::InvalidArgument("Can't drop default column family"); } + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); + VersionEdit edit; edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); @@ -2539,6 +2547,19 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { &edit, &mutex_); write_thread_.ExitWriteThread(&w, &w, s); } + + 
if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. + bool new_is_snapshot_supported = true; + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->mem()->IsSnapshotSupported()) { + new_is_snapshot_supported = false; + break; + } + } + is_snapshot_supported_ = new_is_snapshot_supported; + } } if (s.ok()) { @@ -2712,22 +2733,13 @@ Status DBImpl::NewIterators( return Status::OK(); } -bool DBImpl::IsSnapshotSupported() const { - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->mem()->IsSnapshotSupported()) { - return false; - } - } - return true; -} - const Snapshot* DBImpl::GetSnapshot() { int64_t unix_time = 0; env_->GetCurrentTime(&unix_time); // Ignore error MutexLock l(&mutex_); // returns null if the underlying memtable does not support snapshot. - if (!IsSnapshotSupported()) return nullptr; + if (!is_snapshot_supported_) return nullptr; return snapshots_.New(versions_->LastSequence(), unix_time); } @@ -3622,6 +3634,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } } + if (!cfd->mem()->IsSnapshotSupported()) { + impl->is_snapshot_supported_ = false; + } if (cfd->ioptions()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( diff --git a/db/db_impl.h b/db/db_impl.h index 6577733b6..5e27df2c6 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -383,13 +383,6 @@ class DBImpl : public DB { // dump rocksdb.stats to LOG void MaybeDumpStats(); - // Return true if the current db supports snapshot. If the current - // DB does not support snapshot, then calling GetSnapshot() will always - // return nullptr. - // - // @see GetSnapshot() - virtual bool IsSnapshotSupported() const; - // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, @@ -441,6 +434,8 @@ class DBImpl : public DB { // some code-paths bool single_column_family_mode_; + bool is_snapshot_supported_; + std::unique_ptr db_directory_; WriteBuffer write_buffer_; diff --git a/db/db_test.cc b/db/db_test.cc index 5d40a7b33..facae2f68 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1622,6 +1622,11 @@ TEST(DBTest, GetSnapshot) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); ASSERT_OK(Put(1, key, "v1")); const Snapshot* s1 = db_->GetSnapshot(); + if (option_config_ == kHashCuckoo) { + // NOt supported case. 
+ ASSERT_TRUE(s1 == nullptr); + break; + } ASSERT_OK(Put(1, key, "v2")); ASSERT_EQ("v2", Get(1, key)); ASSERT_EQ("v1", Get(1, key, s1)); @@ -1630,8 +1635,7 @@ TEST(DBTest, GetSnapshot) { ASSERT_EQ("v1", Get(1, key, s1)); db_->ReleaseSnapshot(s1); } - // skip as HashCuckooRep does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo)); + } while (ChangeOptions()); } TEST(DBTest, GetSnapshotLink) { From 74b3fb6d97c402f8a22685fe3df5a26eca140fa6 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 11 Dec 2014 14:15:13 -0800 Subject: [PATCH 615/829] Fix Mac compile errors on util/cache_test.cc Summary: Fix Mac compile errors on util/cache_test.cc Test Plan: make dbg -j32 ./cache_test --- util/cache_test.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/util/cache_test.cc b/util/cache_test.cc index e40317fd5..ea71124b2 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -190,30 +190,30 @@ TEST(CacheTest, EntriesArePinned) { Insert(100, 101); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); - ASSERT_EQ(1, cache_->GetUsage()); + ASSERT_EQ(1U, cache_->GetUsage()); Insert(100, 102); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); ASSERT_EQ(0U, deleted_keys_.size()); - ASSERT_EQ(2, cache_->GetUsage()); + ASSERT_EQ(2U, cache_->GetUsage()); cache_->Release(h1); ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); ASSERT_EQ(101, deleted_values_[0]); - ASSERT_EQ(1, cache_->GetUsage()); + ASSERT_EQ(1U, cache_->GetUsage()); Erase(100); ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(1U, deleted_keys_.size()); - ASSERT_EQ(1, cache_->GetUsage()); + ASSERT_EQ(1U, cache_->GetUsage()); cache_->Release(h2); ASSERT_EQ(2U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[1]); ASSERT_EQ(102, deleted_values_[1]); - ASSERT_EQ(0, cache_->GetUsage()); + ASSERT_EQ(0U, cache_->GetUsage()); } TEST(CacheTest, EvictionPolicy) { @@ -336,9 +336,9 @@ TEST(CacheTest, NewId) { class Value { private: - int v_; + size_t v_; public: - explicit Value(int v) : v_(v) { } + explicit Value(size_t v) : v_(v) { } ~Value() { std::cout << v_ << " is destructed\n"; } }; @@ -350,7 +350,7 @@ void deleter(const Slice& key, void* value) { } // namespace TEST(CacheTest, OverCapacity) { - int n = 10; + size_t n = 10; // a LRUCache with n entries and one shard only std::shared_ptr cache = NewLRUCache(n, 0); @@ -358,13 +358,13 @@ TEST(CacheTest, OverCapacity) { std::vector handles(n+1); // Insert n+1 entries, but not releasing. - for (int i = 0; i < n+1; i++) { + for (size_t i = 0; i < n + 1; i++) { std::string key = ToString(i+1); handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); } // Guess what's in the cache now? - for (int i = 0; i < n+1; i++) { + for (size_t i = 0; i < n + 1; i++) { std::string key = ToString(i+1); auto h = cache->Lookup(key); std::cout << key << (h?" 
found\n":" not found\n"); @@ -373,8 +373,8 @@ TEST(CacheTest, OverCapacity) { } // the cache is over capacity since nothing could be evicted - ASSERT_EQ(n + 1, cache->GetUsage()); - for (int i = 0; i < n+1; i++) { + ASSERT_EQ(n + 1U, cache->GetUsage()); + for (size_t i = 0; i < n + 1; i++) { cache->Release(handles[i]); } @@ -384,14 +384,14 @@ TEST(CacheTest, OverCapacity) { // element 0 is evicted and the rest is there // This is consistent with the LRU policy since the element 0 // was released first - for (int i = 0; i < n+1; i++) { + for (size_t i = 0; i < n + 1; i++) { std::string key = ToString(i+1); auto h = cache->Lookup(key); if (h) { - ASSERT_NE(i, 0); + ASSERT_NE(i, 0U); cache->Release(h); } else { - ASSERT_EQ(i, 0); + ASSERT_EQ(i, 0U); } } } From 7ab1526c0e32f72ccfd74c0184a2f78ed36fb826 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 11 Dec 2014 15:46:01 -0800 Subject: [PATCH 616/829] Add an assert and avoid std::sort(autovector) to investigate an ASAN issue Summary: ASAN build fails once for this error: 14:04:52 ==== Test DBTest.CompactFilesOnLevelCompaction 14:04:52 db_test: db/version_set.cc:1062: void rocksdb::VersionStorageInfo::AddFile(int, rocksdb::FileMetaData*): Assertion `level <= 0 || level_files->empty() || internal_comparator_->Compare( (*level_files)[level_files->size() - 1]->largest, f->smallest) < 0' failed. Not abling figure out reason. We use std:vector for sorting for save and add one more assert to help figure out whether it is the sorting's problem. Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D30117 --- db/version_builder.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index abb0d4b58..ec7bb176a 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -231,13 +231,26 @@ class VersionBuilder::Rep { base_files.size() + unordered_added_files.size()); // Sort added files for the level. - autovector added_files; + std::vector added_files; + added_files.reserve(unordered_added_files.size()); for (const auto& pair : unordered_added_files) { added_files.push_back(pair.second); } std::sort(added_files.begin(), added_files.end(), cmp); +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + for (const auto& added : added_files) { +#ifndef NDEBUG + if (level > 0 && prev_file != nullptr) { + assert(base_vstorage_->InternalComparator()->Compare( + prev_file->smallest, added->smallest) <= 0); + } + prev_file = added; +#endif + // Add all smaller files listed in base_ for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); base_iter != bpos; ++base_iter) { From cef6f84393f691b618b837d41df62c58542f81fe Mon Sep 17 00:00:00 2001 From: Qiao Yang Date: Mon, 24 Nov 2014 10:04:16 -0800 Subject: [PATCH 617/829] Added 'dump_live_files' command to ldb tool. Summary: Priliminary diff to solicit comments. Given DB path, dump all SST files (key/value and properties), WAL file and manifest files. What command options do we need to support for this command? Maybe output_hex for keys? Test Plan: Create additional ldb unit tests. 
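For reference, the new subcommand is invoked like the other ldb commands; the
call exercised by the ldb_test.py case added below is (database path illustrative):

    ./ldb dump_live_files --db=/path/to/db > /tmp/dump_live_files.out

The dump walks the manifest first, then every live SST file together with its
table properties, and finally the WAL files, in the order implemented in
DBFileDumperCommand::DoCommand() below.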
Reviewers: sdong, rven Reviewed By: rven Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29547 --- tools/ldb_test.py | 16 +++ util/ldb_cmd.cc | 281 ++++++++++++++++++++++++++++++--------- util/ldb_cmd.h | 15 +++ util/ldb_tool.cc | 1 + util/sst_dump_tool.cc | 69 +--------- util/sst_dump_tool_imp.h | 78 +++++++++++ 6 files changed, 326 insertions(+), 134 deletions(-) create mode 100644 util/sst_dump_tool_imp.h diff --git a/tools/ldb_test.py b/tools/ldb_test.py index b4ef5221f..f248f88cd 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -378,6 +378,22 @@ class LDBTestCase(unittest.TestCase): my_check_output("rm -f %s" % sstFilePath, shell=True) self.assertRunFAIL("checkconsistency") + def dumpLiveFiles(self, params, dumpFile): + return 0 == run_err_null("./ldb dump_live_files %s > %s" % ( + params, dumpFile)) + + def testDumpLiveFiles(self): + print "Running testDumpLiveFiles..." + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath)) + self.assertRunOK("delete x1", "OK") + self.assertRunOK("put x3 y3", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath)) if __name__ == "__main__": unittest.main() diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 8a8fa7a2e..5547fc085 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -14,7 +14,9 @@ #include "db/write_batch_internal.h" #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" +#include "rocksdb/table_properties.h" #include "util/coding.h" +#include "util/sst_dump_tool_imp.h" #include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" @@ -165,6 +167,8 @@ LDBCommand* LDBCommand::SelectCommand( return new ManifestDumpCommand(cmdParams, option_map, flags); } else if (cmd == ListColumnFamiliesCommand::Name()) { return new ListColumnFamiliesCommand(cmdParams, option_map, flags); + } else if (cmd == DBFileDumperCommand::Name()) { + return new DBFileDumperCommand(cmdParams, option_map, flags); } else if (cmd == InternalDumpCommand::Name()) { return new InternalDumpCommand(cmdParams, option_map, flags); } else if (cmd == CheckConsistencyCommand::Name()) { @@ -438,6 +442,8 @@ void CompactorCommand::DoCommand() { delete end; } +// ---------------------------------------------------------------------------- + const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; const string DBLoaderCommand::ARG_COMPACT = "compact"; @@ -513,6 +519,31 @@ void DBLoaderCommand::DoCommand() { // ---------------------------------------------------------------------------- +namespace { + +void DumpManifestFile(std::string file, bool verbose, bool hex) { + Options options; + EnvOptions sopt; + std::string dbname("dummy"); + std::shared_ptr tc( + NewLRUCache(options.max_open_files - 10, options.table_cache_numshardbits, + options.table_cache_remove_scan_count_limit)); + // Notice we are using the default options not through SanitizeOptions(), + // if VersionSet::DumpManifest() depends on any option done by + // SanitizeOptions(), we need to initialize it manually. 
+ options.db_paths.emplace_back("dummy", 0); + WriteController wc; + WriteBuffer wb(options.db_write_buffer_size); + VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc); + Status s = versions.DumpManifest(options, file, verbose, hex); + if (!s.ok()) { + printf("Error in processing file %s %s\n", file.c_str(), + s.ToString().c_str()); + } +} + +} // namespace + const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; const string ManifestDumpCommand::ARG_PATH = "path"; @@ -585,25 +616,7 @@ void ManifestDumpCommand::DoCommand() { printf("Processing Manifest file %s\n", manifestfile.c_str()); } - Options options; - EnvOptions sopt; - std::string file(manifestfile); - std::string dbname("dummy"); - std::shared_ptr tc(NewLRUCache( - options.max_open_files - 10, options.table_cache_numshardbits, - options.table_cache_remove_scan_count_limit)); - // Notice we are using the default options not through SanitizeOptions(), - // if VersionSet::DumpManifest() depends on any option done by - // SanitizeOptions(), we need to initialize it manually. - options.db_paths.emplace_back("dummy", 0); - WriteController wc; - WriteBuffer wb(options.db_write_buffer_size); - VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc); - Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); - if (!s.ok()) { - printf("Error in processing file %s %s\n", manifestfile.c_str(), - s.ToString().c_str()); - } + DumpManifestFile(manifestfile, verbose_, is_key_hex_); if (verbose_) { printf("Processing Manifest file %s done\n", manifestfile.c_str()); } @@ -1325,9 +1338,19 @@ void ChangeCompactionStyleCommand::DoCommand() { files_per_level.c_str()); } +// ---------------------------------------------------------------------------- + +namespace { + +struct StdErrReporter : public log::Reader::Reporter { + virtual void Corruption(size_t bytes, const Status& s) { + cerr << "Corruption detected in log file " << s.ToString() << "\n"; + } +}; + class InMemoryHandler : public WriteBatch::Handler { public: - InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + InMemoryHandler(stringstream& row, bool print_values) : Handler(), row_(row) { print_values_ = print_values; } @@ -1357,13 +1380,63 @@ class InMemoryHandler : public WriteBatch::Handler { row_ << LDBCommand::StringToHex(key.ToString()) << " "; } - virtual ~InMemoryHandler() { }; + virtual ~InMemoryHandler() {} private: stringstream & row_; bool print_values_; }; +void DumpWalFile(std::string wal_file, bool print_header, bool print_values, + LDBCommandExecuteResult* exec_state) { + unique_ptr file; + Env* env_ = Env::Default(); + EnvOptions soptions; + Status status = env_->NewSequentialFile(wal_file, &file, soptions); + if (!status.ok()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); + } else { + cerr << "Error: Failed to open WAL file " << status.ToString() + << std::endl; + } + } else { + StdErrReporter reporter; + log::Reader reader(move(file), &reporter, true, 0); + string scratch; + WriteBatch batch; + Slice record; + stringstream row; + if (print_header) { + cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values) { + cout << " : value "; + } + cout << "\n"; + } + while (reader.ReadRecord(&record, &scratch)) { + row.str(""); + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + } else { + WriteBatchInternal::SetContents(&batch, record); + row << 
WriteBatchInternal::Sequence(&batch) << ","; + row << WriteBatchInternal::Count(&batch) << ","; + row << WriteBatchInternal::ByteSize(&batch) << ","; + row << reader.LastRecordOffset() << ","; + InMemoryHandler handler(row, print_values); + batch.Iterate(&handler); + row << "\n"; + } + cout << row.str(); + } + } +} + +} // namespace + const string WALDumperCommand::ARG_WAL_FILE = "walfile"; const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; const string WALDumperCommand::ARG_PRINT_HEADER = "header"; @@ -1401,53 +1474,10 @@ void WALDumperCommand::Help(string& ret) { } void WALDumperCommand::DoCommand() { - struct StdErrReporter : public log::Reader::Reporter { - virtual void Corruption(size_t bytes, const Status& s) { - cerr<<"Corruption detected in log file "< file; - Env* env_ = Env::Default(); - EnvOptions soptions; - Status status = env_->NewSequentialFile(wal_file_, &file, soptions); - if (!status.ok()) { - exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + - status.ToString()); - } else { - StdErrReporter reporter; - log::Reader reader(move(file), &reporter, true, 0); - string scratch; - WriteBatch batch; - Slice record; - stringstream row; - if (print_header_) { - cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; - if (print_values_) { - cout << " : value "; - } - cout << "\n"; - } - while(reader.ReadRecord(&record, &scratch)) { - row.str(""); - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - } else { - WriteBatchInternal::SetContents(&batch, record); - row<& params, const map& options, const vector& flags) : @@ -1486,6 +1516,7 @@ void GetCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- ApproxSizeCommand::ApproxSizeCommand(const vector& params, const map& options, const vector& flags) : @@ -1537,6 +1568,7 @@ void ApproxSizeCommand::DoCommand() { */ } +// ---------------------------------------------------------------------------- BatchPutCommand::BatchPutCommand(const vector& params, const map& options, const vector& flags) : @@ -1590,6 +1622,7 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { return opt; } +// ---------------------------------------------------------------------------- ScanCommand::ScanCommand(const vector& params, const map& options, const vector& flags) : @@ -1701,6 +1734,7 @@ void ScanCommand::DoCommand() { delete it; } +// ---------------------------------------------------------------------------- DeleteCommand::DeleteCommand(const vector& params, const map& options, const vector& flags) : @@ -1780,6 +1814,7 @@ Options PutCommand::PrepareOptionsForOpenDB() { return opt; } +// ---------------------------------------------------------------------------- const char* DBQuerierCommand::HELP_CMD = "help"; const char* DBQuerierCommand::GET_CMD = "get"; @@ -1861,6 +1896,8 @@ void DBQuerierCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- + CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, const map& options, const vector& flags) : LDBCommand(options, flags, false, @@ -1889,5 +1926,117 @@ void CheckConsistencyCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- + +namespace { + +void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { + std::string from_key; + std::string to_key; + if (filename.length() <= 4 || + filename.rfind(".sst") != 
filename.length() - 4) { + std::cout << "Invalid sst file name." << std::endl; + return; + } + // no verification + rocksdb::SstFileReader reader(filename, false, output_hex); + Status st = reader.ReadSequential(true, -1, false, // has_from + from_key, false, // has_to + to_key); + if (!st.ok()) { + std::cerr << "Error in reading SST file " << filename << st.ToString() + << std::endl; + return; + } + + if (show_properties) { + const rocksdb::TableProperties* table_properties; + + std::shared_ptr + table_properties_from_reader; + st = reader.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + std::cerr << filename << ": " << st.ToString() + << ". Try to use initial table properties" << std::endl; + table_properties = reader.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if (table_properties != nullptr) { + std::cout << std::endl << "Table Properties:" << std::endl; + std::cout << table_properties->ToString("\n") << std::endl; + std::cout << "# deleted keys: " + << rocksdb::GetDeletedKeys( + table_properties->user_collected_properties) + << std::endl; + } + } +} + +} // namespace + +DBFileDumperCommand::DBFileDumperCommand(const vector& params, + const map& options, + const vector& flags) + : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {} + +void DBFileDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBFileDumperCommand::Name()); + ret.append("\n"); +} + +void DBFileDumperCommand::DoCommand() { + if (!db_) { + return; + } + Status s; + + std::cout << "Manifest File" << std::endl; + std::cout << "==============================" << std::endl; + std::string manifest_filename; + s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()), + &manifest_filename); + if (!s.ok() || manifest_filename.empty() || + manifest_filename.back() != '\n') { + std::cerr << "Error when reading CURRENT file " + << CurrentFileName(db_->GetName()) << std::endl; + } + // remove the trailing '\n' + manifest_filename.resize(manifest_filename.size() - 1); + string manifest_filepath = db_->GetName() + "/" + manifest_filename; + std::cout << manifest_filepath << std::endl; + DumpManifestFile(manifest_filepath, false, false); + std::cout << std::endl; + + std::cout << "SST Files" << std::endl; + std::cout << "==============================" << std::endl; + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + for (auto& fileMetadata : metadata) { + std::string filename = fileMetadata.db_path + fileMetadata.name; + std::cout << filename << " level:" << fileMetadata.level << std::endl; + std::cout << "------------------------------" << std::endl; + DumpSstFile(filename, false, true); + std::cout << std::endl; + } + std::cout << std::endl; + + std::cout << "Write Ahead Log Files" << std::endl; + std::cout << "==============================" << std::endl; + rocksdb::VectorLogPtr wal_files; + s = db_->GetSortedWalFiles(wal_files); + if (!s.ok()) { + std::cerr << "Error when getting WAL files" << std::endl; + } else { + for (auto& wal : wal_files) { + // TODO(qyang): option.wal_dir should be passed into ldb command + std::string filename = db_->GetOptions().wal_dir + wal->PathName(); + std::cout << filename << std::endl; + DumpWalFile(filename, true, true, &exec_state_); + } + } +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index fd4d4d4b9..e75433e76 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -13,6 +13,8 @@ #include #include #include +#include 
+#include #include "db/version_set.h" #include "rocksdb/env.h" @@ -392,6 +394,19 @@ private: string to_; }; +class DBFileDumperCommand : public LDBCommand { + public: + static string Name() { return "dump_live_files"; } + + DBFileDumperCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); +}; + class DBDumperCommand: public LDBCommand { public: static string Name() { return "dump"; } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index fe2d7d538..fe84fa933 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -80,6 +80,7 @@ public: DBLoaderCommand::Help(ret); ManifestDumpCommand::Help(ret); ListColumnFamiliesCommand::Help(ret); + DBFileDumperCommand::Help(ret); InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index 8d2233de8..d0bef3e36 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -5,83 +5,16 @@ // #ifndef ROCKSDB_LITE -#include "rocksdb/sst_dump_tool.h" +#include "util/sst_dump_tool_imp.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif -#include -#include -#include #include -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/immutable_options.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "util/ldb_cmd.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - namespace rocksdb { -class SstFileReader { - public: - explicit SstFileReader(const std::string& file_name, - bool verify_checksum, - bool output_hex); - - Status ReadSequential(bool print_kv, - uint64_t read_num, - bool has_from, - const std::string& from_key, - bool has_to, - const std::string& to_key); - - Status ReadTableProperties( - std::shared_ptr* table_properties); - uint64_t GetReadNumber() { return read_num_; } - TableProperties* GetInitTableProperties() { return table_properties_.get(); } - - private: - Status NewTableReader(const std::string& file_path); - Status ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, uint64_t file_size); - Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); - Status SetOldTableOptions(); - - std::string file_name_; - uint64_t read_num_; - bool verify_checksum_; - bool output_hex_; - EnvOptions soptions_; - - Status init_result_; - unique_ptr table_reader_; - unique_ptr file_; - // options_ and internal_comparator_ will also be used in - // ReadSequential internally (specifically, seek-related operations) - Options options_; - const ImmutableCFOptions ioptions_; - InternalKeyComparator internal_comparator_; - unique_ptr table_properties_; -}; - SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) diff --git a/util/sst_dump_tool_imp.h b/util/sst_dump_tool_imp.h new file mode 100644 index 000000000..833f62a42 --- /dev/null +++ b/util/sst_dump_tool_imp.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE +#pragma once + +#include "rocksdb/sst_dump_tool.h" + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + const std::string& from_key, bool has_to, + const std::string& to_key); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + private: + Status NewTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFile* file, uint64_t file_size); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + const ImmutableCFOptions ioptions_; + InternalKeyComparator internal_comparator_; + unique_ptr table_properties_; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE From 06eed650a0e4aad03b71f229e596879471daae06 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 15 Dec 2014 11:29:41 +0100 Subject: [PATCH 618/829] Optimize default compile to compilation platform by default Summary: This diff changes compile to optimize for native platform by default. This will automatically turn on crc32 optimizations for modern processors, which greatly improves rocksdb's performance. I also did some more changes to compilation documentation. 
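As a side note on verifying the effect of this change (illustrative only, not part of the diff): a throwaway program like the one below, compiled with the same COMMON_FLAGS the build produces (e.g. with and without -march=native / PORTABLE=1), shows whether SSE4.2 code generation actually reached the compiler. `__SSE4_2__` is a GCC/Clang predefined macro, not a RocksDB symbol.

    // scratch check: compile with the flags under test and run it once
    #include <cstdio>

    int main() {
    #if defined(__SSE4_2__)
      std::printf("SSE4.2 code generation is on; hardware CRC32C is available\n");
    #else
      std::printf("SSE4.2 is off; CRC32C falls back to the software path\n");
    #endif
      return 0;
    }
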
Test Plan: compile with `make`, observe -march=native compile with `PORTABLE=1 make`, observe no -march=native Reviewers: sdong, rven, yhchiang, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30225 --- HISTORY.md | 1 + INSTALL.md | 16 +++++----------- build_tools/build_detect_platform | 8 ++++++-- build_tools/fbcode_config.sh | 2 +- build_tools/mac-install-gflags.sh | 25 ------------------------- 5 files changed, 13 insertions(+), 39 deletions(-) delete mode 100755 build_tools/mac-install-gflags.sh diff --git a/HISTORY.md b/HISTORY.md index bdb3325fe..e1b9f15a1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ### Unreleased Features * Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted +* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command. ### 3.9.0 (12/8/2014) diff --git a/INSTALL.md b/INSTALL.md index 21e8d26f0..330f8bcbd 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -2,7 +2,7 @@ RocksDB's library should be able to compile without any dependency installed, although we recommend installing some compression libraries (see below). -We do depend on newer gcc with C++11 support. +We do depend on newer gcc/clang with C++11 support. There are few options when compiling RocksDB: @@ -15,9 +15,9 @@ There are few options when compiling RocksDB: * `make all` will compile our static library, and all our tools and unit tests. Our tools depend on gflags. You will need to have gflags installed to run `make all`. -* if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 " to make sure -SSE4.2 is used to speed up CRC32 when calculating data checksum. - +* By default the binary we produce is optimized for the platform you're compiling on +(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before +your make commands, like this: `PORTABLE=1 make static_lib` ## Dependencies @@ -76,13 +76,7 @@ SSE4.2 is used to speed up CRC32 when calculating data checksum. * Install via [homebrew](http://brew.sh/). * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). - * Install zlib, bzip2 and snappy libraries for compression. - * Install gflags. We have included a script - `build_tools/mac-install-gflags.sh`, which should automatically install it (execute this file instead of runing using "source" command). - If you installed gflags by other means (for example, `brew install gflags`), - please set `LIBRARY_PATH` and `CPATH` accordingly. - * Please note that some of the optimizations/features are disabled in OSX. - We did not run any production workloads on it. + * run `brew install rocksdb` * **iOS**: * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. 
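As background on the two macros mentioned above (a sketch only; `PrintToolingStatus` is a made-up name, not a RocksDB API): `ROCKSDB_LITE` is consumed by preprocessor guards inside the library itself, so heavier components such as the ldb/sst_dump tooling compile to nothing in a lite/iOS build.

    #include <cstdio>

    #ifndef ROCKSDB_LITE
    void PrintToolingStatus() {
      // full build: maintenance tooling such as ldb / sst_dump is compiled in
      std::printf("tooling available\n");
    }
    #else
    void PrintToolingStatus() {
      // ROCKSDB_LITE / iOS cross-compile: the heavy code paths are compiled out
    }
    #endif  // ROCKSDB_LITE

    int main() {
      PrintToolingStatus();
      return 0;
    }
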
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 7abccc8cc..3b3f34037 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -308,8 +308,12 @@ if test "$USE_HDFS"; then JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS" fi -# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 " -COMMON_FLAGS="$COMMON_FLAGS $USE_SSE" +if test "$USE_SSE"; then + # if Intel SSE instruction set is supported, set USE_SSE=1 + COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 " +elif test -z "$PORTABLE"; then + COMMON_FLAGS="$COMMON_FLAGS -march=native -mtune=native " +fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index fefd48d59..7c1ff5147 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -48,7 +48,7 @@ LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" # use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " +export USE_SSE=1 BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" AR="$BINUTILS/ar" diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh deleted file mode 100755 index a245a26a8..000000000 --- a/build_tools/mac-install-gflags.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh -# Install gflags for mac developers. - -set -e - -DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX` - -cd $DIR -wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz -tar xvfz gflags-2.0.tar.gz -cd gflags-2.0 - -./configure -make -make install - -# Add include/lib path for g++ -echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile -echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile - -echo "" -echo "-----------------------------------------------------------------------------" -echo "| Installation Completed |" -echo "-----------------------------------------------------------------------------" -echo "Please run \`. ~/.bash_profile\` to be able to compile with gflags" From 153f4f0719bf904c9d38bf32dbf9069f15760f9d Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Mon, 15 Dec 2014 21:48:16 -0800 Subject: [PATCH 619/829] RocksDB: Allow Level-Style Compaction to Place Files in Different Paths Summary: Allow Level-style compaction to place files in different paths This diff provides the code for task 4854591. We now support level-compaction to place files in different paths by specifying them in db_paths along with the minimum level for files to store in that path. Test Plan: ManualLevelCompactionOutputPathId in db_test.cc Reviewers: yhchiang, MarkCallaghan, dhruba, yoshinorim, sdong Reviewed By: sdong Subscribers: yoshinorim, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29799 --- HISTORY.md | 8 +- db/compaction_picker.cc | 42 ++++- db/compaction_picker.h | 5 + db/compaction_picker_test.cc | 3 + db/db_impl.cc | 36 ++++- db/db_test.cc | 302 +++++++++++++++++++++++++++++++++++ db/flush_job.cc | 7 + 7 files changed, 394 insertions(+), 9 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e1b9f15a1..121b936af 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,9 +14,13 @@ ### Public API changes * New API to create a checkpoint added. 
Given a directory name, creates a new database which is an image of the existing database. -*New API LinkFile added to Env. If you implement your own Env class, an - implementation of the API LinkFile will have to be provided. +* New API LinkFile added to Env. If you implement your own Env class, an + implementation of the API LinkFile will have to be provided. * MemTableRep takes MemTableAllocator instead of Arena +* We now allow level-compaction to place files in different paths by + specifying them in db_paths along with the target_size. + Lower numbered levels will be placed earlier in the db_paths and higher + numbered levels will be placed later in the db_paths vector. ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 213daefc1..82653ff70 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -769,6 +769,44 @@ Compaction* LevelCompactionPicker::PickCompaction( return c; } +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionPicker::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.db_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.db_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.db_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? 
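+        // Example (based on the LevelCompactionThirdPath test added below):
+        // with db_paths target sizes {500KB, 4MB, 1GB} and
+        // max_bytes_for_level_base = 400KB, an L1 output is placed in the
+        // second path and L2 and beyond fall through to the third (last)
+        // path, while flushed L0 files stay in the first path.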
+ return p; + } else { + current_path_size -= level_size; + level_size *= mutable_cf_options.max_bytes_for_level_multiplier; + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.db_paths[p].target_size; + } + return p; +} + Compaction* LevelCompactionPicker::PickCompactionBySize( const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int level, double score) { @@ -786,7 +824,8 @@ Compaction* LevelCompactionPicker::PickCompactionBySize( assert(level + 1 < NumberLevels()); c = new Compaction(vstorage->num_levels(), level, level + 1, mutable_cf_options.MaxFileSizeForLevel(level + 1), - mutable_cf_options.MaxGrandParentOverlapBytes(level), 0, + mutable_cf_options.MaxGrandParentOverlapBytes(level), + GetPathId(ioptions_, mutable_cf_options, level + 1), GetCompressionType(ioptions_, level + 1)); c->score_ = score; @@ -960,6 +999,7 @@ uint32_t UniversalCompactionPicker::GetPathId( uint64_t future_size = file_size * (100 - ioptions.compaction_options_universal.size_ratio) / 100; uint32_t p = 0; + assert(!ioptions.db_paths.empty()); for (; p < ioptions.db_paths.size() - 1; p++) { uint64_t target_size = ioptions.db_paths[p].target_size; if (target_size > file_size && diff --git a/db/compaction_picker.h b/db/compaction_picker.h index cfed5109d..ad72e609a 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -188,6 +188,11 @@ class LevelCompactionPicker : public CompactionPicker { virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + private: // For the specfied level, pick a compaction. // Returns nullptr if there is no compaction to be done. diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 419d239c8..5ffc74f0d 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -4,6 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#include "db/compaction_picker.h" +#include #include #include "util/logging.h" #include "util/testharness.h" @@ -47,6 +48,8 @@ class CompactionPickerTest { fifo_options_.max_table_files_size = 1; mutable_cf_options_.RefreshDerivedOptions(ioptions_); size_being_compacted_.resize(options_.num_levels); + ioptions_.db_paths.emplace_back("dummy", + std::numeric_limits::max()); } ~CompactionPickerTest() { diff --git a/db/db_impl.cc b/db/db_impl.cc index b4b423d9d..2d9b32cc8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -67,6 +67,7 @@ #include "util/build_version.h" #include "util/coding.h" #include "util/db_info_dumper.h" +#include "util/file_util.h" #include "util/hash_skiplist_rep.h" #include "util/hash_linklist_rep.h" #include "util/logging.h" @@ -2059,10 +2060,31 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); + FileMetaData ftemp; + uint64_t fdnum = f->fd.GetNumber(); + uint32_t fdpath = f->fd.GetPathId(); c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); - c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), - f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno); + // Need to move file if file is to be stored in a new path + if (c->GetOutputPathId() != f->fd.GetPathId()) { + fdnum = versions_->NewFileNumber(); + std::string source = TableFileName(db_options_.db_paths, + f->fd.GetNumber(), f->fd.GetPathId()); + std::string destination = + TableFileName(db_options_.db_paths, fdnum, c->GetOutputPathId()); + Status s = CopyFile(env_, source, destination, 0); + if (s.ok()) { + fdpath = c->GetOutputPathId(); + } else { + fdnum = f->fd.GetNumber(); + if (!s.IsShutdownInProgress()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Compaction error: %s", s.ToString().c_str()); + } + } + } + c->edit()->AddFile(c->level() + 1, fdnum, fdpath, f->fd.GetFileSize(), + f->smallest, f->largest, f->smallest_seqno, + f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, db_directory_.get()); @@ -3519,18 +3541,20 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (!s.ok()) { return s; } + if (db_options.db_paths.size() > 1) { for (auto& cfd : column_families) { - if (cfd.options.compaction_style != kCompactionStyleUniversal) { + if ((cfd.options.compaction_style != kCompactionStyleUniversal) && + (cfd.options.compaction_style != kCompactionStyleLevel)) { return Status::NotSupported( "More than one DB paths are only supported in " - "universal compaction style. "); + "universal and level compaction styles. "); } } if (db_options.db_paths.size() > 4) { return Status::NotSupported( - "More than four DB paths are not supported yet. "); + "More than four DB paths are not supported yet. 
"); } } diff --git a/db/db_test.cc b/db/db_test.cc index facae2f68..6c995e7a0 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1004,6 +1004,12 @@ class DBTest { return size; } + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit, false, -1, + target_path_id)); + } + void Compact(int cf, const Slice& start, const Slice& limit) { ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit)); } @@ -4087,6 +4093,233 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { Destroy(options); } +TEST(DBTest, LevelCompactionThirdPath) { + Options options; + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + // options = CurrentOptions(options); + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to fill up first path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + 
ASSERT_EQ("1,4,7", FilesPerLevel(0)); + ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Destroy(options); +} + +TEST(DBTest, LevelCompactionPathUse) { + Options options; + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + // options = CurrentOptions(options); + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // Always gets compacted into 1 Level1 file, + // 0/1 Level 0 file + for (int num = 0; num < 3; num++) { + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, 
GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Destroy(options); +} + TEST(DBTest, UniversalCompactionFourPaths) { Options options; options.db_paths.emplace_back(dbname_, 300 * 1024); @@ -5953,6 +6186,75 @@ TEST(DBTest, ManualCompactionOutputPathId) { .IsInvalidArgument()); } +TEST(DBTest, ManualLevelCompactionOutputPathId) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p1", "p9", 1); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("3,1", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f", 1); + ASSERT_EQ("0,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + db_->CompactRange(handles_[1], nullptr, nullptr, false, 1, 1); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + if (iter == 0) { + DestroyAndReopen(options); + options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + options.num_levels = 3; + options.create_if_missing = true; 
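+      // The second pass (iter == 1) repeats the same manual compactions
+      // with only three levels, per the "iter - 1 with 3 levels" note above.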
+ CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + TEST(DBTest, DBOpen_Options) { Options options = CurrentOptions(); std::string dbname = test::TmpDir(env_) + "/db_options_test"; diff --git a/db/flush_job.cc b/db/flush_job.cc index 10bd6f96b..ccc0245a3 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -40,6 +40,7 @@ #include "table/table_builder.h" #include "table/two_level_iterator.h" #include "util/coding.h" +#include "util/file_util.h" #include "util/logging.h" #include "util/log_buffer.h" #include "util/mutexlock.h" @@ -194,6 +195,12 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { level = base->storage_info()->PickLevelForMemTableOutput( mutable_cf_options_, min_user_key, max_user_key); + // If level does not match path id, reset level back to 0 + uint32_t fdpath = LevelCompactionPicker::GetPathId( + *cfd_->ioptions(), mutable_cf_options_, level); + if (fdpath != 0) { + level = 0; + } } edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, From 48adce77cc251031ca470212a08570477cd5f00b Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 17 Nov 2014 23:29:52 +0100 Subject: [PATCH 620/829] [RocksJava] CompactRange support - manual range compaction support in RocksJava --- java/org/rocksdb/RocksDB.java | 291 +++++++++++++++++++++++++ java/org/rocksdb/test/RocksDBTest.java | 244 +++++++++++++++++++++ java/rocksjni/rocksjni.cc | 114 ++++++++++ 3 files changed, 649 insertions(+) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 3d420adea..021ed80b0 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1251,6 +1251,287 @@ public class RocksDB extends RocksObject { columnFamilyHandle.nativeHandle_); } + /** + *
+   * <p>Full compaction of the underlying storage using key
+   * range mode.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * </p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange() throws RocksDBException {
+    compactRange0(nativeHandle_, false, -1, 0);
+  }
+
+  /**
+   * <p>Compaction of the underlying storage using key
+   * using key range {@code [begin, end]}.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * </p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(byte[] begin, byte[] end)
+      throws RocksDBException {
+    compactRange0(nativeHandle_, begin, begin.length, end,
+        end.length, false, -1, 0);
+  }
+
+  /**
+   * <p>Full compaction of the underlying storage using key
+   * range mode.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * In this case, client could set reduce_level to true, to move
+   * the files back to the minimum level capable of holding the data
+   * set or a given level (specified by non-negative target_level).
+   * </p>
+   * <p>Compaction outputs should be placed in options.db_paths
+   * [target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(boolean reduce_level, int target_level,
+      int target_path_id) throws RocksDBException {
+    compactRange0(nativeHandle_, reduce_level,
+        target_level, target_path_id);
+  }
+
+
+  /**
+   * <p>Compaction of the underlying storage using key
+   * using key range {@code [begin, end]}.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * In this case, client could set reduce_level to true, to move
+   * the files back to the minimum level capable of holding the data
+   * set or a given level (specified by non-negative target_level).
+   * </p>
+   * <p>Compaction outputs should be placed in options.db_paths
+   * [target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * </ul>
+   *
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(byte[] begin, byte[] end,
+      boolean reduce_level, int target_level, int target_path_id)
+      throws RocksDBException {
+    compactRange0(nativeHandle_, begin, begin.length, end, end.length,
+        reduce_level, target_level, target_path_id);
+  }
+
+  /**
+   * <p>Full compaction of the underlying storage of a column family
+   * using key range mode.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    compactRange(nativeHandle_, false, -1, 0,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Compaction of the underlying storage of a column family
+   * using key range {@code [begin, end]}.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(ColumnFamilyHandle columnFamilyHandle,
+      byte[] begin, byte[] end) throws RocksDBException {
+    compactRange(nativeHandle_, begin, begin.length, end, end.length,
+        false, -1, 0, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Full compaction of the underlying storage of a column family
+   * using key range mode.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * In this case, client could set reduce_level to true, to move
+   * the files back to the minimum level capable of holding the data
+   * set or a given level (specified by non-negative target_level).
+   * </p>
+   * <p>Compaction outputs should be placed in options.db_paths
+   * [target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(ColumnFamilyHandle columnFamilyHandle,
+      boolean reduce_level, int target_level, int target_path_id)
+      throws RocksDBException {
+    compactRange(nativeHandle_, reduce_level, target_level,
+        target_path_id, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Compaction of the underlying storage of a column family
+   * using key range {@code [begin, end]}.</p>
+   * <p>Note: After the entire database is compacted,
+   * all data are pushed down to the last level containing any data.
+   * If the total data size after compaction is reduced, that level
+   * might not be appropriate for hosting all the files.
+   * In this case, client could set reduce_level to true, to move
+   * the files back to the minimum level capable of holding the data
+   * set or a given level (specified by non-negative target_level).
+   * </p>
+   * <p>Compaction outputs should be placed in options.db_paths
+   * [target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * </ul>
            + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param reduce_level reduce level after compaction + * @param target_level target level to compact to + * @param target_path_id the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(ColumnFamilyHandle columnFamilyHandle, + byte[] begin, byte[] end, boolean reduce_level, int target_level, + int target_path_id) throws RocksDBException { + compactRange(nativeHandle_, begin, begin.length, end, end.length, + reduce_level, target_level, target_path_id, + columnFamilyHandle.nativeHandle_); + } + /** * Private constructor. */ @@ -1376,6 +1657,16 @@ public class RocksDB extends RocksObject { throws RocksDBException; private native void flush(long handle, long flushOptHandle, long cfHandle) throws RocksDBException; + private native void compactRange0(long handle, boolean reduce_level, int target_level, + int target_path_id) throws RocksDBException; + private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, + int endLen, boolean reduce_level, int target_level, int target_path_id) + throws RocksDBException; + private native void compactRange(long handle, boolean reduce_level, int target_level, + int target_path_id, long cfHandle) throws RocksDBException; + private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, + int endLen, boolean reduce_level, int target_level, int target_path_id, + long cfHandle) throws RocksDBException; protected DBOptionsInterface options_; } diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java index 5a8613aa1..c5e96c6aa 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -13,6 +13,7 @@ import org.rocksdb.*; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Random; import static org.assertj.core.api.Assertions.assertThat; @@ -25,6 +26,9 @@ public class RocksDBTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + @Test public void open() throws RocksDBException { RocksDB db = null; @@ -312,4 +316,244 @@ public class RocksDBTest { } } } + + @Test + public void fullCompactRange() throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange(); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void fullCompactRangeColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf", + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1)); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeWithKeys() { + + } + + @Test + public void compactRangeWithKeysColumnFamily() { + + } + + @Test + public void compactRangeToLevel() throws RocksDBException, InterruptedException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.flush(new FlushOptions().setWaitForFlush(true)); + db.close(); + opt.setTargetFileSizeBase(Long.MAX_VALUE). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(Long.MAX_VALUE). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(true); + + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + + db.compactRange(true, 0, 0); + for (int i = 0; i < 4; i++) { + if (i == 0) { + assertThat(db.getProperty("rocksdb.num-files-at-level" + i)). + isEqualTo("1"); + } else { + assertThat(db.getProperty("rocksdb.num-files-at-level" + i)). 
+ isEqualTo("0"); + } + } + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeToLevelColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf", + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.flush(new FlushOptions().setWaitForFlush(true), + columnFamilyHandles.get(1)); + // free column families + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + // clear column family handles for reopen + columnFamilyHandles.clear(); + db.close(); + columnFamilyDescriptors.get(1). + columnFamilyOptions(). + setTargetFileSizeBase(Long.MAX_VALUE). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(Long.MAX_VALUE). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(true); + // reopen database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // compact new column family + db.compactRange(columnFamilyHandles.get(1), true, 0, 0); + // check if new column family is compacted to level zero + for (int i = 0; i < 4; i++) { + if (i == 0) { + assertThat(db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level" + i)). + isEqualTo("1"); + } else { + assertThat(db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level" + i)). 
+ isEqualTo("0"); + } + } + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 5af3c6b68..efcaf95ae 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1379,3 +1379,117 @@ void Java_org_rocksdb_RocksDB_flush__JJJ( auto cf_handle = reinterpret_cast(jcf_handle); rocksdb_flush_helper(env, db, *flush_options, cf_handle); } + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::CompactRange - Full + +void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->CompactRange(cf_handle, nullptr, nullptr, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } else { + // backwards compatibility + s = db->CompactRange(nullptr, nullptr, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange0 + * Signature: (JZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange0__JZII(JNIEnv* env, + jobject jdb, jlong jdb_handle, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + auto db = reinterpret_cast(jdb_handle); + rocksdb_compactrange_helper(env, db, nullptr, jreduce_level, + jtarget_level, jtarget_path_id); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange + * Signature: (JZIIJ)V + */ +void Java_org_rocksdb_RocksDB_compactRange__JZIIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_compactrange_helper(env, db, cf_handle, jreduce_level, + jtarget_level, jtarget_path_id); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::CompactRange - Range + +void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id) { + + jbyte* begin = env->GetByteArrayElements(jbegin, 0); + jbyte* end = env->GetByteArrayElements(jend, 0); + const rocksdb::Slice begin_slice(reinterpret_cast(begin), jbegin_len); + const rocksdb::Slice end_slice(reinterpret_cast(end), jend_len); + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->CompactRange(cf_handle, &begin_slice, &end_slice, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } else { + // backwards compatibility + s = db->CompactRange(&begin_slice, &end_slice, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } + + env->ReleaseByteArrayElements(jbegin, begin, JNI_ABORT); + env->ReleaseByteArrayElements(jend, end, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange0 + * Signature: (J[BI[BIZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange0__J_3BI_3BIZII(JNIEnv* env, + jobject jdb, jlong jdb_handle, jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + auto db = 
reinterpret_cast(jdb_handle); + rocksdb_compactrange_helper(env, db, nullptr, jbegin, jbegin_len, + jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange + * Signature: (JJ[BI[BIZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin, + jint jbegin_len, jbyteArray jend, jint jend_len, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, + jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); +} + From 69188ff449826505bff95d3f4f6dd0e89cfdc1a7 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 20 Nov 2014 23:55:15 +0100 Subject: [PATCH 621/829] [RocksJava] CompactRange support Summary: Manual range compaction support in RocksJava. Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom package Reviewers: adamretter, yhchiang, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29283 --- java/org/rocksdb/test/RocksDBTest.java | 200 +++++++++++++++++++++++-- java/rocksjni/rocksjni.cc | 1 - 2 files changed, 191 insertions(+), 10 deletions(-) diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java index c5e96c6aa..df0c04787 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -355,7 +355,8 @@ public class RocksDBTest { } @Test - public void fullCompactRangeColumnFamily() throws RocksDBException { + public void fullCompactRangeColumnFamily() + throws RocksDBException { RocksDB db = null; DBOptions opt = null; List columnFamilyHandles = @@ -374,7 +375,7 @@ public class RocksDBTest { setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). setNumLevels(4). - setWriteBufferSize(100<<10). + setWriteBufferSize(100 << 10). setLevelZeroFileNumCompactionTrigger(3). setTargetFileSizeBase(200 << 10). setTargetFileSizeMultiplier(1). @@ -408,17 +409,195 @@ public class RocksDBTest { } @Test - public void compactRangeWithKeys() { + public void compactRangeWithKeys() + throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange("0".getBytes(), "201".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + @Test + public void compactRangeWithKeysReduce() + throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). 
+ setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange("0".getBytes(), "201".getBytes(), + true, 0, 0); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } @Test - public void compactRangeWithKeysColumnFamily() { + public void compactRangeWithKeysColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf", + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1), + "0".getBytes(), "201".getBytes()); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + @Test + public void compactRangeWithKeysReduceColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf", + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1), "0".getBytes(), + "201".getBytes(), true, 0, 0); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } } @Test - public void compactRangeToLevel() throws RocksDBException, InterruptedException { + public void compactRangeToLevel() + throws RocksDBException, InterruptedException { RocksDB db = null; Options opt = null; try { @@ -456,10 +635,12 @@ public class RocksDBTest { db.compactRange(true, 0, 0); for (int i = 0; i < 4; i++) { if (i == 0) { - assertThat(db.getProperty("rocksdb.num-files-at-level" + i)). + assertThat( + db.getProperty("rocksdb.num-files-at-level" + i)). isEqualTo("1"); } else { - assertThat(db.getProperty("rocksdb.num-files-at-level" + i)). + assertThat( + db.getProperty("rocksdb.num-files-at-level" + i)). isEqualTo("0"); } } @@ -474,7 +655,8 @@ public class RocksDBTest { } @Test - public void compactRangeToLevelColumnFamily() throws RocksDBException { + public void compactRangeToLevelColumnFamily() + throws RocksDBException { RocksDB db = null; DBOptions opt = null; List columnFamilyHandles = @@ -493,7 +675,7 @@ public class RocksDBTest { setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). setNumLevels(4). - setWriteBufferSize(100<<10). + setWriteBufferSize(100 << 10). setLevelZeroFileNumCompactionTrigger(3). setTargetFileSizeBase(200 << 10). setTargetFileSizeMultiplier(1). diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index efcaf95ae..57a20e487 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1492,4 +1492,3 @@ void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); } - From efc94ceb27d5e12b25734679730302ff6cfc55be Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 7 Dec 2014 22:19:46 +0100 Subject: [PATCH 622/829] [RocksJava] Incorporated changes for D29283 --- java/org/rocksdb/RocksDB.java | 102 ++++++++++++---------------------- 1 file changed, 36 insertions(+), 66 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 021ed80b0..04a93eacd 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1252,13 +1252,10 @@ public class RocksDB extends RocksObject { } /** - *

Full compaction of the underlying storage using key
-   * range mode.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   *
+   * Range compaction of database.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
    *
    * See also
    *
@@ -1275,13 +1272,10 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Compaction of the underlying storage using key
-   * using key range {@code [begin, end]}.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   *
+   * Range compaction of database.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
    *
    * See also
    *
@@ -1303,16 +1297,11 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Full compaction of the underlying storage using key
-   * range mode.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   * In this case, client could set reduce_level to true, to move
-   * the files back to the minimum level capable of holding the data
-   * set or a given level (specified by non-negative target_level).
-   *
+   * Range compaction of database.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
+   *
    *
    * Compaction outputs should be placed in options.db_paths
    * [target_path_id]. Behavior is undefined if target_path_id is
    * out of range.
@@ -1339,16 +1328,11 @@ public class RocksDB extends RocksObject {

   /**
-   * Compaction of the underlying storage using key
-   * using key range {@code [begin, end]}.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   * In this case, client could set reduce_level to true, to move
-   * the files back to the minimum level capable of holding the data
-   * set or a given level (specified by non-negative target_level).
-   *
+   * Range compaction of database.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
+   *
    *
    * Compaction outputs should be placed in options.db_paths
    * [target_path_id]. Behavior is undefined if target_path_id is
    * out of range.
@@ -1377,12 +1361,10 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Full compaction of the underlying storage of a column family
-   * using key range mode.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
+   * Range compaction of column family.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
    *
    * See also
    *
@@ -1411,12 +1393,10 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Compaction of the underlying storage of a column family
-   * using key range {@code [begin, end]}.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
+   * Range compaction of column family.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
    *
    * See also
    *
@@ -1445,16 +1425,11 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Full compaction of the underlying storage of a column family
-   * using key range mode.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   * In this case, client could set reduce_level to true, to move
-   * the files back to the minimum level capable of holding the data
-   * set or a given level (specified by non-negative target_level).
-   *
+   * Range compaction of column family.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
+   *
    *
    * Compaction outputs should be placed in options.db_paths
    * [target_path_id]. Behavior is undefined if target_path_id is
    * out of range.
@@ -1488,16 +1463,11 @@ public class RocksDB extends RocksObject {
   }

   /**
-   * Compaction of the underlying storage of a column family
-   * using key range {@code [begin, end]}.
-   *
-   * Note: After the entire database is compacted,
-   * all data are pushed down to the last level containing any data.
-   * If the total data size after compaction is reduced, that level
-   * might not be appropriate for hosting all the files.
-   * In this case, client could set reduce_level to true, to move
-   * the files back to the minimum level capable of holding the data
-   * set or a given level (specified by non-negative target_level).
-   *
+   * Range compaction of column family.
+   *
+   * Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.
+   *
    *
    * Compaction outputs should be placed in options.db_paths
    * [target_path_id]. Behavior is undefined if target_path_id is
    * out of range.
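
For reference, a short usage sketch of the compactRange() overloads whose documentation is revised above. The database path and key range are illustrative only; the calls themselves mirror the tests added earlier in this series (RocksDBTest.java).

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class CompactRangeExample {
  public static void main(String[] args) throws RocksDBException {
    Options options = new Options().setCreateIfMissing(true);
    RocksDB db = RocksDB.open(options, "/tmp/compact_range_example");
    try {
      for (int i = 0; i < 200; i++) {
        db.put(String.valueOf(i).getBytes(), ("value" + i).getBytes());
      }
      // compact only the key range ["0", "201"]
      db.compactRange("0".getBytes(), "201".getBytes());
      // full compaction; reduce_level=true moves files back to the
      // lowest level able to hold them (target_level 0, path id 0)
      db.compactRange(true, 0, 0);
    } finally {
      db.close();
      options.dispose();
    }
  }
}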

                    From eda0dcdd97fc675b4ac0424492d3f9228e5913dd Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Fri, 5 Dec 2014 13:41:39 +0000 Subject: [PATCH 623/829] Exposed IncreasedParallelism option to Java API as setIncreasedParallelism --- java/org/rocksdb/DBOptions.java | 8 ++++++++ java/org/rocksdb/DBOptionsInterface.java | 15 +++++++++++++++ java/org/rocksdb/Options.java | 8 ++++++++ java/rocksjni/options.cc | 23 +++++++++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java index 600369dec..e3614f463 100644 --- a/java/org/rocksdb/DBOptions.java +++ b/java/org/rocksdb/DBOptions.java @@ -72,6 +72,13 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { return dbOptions; } + @Override + public DBOptions setIncreaseParallelism(int totalThreads) { + assert (isInitialized()); + setIncreaseParallelism(nativeHandle_, totalThreads); + return this; + } + @Override public DBOptions setCreateIfMissing(boolean flag) { assert(isInitialized()); @@ -547,6 +554,7 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { private native void newDBOptions(); private native void disposeInternal(long handle); + private native void setIncreaseParallelism(long handle, int totalThreads); private native void setCreateIfMissing(long handle, boolean flag); private native boolean createIfMissing(long handle); private native void setCreateMissingColumnFamilies( diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java index 19ffe375d..39ba13d25 100644 --- a/java/org/rocksdb/DBOptionsInterface.java +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -7,6 +7,21 @@ package org.rocksdb; public interface DBOptionsInterface { + /** + *

By default, RocksDB uses only one background thread for flush and
+   * compaction. Calling this function will set it up such that total of
+   * `total_threads` is used.
+   *
+   * You almost definitely want to call this function if your system is
+   * bottlenecked by RocksDB.

                    + * + * @param The total number of threads to be used by RocksDB. A good value + * is the number of cores. + * + * @return the instance of the current Options + */ + Object setIncreaseParallelism(int totalThreads); + /** * If this value is set to true, then the database will be created * if it is missing during {@code RocksDB.open()}. diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 7781b80a6..ac4037508 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -43,6 +43,13 @@ public class Options extends RocksObject env_ = RocksEnv.getDefault(); } + @Override + public Options setIncreaseParallelism(int totalThreads) { + assert(isInitialized()); + setIncreaseParallelism(nativeHandle_, totalThreads); + return this; + } + @Override public Options setCreateIfMissing(boolean flag) { assert(isInitialized()); @@ -1032,6 +1039,7 @@ public class Options extends RocksObject private native void prepareForBulkLoad(long handle); // DB native handles + private native void setIncreaseParallelism(long handle, int totalThreads); private native void setCreateIfMissing(long handle, boolean flag); private native boolean createIfMissing(long handle); private native void setCreateMissingColumnFamilies( diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index d139b1a57..667d74508 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -68,6 +68,17 @@ void Java_org_rocksdb_Options_disposeInternal( delete reinterpret_cast(handle); } +/* + * Class: org_rocksdb_Options + * Method: setIncreaseParallelism + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setIncreaseParallelism( + JNIEnv * evnv, jobject jobj, jlong jhandle, jint totalThreads) { + reinterpret_cast + (jhandle)->IncreaseParallelism(static_cast(totalThreads)); +} + /* * Class: org_rocksdb_Options * Method: setCreateIfMissing @@ -2816,6 +2827,18 @@ void Java_org_rocksdb_DBOptions_disposeInternal( delete reinterpret_cast(handle); } +/* + * Class: org_rocksdb_DBOptions + * Method: setIncreaseParallelism + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setIncreaseParallelism( + JNIEnv * env, jobject jobj, jlong jhandle, jint totalThreads) { + reinterpret_cast + (jhandle)->IncreaseParallelism(static_cast(totalThreads)); +} + + /* * Class: org_rocksdb_DBOptions * Method: setCreateIfMissing From 17e84f2151a5de7ab57d6c6a3ad57a638c01727f Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 16 Dec 2014 14:32:42 +0000 Subject: [PATCH 624/829] Rudimentary test cases for setIncreaseParallelism --- java/org/rocksdb/test/DBOptionsTest.java | 14 ++++++++++++++ java/org/rocksdb/test/OptionsTest.java | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/org/rocksdb/test/DBOptionsTest.java index 6064dd694..858379768 100644 --- a/java/org/rocksdb/test/DBOptionsTest.java +++ b/java/org/rocksdb/test/DBOptionsTest.java @@ -73,6 +73,20 @@ public class DBOptionsTest { new Properties()); } + @Test + public void setIncreaseParallelism() { + DBOptions opt = null; + try { + opt = new DBOptions(); + final int threads = Runtime.getRuntime().availableProcessors() * 2; + opt.setIncreaseParallelism(threads); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + @Test public void createIfMissing() { DBOptions opt = null; diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java index 3425502d8..0e699c406 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ 
b/java/org/rocksdb/test/OptionsTest.java @@ -22,6 +22,20 @@ public class OptionsTest { public static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); + @Test + public void setIncreaseParallelism() { + Options opt = null; + try { + opt = new Options(); + final int threads = Runtime.getRuntime().availableProcessors() * 2; + opt.setIncreaseParallelism(threads); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + @Test public void writeBufferSize() throws RocksDBException { Options opt = null; From 7661e5a76e58eef0fb360f00de94301ad6cab40c Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Tue, 16 Dec 2014 16:57:22 -0800 Subject: [PATCH 625/829] Move the file copy out of the mutex. Summary: We now release the mutex before copying the files in the case of the trivial move. This path does not use the compaction job. Test Plan: DBTest.LevelCompactionThirdPath Reviewers: yhchiang, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30381 --- HISTORY.md | 8 ++++---- db/compaction.cc | 1 + db/compaction.h | 2 +- db/db_impl.cc | 27 +++------------------------ 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 121b936af..dede7580a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,10 @@ ### Unreleased Features * Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted * By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command. +* We now allow level-compaction to place files in different paths by + specifying them in db_paths along with the target_size. + Lower numbered levels will be placed earlier in the db_paths and higher + numbered levels will be placed later in the db_paths vector. ### 3.9.0 (12/8/2014) @@ -17,10 +21,6 @@ * New API LinkFile added to Env. If you implement your own Env class, an implementation of the API LinkFile will have to be provided. * MemTableRep takes MemTableAllocator instead of Arena -* We now allow level-compaction to place files in different paths by - specifying them in db_paths along with the target_size. - Lower numbered levels will be placed earlier in the db_paths and higher - numbered levels will be placed later in the db_paths vector. ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. diff --git a/db/compaction.cc b/db/compaction.cc index 3d4c352c9..0d85ce486 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -140,6 +140,7 @@ bool Compaction::IsTrivialMove() const { num_input_levels() == 2 && num_input_files(0) == 1 && num_input_files(1) == 0 && + input(0, 0)->fd.GetPathId() == GetOutputPathId() && TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_); } diff --git a/db/compaction.h b/db/compaction.h index 4333cc208..99f35abb9 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -119,7 +119,7 @@ class Compaction { // moving a single input file to the next level (no merging or splitting) bool IsTrivialMove() const; - // If true, then the comaction can be done by simply deleting input files. + // If true, then the compaction can be done by simply deleting input files. 
bool IsDeletionCompaction() const { return deletion_compaction_; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 2d9b32cc8..a87d4d147 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2060,31 +2060,10 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); - FileMetaData ftemp; - uint64_t fdnum = f->fd.GetNumber(); - uint32_t fdpath = f->fd.GetPathId(); c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); - // Need to move file if file is to be stored in a new path - if (c->GetOutputPathId() != f->fd.GetPathId()) { - fdnum = versions_->NewFileNumber(); - std::string source = TableFileName(db_options_.db_paths, - f->fd.GetNumber(), f->fd.GetPathId()); - std::string destination = - TableFileName(db_options_.db_paths, fdnum, c->GetOutputPathId()); - Status s = CopyFile(env_, source, destination, 0); - if (s.ok()) { - fdpath = c->GetOutputPathId(); - } else { - fdnum = f->fd.GetNumber(); - if (!s.IsShutdownInProgress()) { - Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, - "Compaction error: %s", s.ToString().c_str()); - } - } - } - c->edit()->AddFile(c->level() + 1, fdnum, fdpath, f->fd.GetFileSize(), - f->smallest, f->largest, f->smallest_seqno, - f->largest_seqno); + c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, db_directory_.get()); From 25f70a5abbc2e3e38b8a67013f91c1ccea961aaa Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 16 Dec 2014 17:10:23 -0800 Subject: [PATCH 626/829] Avoid unnecessary unlock and lock mutex when notifying events. Summary: Avoid unnecessary unlock and lock mutex when notifying events. 
Test Plan: ./listener_test Reviewers: igor Reviewed By: igor Subscribers: sdong, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30267 --- db/db_impl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index a87d4d147..3275165e8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1112,6 +1112,9 @@ Status DBImpl::FlushMemTableToOutputFile( void DBImpl::NotifyOnFlushCompleted( ColumnFamilyData* cfd, uint64_t file_number, const MutableCFOptions& mutable_cf_options) { + if (cfd->ioptions()->listeners.size() == 0U) { + return; + } mutex_.AssertHeld(); if (shutting_down_.load(std::memory_order_acquire)) { return; From 91c58752fa4bc32cfd51189b3f35edde77816ff4 Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Wed, 17 Dec 2014 02:06:36 -0800 Subject: [PATCH 627/829] error detection and memory leaks in c example --- examples/simple_example.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/simple_example.c b/examples/simple_example.c index 59848902a..29de5629b 100644 --- a/examples/simple_example.c +++ b/examples/simple_example.c @@ -20,24 +20,28 @@ int main (int argc, char **argv) { rocksdb_options_set_create_if_missing (options, 1); // open DB - char *err; + char *err = NULL; db = rocksdb_open (options, DBPath, &err); -// assert (!err); + assert (!err); // Put key-value rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create (); const char key[] = "key"; char *value = "value"; rocksdb_put (db, writeoptions, key, strlen (key), value, strlen (value), &err); -// assert (!err); + assert (!err); // Get value rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create (); size_t len; value = rocksdb_get (db, readoptions, key, strlen (key), &len, &err); -// assert (!err); + assert (!err); assert (strcmp (value, "value") == 0); free (value); + // cleanup + rocksdb_writeoptions_destroy (writeoptions); + rocksdb_readoptions_destroy (readoptions); + rocksdb_options_destroy (options); rocksdb_close (db); return 0; From 7198ed5a2ed6285123b374af78f36a440469eef6 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Wed, 17 Dec 2014 16:25:09 -0800 Subject: [PATCH 628/829] Handle errors during pthread calls Summary: Release locks before calling exit. 
Test Plan: Force errors in debugger and verify correctness Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30423 --- util/env_posix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 30997a904..da090ddf5 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1068,7 +1068,7 @@ class PosixFileLock : public FileLock { void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); + abort(); } } From 28424d734bee62ff1999b17e6938c42abfd8bdc9 Mon Sep 17 00:00:00 2001 From: Haneef Mubarak Date: Thu, 18 Dec 2014 06:48:46 -0800 Subject: [PATCH 629/829] style fixes in c example --- examples/simple_example.c | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/examples/simple_example.c b/examples/simple_example.c index 29de5629b..e982cce89 100644 --- a/examples/simple_example.c +++ b/examples/simple_example.c @@ -9,40 +9,42 @@ const char DBPath[] = "/tmp/rocksdb_simple_example"; -int main (int argc, char **argv) { +int main(int argc, char **argv) { rocksdb_t *db; - rocksdb_options_t *options = rocksdb_options_create (); - // Optimize RocksDB. This is the easiest way to get RocksDB to perform well - int cpus = sysconf (_SC_NPROCESSORS_ONLN); // get number of online cores - rocksdb_options_increase_parallelism (options, cpus); - rocksdb_options_optimize_level_style_compaction (options, 0); + rocksdb_options_t *options = rocksdb_options_create(); + // Optimize RocksDB. This is the easiest way to + // get RocksDB to perform well + int cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + rocksdb_options_increase_parallelism(options, cpus); + rocksdb_options_optimize_level_style_compaction(options, 0); // create the DB if it's not already present - rocksdb_options_set_create_if_missing (options, 1); + rocksdb_options_set_create_if_missing(options, 1); // open DB char *err = NULL; - db = rocksdb_open (options, DBPath, &err); - assert (!err); + db = rocksdb_open(options, DBPath, &err); + assert(!err); // Put key-value - rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create (); + rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); const char key[] = "key"; char *value = "value"; - rocksdb_put (db, writeoptions, key, strlen (key), value, strlen (value), &err); - assert (!err); + rocksdb_put(db, writeoptions, key, strlen (key), value, \ + strlen (value), &err); + assert(!err); // Get value - rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create (); + rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create(); size_t len; - value = rocksdb_get (db, readoptions, key, strlen (key), &len, &err); - assert (!err); - assert (strcmp (value, "value") == 0); - free (value); + value = rocksdb_get(db, readoptions, key, strlen (key), &len, &err); + assert(!err); + assert(strcmp(value, "value") == 0); + free(value); // cleanup - rocksdb_writeoptions_destroy (writeoptions); - rocksdb_readoptions_destroy (readoptions); - rocksdb_options_destroy (options); - rocksdb_close (db); + rocksdb_writeoptions_destroy(writeoptions); + rocksdb_readoptions_destroy(readoptions); + rocksdb_options_destroy(options); + rocksdb_close(db); return 0; } From b015ed0ca6585766bd2ef381fc9507bb33c09863 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 10 Dec 2014 00:37:08 +0100 Subject: [PATCH 630/829] [RocksJava] 
Slice / DirectSlice improvements Summary: - AssertionError when initialized with Non-Direct Buffer - Tests + coverage for DirectSlice - Slice sigsegv fixes when initializing from String and byte arrays - Slice Tests Test Plan: Run tests without source modifications. Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D30081 --- java/Makefile | 2 + java/org/rocksdb/DirectSlice.java | 2 + java/org/rocksdb/Slice.java | 2 +- java/org/rocksdb/test/DirectSliceTest.java | 105 +++++++++++++++++++++ java/org/rocksdb/test/SliceTest.java | 105 +++++++++++++++++++++ java/rocksjni/slice.cc | 37 ++++---- 6 files changed, 235 insertions(+), 18 deletions(-) create mode 100644 java/org/rocksdb/test/DirectSliceTest.java create mode 100644 java/org/rocksdb/test/SliceTest.java diff --git a/java/Makefile b/java/Makefile index 6edfb9091..26fa38d05 100644 --- a/java/Makefile +++ b/java/Makefile @@ -58,6 +58,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.ComparatorTest\ org.rocksdb.test.DBOptionsTest\ org.rocksdb.test.DirectComparatorTest\ + org.rocksdb.test.DirectSliceTest\ org.rocksdb.test.EnvironmentTest\ org.rocksdb.test.FilterTest\ org.rocksdb.test.FlushTest\ @@ -74,6 +75,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.RocksEnvTest\ org.rocksdb.test.RocksIteratorTest\ org.rocksdb.test.SizeUnitTest\ + org.rocksdb.test.SliceTest\ org.rocksdb.test.SnapshotTest\ org.rocksdb.test.StatisticsCollectorTest\ org.rocksdb.test.WriteBatchHandlerTest\ diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java index 847bbd9c1..c69b61460 100644 --- a/java/org/rocksdb/DirectSlice.java +++ b/java/org/rocksdb/DirectSlice.java @@ -56,6 +56,7 @@ public class DirectSlice extends AbstractSlice { */ public DirectSlice(final ByteBuffer data, final int length) { super(); + assert(data.isDirect()); createNewDirectSlice0(data, length); } @@ -68,6 +69,7 @@ public class DirectSlice extends AbstractSlice { */ public DirectSlice(final ByteBuffer data) { super(); + assert(data.isDirect()); createNewDirectSlice1(data); } diff --git a/java/org/rocksdb/Slice.java b/java/org/rocksdb/Slice.java index 0dfa12ee7..d26490e5f 100644 --- a/java/org/rocksdb/Slice.java +++ b/java/org/rocksdb/Slice.java @@ -77,8 +77,8 @@ public class Slice extends AbstractSlice { */ @Override protected void disposeInternal() { - super.disposeInternal(); disposeInternalBuf(nativeHandle_); + super.disposeInternal(); } @Override protected final native byte[] data0(long handle); diff --git a/java/org/rocksdb/test/DirectSliceTest.java b/java/org/rocksdb/test/DirectSliceTest.java new file mode 100644 index 000000000..a50664867 --- /dev/null +++ b/java/org/rocksdb/test/DirectSliceTest.java @@ -0,0 +1,105 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.DirectSlice; + +import java.nio.ByteBuffer; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DirectSliceTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void directSlice() { + DirectSlice directSlice = null; + DirectSlice otherSlice = null; + try { + directSlice = new DirectSlice("abc"); + otherSlice = new DirectSlice("abc"); + assertThat(directSlice.toString()).isEqualTo("abc"); + // clear first slice + directSlice.clear(); + assertThat(directSlice.toString()).isEmpty(); + // get first char in otherslice + assertThat(otherSlice.get(0)).isEqualTo("a".getBytes()[0]); + // remove prefix + otherSlice.removePrefix(1); + assertThat(otherSlice.toString()).isEqualTo("bc"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + if (otherSlice != null) { + otherSlice.dispose(); + } + } + } + + @Test + public void directSliceWithByteBuffer() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); + buffer.put(data); + directSlice = new DirectSlice(buffer); + assertThat(directSlice.toString()).isEqualTo("Some text"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test + public void directSliceWithByteBufferAndLength() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); + buffer.put(data); + directSlice = new DirectSlice(buffer, 4); + assertThat(directSlice.toString()).isEqualTo("Some"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void directSliceInitWithoutDirectAllocation() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.wrap(data); + directSlice = new DirectSlice(buffer); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void directSlicePrefixInitWithoutDirectAllocation() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.wrap(data); + directSlice = new DirectSlice(buffer, 4); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } +} diff --git a/java/org/rocksdb/test/SliceTest.java b/java/org/rocksdb/test/SliceTest.java new file mode 100644 index 000000000..4b04172f8 --- /dev/null +++ b/java/org/rocksdb/test/SliceTest.java @@ -0,0 +1,105 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Test; +import org.rocksdb.Slice; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SliceTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void slice() { + Slice slice = null; + Slice otherSlice = null; + Slice thirdSlice = null; + try { + slice = new Slice("testSlice"); + assertThat(slice.empty()).isFalse(); + assertThat(slice.size()).isEqualTo(9); + assertThat(slice.data()).isEqualTo("testSlice".getBytes()); + + otherSlice = new Slice("otherSlice".getBytes()); + assertThat(otherSlice.data()).isEqualTo("otherSlice".getBytes()); + + thirdSlice = new Slice("otherSlice".getBytes(), 5); + assertThat(thirdSlice.data()).isEqualTo("Slice".getBytes()); + } finally { + if (slice != null) { + slice.dispose(); + } + if (otherSlice != null) { + otherSlice.dispose(); + } + if (thirdSlice != null) { + thirdSlice.dispose(); + } + } + } + + @Test + public void sliceEquals() { + Slice slice = null; + Slice slice2 = null; + try { + slice = new Slice("abc"); + slice2 = new Slice("abc"); + assertThat(slice.equals(slice2)).isTrue(); + } finally { + if (slice != null) { + slice.dispose(); + } + if (slice2 != null) { + slice2.dispose(); + } + } + } + + + @Test + public void sliceStartWith() { + Slice slice = null; + Slice match = null; + Slice noMatch = null; + try { + slice = new Slice("matchpoint"); + match = new Slice("mat"); + noMatch = new Slice("nomatch"); + + //assertThat(slice.startsWith(match)).isTrue(); + assertThat(slice.startsWith(noMatch)).isFalse(); + } finally { + if (slice != null) { + slice.dispose(); + } + if (match != null) { + match.dispose(); + } + if (noMatch != null) { + noMatch.dispose(); + } + } + } + + @Test + public void sliceToString() { + Slice slice = null; + try { + slice = new Slice("stringTest"); + assertThat(slice.toString()).isEqualTo("stringTest"); + assertThat(slice.toString(true)).isNotEqualTo(""); + } finally { + if (slice != null) { + slice.dispose(); + } + } + } +} diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 64f89b211..6ea8bab3e 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -25,9 +25,15 @@ * Signature: (Ljava/lang/String;)V */ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( - JNIEnv* env, jobject jobj, jstring str) { - const std::string s = rocksdb::JniUtil::copyString(env, str); - const rocksdb::Slice* slice = new rocksdb::Slice(s); + JNIEnv* env, jobject jobj, jstring jstr) { + + const char* str = env->GetStringUTFChars(jstr, 0); + const int len = strlen(str); + char* buf = new char[len]; + memcpy(buf, str, len); + env->ReleaseStringUTFChars(jstr, str); + + rocksdb::Slice* slice = new rocksdb::Slice(buf); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -85,8 +91,8 @@ jint Java_org_rocksdb_AbstractSlice_compare0( */ jboolean Java_org_rocksdb_AbstractSlice_startsWith0( JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) { - const rocksdb::Slice* slice = reinterpret_cast(handle); - const rocksdb::Slice* otherSlice = + auto slice = reinterpret_cast(handle); + auto otherSlice = reinterpret_cast(otherHandle); return slice->starts_with(*otherSlice); } @@ -130,19 +136,20 @@ void Java_org_rocksdb_Slice_createNewSlice0( void Java_org_rocksdb_Slice_createNewSlice1( JNIEnv * env, jobject jobj, jbyteArray data) { - const int len = env->GetArrayLength(data); + const int len = env->GetArrayLength(data) + 1; jboolean 
isCopy; jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); - const char* buf = new char[len]; - memcpy(const_cast(buf), ptrData, len); + char* buf = new char[len]; + + memcpy(buf, ptrData, len - 1); + buf[len-1]='\0'; const rocksdb::Slice* slice = - new rocksdb::Slice(buf, env->GetArrayLength(data)); - rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + new rocksdb::Slice(buf, len - 1); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); - // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method } @@ -153,11 +160,11 @@ void Java_org_rocksdb_Slice_createNewSlice1( */ jbyteArray Java_org_rocksdb_Slice_data0( JNIEnv* env, jobject jobj, jlong handle) { - const rocksdb::Slice* slice = reinterpret_cast(handle); + auto slice = reinterpret_cast(handle); const int len = static_cast(slice->size()); const jbyteArray data = env->NewByteArray(len); env->SetByteArrayRegion(data, 0, len, - reinterpret_cast(const_cast(slice->data()))); + reinterpret_cast(slice->data())); return data; } @@ -172,10 +179,6 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( delete [] slice->data_; } -// - -// + /* * Class: org_rocksdb_DirectSlice * Method: createNewDirectSlice0 @@ -186,9 +190,9 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( JNIEnv* env, jobject jobj, jobject data, jint length) { - const char* ptrData = - reinterpret_cast(env->GetDirectBufferAddress(data)); - const rocksdb::Slice* slice = new rocksdb::Slice(ptrData, length); + const auto ptrData = + reinterpret_cast(env->GetDirectBufferAddress(data)); + const auto slice = new rocksdb::Slice(ptrData, length); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -199,7 +203,7 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( JNIEnv* env, jobject jobj, jobject data) { - const char* ptrData = + const auto ptrData = reinterpret_cast(env->GetDirectBufferAddress(data)); const rocksdb::Slice* slice = new rocksdb::Slice(ptrData); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); @@ -212,7 +216,7 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( */ jobject Java_org_rocksdb_DirectSlice_data0( JNIEnv* env, jobject jobj, jlong handle) { - const rocksdb::Slice* slice = reinterpret_cast(handle); + const auto slice = reinterpret_cast(handle); return env->NewDirectByteBuffer(const_cast(slice->data()), slice->size()); } @@ -224,7 +228,7 @@ jobject Java_org_rocksdb_DirectSlice_data0( */ jbyte Java_org_rocksdb_DirectSlice_get0( JNIEnv* env, jobject jobj, jlong handle, jint offset) { - rocksdb::Slice* slice = reinterpret_cast(handle); + const auto slice = reinterpret_cast(handle); return (*slice)[offset]; } @@ -235,7 +239,7 @@ jbyte Java_org_rocksdb_DirectSlice_get0( */ void Java_org_rocksdb_DirectSlice_clear0( JNIEnv* env, jobject jobj, jlong handle) { - rocksdb::Slice* slice = reinterpret_cast(handle); + const auto slice = reinterpret_cast(handle); delete [] slice->data_; slice->clear(); } @@ -247,7 +251,7 @@ void Java_org_rocksdb_DirectSlice_clear0( */ void Java_org_rocksdb_DirectSlice_removePrefix0( JNIEnv* env, jobject jobj, jlong handle, jint length) { - rocksdb::Slice* slice = reinterpret_cast(handle); + const auto slice = reinterpret_cast(handle); slice->remove_prefix(length); } From 5fbba60b6af5054797cf88ca95e275b1ffea4637 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 18 Dec 2014 22:15:00 +0100 Subject: [PATCH 632/829] 
[RocksJava] Incorporated changes D30081 --- java/rocksjni/slice.cc | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 980dec8a1..6b7ab0ba5 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -27,13 +27,13 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( JNIEnv* env, jobject jobj, jstring jstr) { - const char* str = env->GetStringUTFChars(jstr, 0); + const auto* str = env->GetStringUTFChars(jstr, 0); const int len = strlen(str); char* buf = new char[len]; memcpy(buf, str, len); env->ReleaseStringUTFChars(jstr, str); - const auto slice = new rocksdb::Slice(buf); + const auto* slice = new rocksdb::Slice(buf); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -44,7 +44,7 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( */ jint Java_org_rocksdb_AbstractSlice_size0( JNIEnv* env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); return static_cast(slice->size()); } @@ -55,7 +55,7 @@ jint Java_org_rocksdb_AbstractSlice_size0( */ jboolean Java_org_rocksdb_AbstractSlice_empty0( JNIEnv* env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); return slice->empty(); } @@ -66,7 +66,7 @@ jboolean Java_org_rocksdb_AbstractSlice_empty0( */ jstring Java_org_rocksdb_AbstractSlice_toString0( JNIEnv* env, jobject jobj, jlong handle, jboolean hex) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); const std::string s = slice->ToString(hex); return env->NewStringUTF(s.c_str()); } @@ -78,8 +78,8 @@ jstring Java_org_rocksdb_AbstractSlice_toString0( */ jint Java_org_rocksdb_AbstractSlice_compare0( JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) { - const auto slice = reinterpret_cast(handle); - const auto otherSlice = + const auto* slice = reinterpret_cast(handle); + const auto* otherSlice = reinterpret_cast(otherHandle); return slice->compare(*otherSlice); } @@ -91,8 +91,8 @@ jint Java_org_rocksdb_AbstractSlice_compare0( */ jboolean Java_org_rocksdb_AbstractSlice_startsWith0( JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) { - const auto slice = reinterpret_cast(handle); - const auto otherSlice = + const auto* slice = reinterpret_cast(handle); + const auto* otherSlice = reinterpret_cast(otherHandle); return slice->starts_with(*otherSlice); } @@ -124,7 +124,7 @@ void Java_org_rocksdb_Slice_createNewSlice0( jbyte* ptrData = new jbyte[len]; env->GetByteArrayRegion(data, offset, len, ptrData); - const auto slice = new rocksdb::Slice((const char*)ptrData, len); + const auto* slice = new rocksdb::Slice((const char*)ptrData, len); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -145,7 +145,7 @@ void Java_org_rocksdb_Slice_createNewSlice1( memcpy(buf, ptrData, len - 1); buf[len-1]='\0'; - const auto slice = + const auto* slice = new rocksdb::Slice(buf, len - 1); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); @@ -160,7 +160,7 @@ void Java_org_rocksdb_Slice_createNewSlice1( */ jbyteArray Java_org_rocksdb_Slice_data0( JNIEnv* env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); const int len = static_cast(slice->size()); const jbyteArray data = env->NewByteArray(len); env->SetByteArrayRegion(data, 0, len, @@ -175,7 +175,7 @@ jbyteArray 
Java_org_rocksdb_Slice_data0( */ void Java_org_rocksdb_Slice_disposeInternalBuf( JNIEnv * env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); delete [] slice->data_; } @@ -190,9 +190,9 @@ void Java_org_rocksdb_Slice_disposeInternalBuf( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( JNIEnv* env, jobject jobj, jobject data, jint length) { - const auto ptrData = + const auto* ptrData = reinterpret_cast(env->GetDirectBufferAddress(data)); - const auto slice = new rocksdb::Slice(ptrData, length); + const auto* slice = new rocksdb::Slice(ptrData, length); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -203,9 +203,9 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice0( */ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( JNIEnv* env, jobject jobj, jobject data) { - const auto ptrData = + const auto* ptrData = reinterpret_cast(env->GetDirectBufferAddress(data)); - const rocksdb::Slice* slice = new rocksdb::Slice(ptrData); + const auto* slice = new rocksdb::Slice(ptrData); rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); } @@ -216,7 +216,7 @@ void Java_org_rocksdb_DirectSlice_createNewDirectSlice1( */ jobject Java_org_rocksdb_DirectSlice_data0( JNIEnv* env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); return env->NewDirectByteBuffer(const_cast(slice->data()), slice->size()); } @@ -228,8 +228,8 @@ jobject Java_org_rocksdb_DirectSlice_data0( */ jbyte Java_org_rocksdb_DirectSlice_get0( JNIEnv* env, jobject jobj, jlong handle, jint offset) { - const auto slice = reinterpret_cast(handle); - return (*slice)[offset]; + const auto* slice = reinterpret_cast(handle); + return (slice)[offset]; } /* @@ -239,7 +239,7 @@ jbyte Java_org_rocksdb_DirectSlice_get0( */ void Java_org_rocksdb_DirectSlice_clear0( JNIEnv* env, jobject jobj, jlong handle) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); delete [] slice->data_; slice->clear(); } @@ -251,7 +251,7 @@ void Java_org_rocksdb_DirectSlice_clear0( */ void Java_org_rocksdb_DirectSlice_removePrefix0( JNIEnv* env, jobject jobj, jlong handle, jint length) { - const auto slice = reinterpret_cast(handle); + const auto* slice = reinterpret_cast(handle); slice->remove_prefix(length); } From 5b9ceef01d9abc2c936a7eba41b1636a83bac1ae Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 18 Dec 2014 22:19:57 +0100 Subject: [PATCH 633/829] [RocksJava] JavaDoc correction --- java/org/rocksdb/CompressionType.java | 2 ++ java/org/rocksdb/DBOptionsInterface.java | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/CompressionType.java b/java/org/rocksdb/CompressionType.java index c718d26a9..9f75b55e6 100644 --- a/java/org/rocksdb/CompressionType.java +++ b/java/org/rocksdb/CompressionType.java @@ -29,6 +29,8 @@ public enum CompressionType { *

If library cannot be found the enumeration
    * value {@code NO_COMPRESSION} will be returned.
    *
+   * @param libraryName compression library name.
+   *
    * @return CompressionType instance.
    */
   public static CompressionType getCompressionType(String libraryName) {
diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java
index 39ba13d25..83d7ba1e1 100644
--- a/java/org/rocksdb/DBOptionsInterface.java
+++ b/java/org/rocksdb/DBOptionsInterface.java
@@ -15,8 +15,8 @@ public interface DBOptionsInterface {
    * You almost definitely want to call this function if your system is
    * bottlenecked by RocksDB.

                    * - * @param The total number of threads to be used by RocksDB. A good value - * is the number of cores. + * @param totalThreads The total number of threads to be used by RocksDB. + * A good value is the number of cores. * * @return the instance of the current Options */ From 1fed1282ad338fc92c9608b8bd6f97e969f64120 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 18 Dec 2014 22:27:50 +0100 Subject: [PATCH 634/829] [RocksJava] Incorporated changes D30081 --- java/rocksjni/slice.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 6b7ab0ba5..c92ca5ec6 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -229,7 +229,7 @@ jobject Java_org_rocksdb_DirectSlice_data0( jbyte Java_org_rocksdb_DirectSlice_get0( JNIEnv* env, jobject jobj, jlong handle, jint offset) { const auto* slice = reinterpret_cast(handle); - return (slice)[offset]; + return (*slice)[offset]; } /* @@ -239,7 +239,7 @@ jbyte Java_org_rocksdb_DirectSlice_get0( */ void Java_org_rocksdb_DirectSlice_clear0( JNIEnv* env, jobject jobj, jlong handle) { - const auto* slice = reinterpret_cast(handle); + auto* slice = reinterpret_cast(handle); delete [] slice->data_; slice->clear(); } @@ -251,7 +251,7 @@ void Java_org_rocksdb_DirectSlice_clear0( */ void Java_org_rocksdb_DirectSlice_removePrefix0( JNIEnv* env, jobject jobj, jlong handle, jint length) { - const auto* slice = reinterpret_cast(handle); + auto* slice = reinterpret_cast(handle); slice->remove_prefix(length); } From a3001b1d3d73892d12993f4c08597363f23d69cc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 19 Dec 2014 09:06:45 -0800 Subject: [PATCH 635/829] Remove -mtune=native because it's redundant --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 3b3f34037..c17cd3ead 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -312,7 +312,7 @@ if test "$USE_SSE"; then # if Intel SSE instruction set is supported, set USE_SSE=1 COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 " elif test -z "$PORTABLE"; then - COMMON_FLAGS="$COMMON_FLAGS -march=native -mtune=native " + COMMON_FLAGS="$COMMON_FLAGS -march=native " fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" From fdb6be4e2476943d36b94078bb6b29cd4133731a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 19 Dec 2014 20:38:12 +0100 Subject: [PATCH 636/829] Rewritten system for scheduling background work Summary: When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue. The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction. 
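
A minimal, self-contained sketch of the push-based queueing idea described above, written in Java purely for illustration; the class and method names are hypothetical, and the actual change is the C++ code in db/db_impl.cc and db/column_family.cc shown below.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

class FlushQueueSketch<CF> {
  private final Deque<CF> flushQueue = new ArrayDeque<>();
  private final Set<CF> pendingFlush = new HashSet<>();

  // Called when a column family is detected as flush-ready;
  // it is enqueued at most once (mirrors the pending_flush_ flag below).
  synchronized void schedulePendingFlush(CF cfd) {
    if (pendingFlush.add(cfd)) {
      flushQueue.addLast(cfd);
    }
  }

  // Called by a background flush thread: O(1) work per pick instead of
  // a loop over every column family while holding the DB mutex.
  synchronized CF popFirstFromFlushQueue() {
    CF cfd = flushQueue.pollFirst();
    if (cfd != null) {
      pendingFlush.remove(cfd);
    }
    return cfd;
  }
}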
Here are the performance results: Command: ./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333 Before the patch: fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s After the patch: fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got: fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s Test Plan: make check two stress tests: Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 max_background_flushes=0, to verify that this case also works correctly ./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: ljin, rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30123 --- db/column_family.cc | 38 ++-- db/column_family.h | 17 ++ db/compaction_picker.cc | 24 ++- db/db_impl.cc | 371 ++++++++++++++++++++++++---------------- db/db_impl.h | 51 +++++- 5 files changed, 324 insertions(+), 177 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index f07c741a4..8a5c4a01f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -223,14 +223,11 @@ void SuperVersionUnrefHandle(void* ptr) { } } // anonymous namespace -ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, - Version* _dummy_versions, - Cache* _table_cache, - WriteBuffer* write_buffer, - const ColumnFamilyOptions& cf_options, - const DBOptions* db_options, - const EnvOptions& env_options, - ColumnFamilySet* column_family_set) +ColumnFamilyData::ColumnFamilyData( + uint32_t id, const std::string& name, Version* _dummy_versions, + Cache* _table_cache, WriteBuffer* write_buffer, + const ColumnFamilyOptions& cf_options, const DBOptions* db_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -250,7 +247,9 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, next_(nullptr), prev_(nullptr), log_number_(0), - column_family_set_(column_family_set) { + column_family_set_(column_family_set), + pending_flush_(false), + pending_compaction_(false) { Ref(); // if _dummy_versions is nullptr, then this is a dummy column family. 
@@ -285,10 +284,14 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, new LevelCompactionPicker(ioptions_, &internal_comparator_)); } - Log(InfoLogLevel::INFO_LEVEL, - ioptions_.info_log, "Options for column family \"%s\":\n", - name.c_str()); - options_.Dump(ioptions_.info_log); + if (column_family_set_->NumberOfColumnFamilies() < 10) { + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "--------------- Options for column family [%s]:\n", name.c_str()); + options_.Dump(ioptions_.info_log); + } else { + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "\t(skipping printing options)\n"); + } } RecalculateWriteStallConditions(mutable_cf_options_); @@ -313,6 +316,11 @@ ColumnFamilyData::~ColumnFamilyData() { current_->Unref(); } + // It would be wrong if this ColumnFamilyData is in flush_queue_ or + // compaction_queue_ and we destroyed it + assert(!pending_flush_); + assert(!pending_compaction_); + if (super_version_ != nullptr) { // Release SuperVersion reference kept in ThreadLocalPtr. // This must be done outside of mutex_ since unref handler can lock mutex. @@ -434,6 +442,10 @@ void ColumnFamilyData::CreateNewMemtable( mem_->Ref(); } +bool ColumnFamilyData::NeedsCompaction() const { + return compaction_picker_->NeedsCompaction(current_->storage_info()); +} + Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { auto* result = compaction_picker_->PickCompaction( diff --git a/db/column_family.h b/db/column_family.h index 51ccd99ac..8cf66a0c0 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -210,8 +210,11 @@ class ColumnFamilyData { // See documentation in compaction_picker.h // REQUIRES: DB mutex held + bool NeedsCompaction() const; + // REQUIRES: DB mutex held Compaction* PickCompaction(const MutableCFOptions& mutable_options, LogBuffer* log_buffer); + // REQUIRES: DB mutex held Compaction* CompactRange( const MutableCFOptions& mutable_cf_options, int input_level, int output_level, uint32_t output_path_id, @@ -248,6 +251,7 @@ class ColumnFamilyData { // if its reference count is zero and needs deletion or nullptr if not // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex. 
+ // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, port::Mutex* db_mutex, const MutableCFOptions& mutable_cf_options); @@ -261,6 +265,12 @@ class ColumnFamilyData { bool triggered_flush_slowdown, bool triggered_flush_stop); + // Protected by DB mutex + void set_pending_flush(bool value) { pending_flush_ = value; } + void set_pending_compaction(bool value) { pending_compaction_ = value; } + bool pending_flush() { return pending_flush_; } + bool pending_compaction() { return pending_compaction_; } + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, @@ -328,6 +338,13 @@ class ColumnFamilyData { ColumnFamilySet* column_family_set_; std::unique_ptr write_controller_token_; + + // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ + bool pending_flush_; + + // If true --> this ColumnFamily is currently present in + // DBImpl::compaction_queue_ + bool pending_compaction_; }; // ColumnFamilySet has interesting thread-safety requirements diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 82653ff70..70be388c9 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -695,15 +695,6 @@ Compaction* LevelCompactionPicker::PickCompaction( Compaction* c = nullptr; int level = -1; - // Compute the compactions needed. It is better to do it here - // and also in LogAndApply(), otherwise the values could be stale. - std::vector size_being_compacted(NumberLevels() - 1); - SizeBeingCompacted(size_being_compacted); - - CompactionOptionsFIFO dummy_compaction_options_fifo; - vstorage->ComputeCompactionScore( - mutable_cf_options, dummy_compaction_options_fifo, size_being_compacted); - // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. // @@ -766,6 +757,21 @@ Compaction* LevelCompactionPicker::PickCompaction( compactions_in_progress_[level].insert(c); c->mutable_cf_options_ = mutable_cf_options; + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). 
Since we just changed compaction score, we recalculate it + // here + { // this piece of code recomputes compaction score + std::vector size_being_compacted(NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + + CompactionOptionsFIFO dummy_compaction_options_fifo; + vstorage->ComputeCompactionScore(mutable_cf_options, + dummy_compaction_options_fifo, + size_being_compacted); + } + return c; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 3275165e8..cb5dcc59c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -89,6 +89,7 @@ void DumpRocksDBBuildVersion(Logger * log); struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; + bool schedule_bg_work_ = false; ~WriteContext() { for (auto& sv : superversions_to_free_) { @@ -205,8 +206,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) max_total_in_memory_state_(0), is_snapshot_supported_(true), write_buffer_(options.db_write_buffer_size), - tmp_batch_(), - bg_schedule_needed_(false), + unscheduled_flushes_(0), + unscheduled_compactions_(0), bg_compaction_scheduled_(0), bg_manual_only_(0), bg_flush_scheduled_(0), @@ -272,6 +273,19 @@ DBImpl::~DBImpl() { listeners_.clear(); flush_scheduler_.Clear(); + while (!flush_queue_.empty()) { + auto cfd = PopFirstFromFlushQueue(); + if (cfd->Unref()) { + delete cfd; + } + } + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + if (cfd->Unref()) { + delete cfd; + } + } + if (default_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); @@ -1643,10 +1657,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, // SetNewMemtableAndNewLogFile() will release and reacquire mutex // during execution s = SetNewMemtableAndNewLogFile(cfd, &context); + write_thread_.ExitWriteThread(&w, &w, s); + cfd->imm()->FlushRequested(); - MaybeScheduleFlushOrCompaction(); - write_thread_.ExitWriteThread(&w, &w, s); + // schedule flush + SchedulePendingFlush(cfd); + MaybeScheduleFlushOrCompaction(); } if (s.ok() && flush_options.wait) { @@ -1671,52 +1688,90 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); - bg_schedule_needed_ = false; if (bg_work_gate_closed_) { - // gate closed for backgrond work + // gate closed for background work + return; } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions - } else { - bool is_flush_pending = false; - // no need to refcount since we're under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->imm()->IsFlushPending()) { - is_flush_pending = true; - } - } - if (is_flush_pending) { - // memtable flush needed - if (bg_flush_scheduled_ < db_options_.max_background_flushes) { - bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (db_options_.max_background_flushes > 0) { - bg_schedule_needed_ = true; - } - } - bool is_compaction_needed = false; - // no need to refcount since we're under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->compaction_picker()->NeedsCompaction( - cfd->current()->storage_info())) { - is_compaction_needed = true; - break; - } - } + return; + } else if (bg_manual_only_) { + // manual only + return; + } - // Schedule BGWorkCompaction if there's a compaction pending (or a memtable - // flush, but the HIGH pool is not enabled) - // Do it only if max_background_compactions 
hasn't been reached and - // bg_manual_only_ == 0 - if (!bg_manual_only_ && - (is_compaction_needed || - (is_flush_pending && db_options_.max_background_flushes == 0))) { - if (bg_compaction_scheduled_ < db_options_.max_background_compactions) { - bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); - } else { - bg_schedule_needed_ = true; - } + while (unscheduled_flushes_ > 0 && + bg_flush_scheduled_ < db_options_.max_background_flushes) { + unscheduled_flushes_--; + bg_flush_scheduled_++; + env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); + } + + if (db_options_.max_background_flushes == 0 && + bg_compaction_scheduled_ < db_options_.max_background_compactions && + unscheduled_flushes_ > 0) { + // special case where flush is executed by compaction thread + // (if max_background_flushes == 0). + // Compaction thread will execute all the flushes + unscheduled_flushes_ = 0; + if (unscheduled_compactions_ > 0) { + // bg compaction will execute one compaction + unscheduled_compactions_--; } + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); + } + + while (bg_compaction_scheduled_ < db_options_.max_background_compactions && + unscheduled_compactions_ > 0) { + bg_compaction_scheduled_++; + unscheduled_compactions_--; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); + } +} + +void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { + assert(!cfd->pending_compaction()); + cfd->Ref(); + compaction_queue_.push_back(cfd); + cfd->set_pending_compaction(true); +} + +ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { + assert(!compaction_queue_.empty()); + auto cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(cfd->pending_compaction()); + cfd->set_pending_compaction(false); + return cfd; +} + +void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { + assert(!cfd->pending_flush()); + cfd->Ref(); + flush_queue_.push_back(cfd); + cfd->set_pending_flush(true); +} + +ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { + assert(!flush_queue_.empty()); + auto cfd = *flush_queue_.begin(); + flush_queue_.pop_front(); + assert(cfd->pending_flush()); + cfd->set_pending_flush(false); + return cfd; +} + +void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) { + if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) { + AddToFlushQueue(cfd); + ++unscheduled_flushes_; + } +} + +void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + if (!cfd->pending_compaction() && cfd->NeedsCompaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; } } @@ -1743,33 +1798,41 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context, return bg_error_; } - // call_status is failure if at least one flush was a failure. even if - // flushing one column family reports a failure, we will continue flushing - // other column families. however, call_status will be a failure in that case. 
- Status call_status; - // refcounting in iteration - for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->Ref(); - Status flush_status; - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); - while (flush_status.ok() && cfd->imm()->IsFlushPending()) { - LogToBuffer( - log_buffer, - "BackgroundCallFlush doing FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d", - cfd->GetName().c_str(), - db_options_.max_background_flushes - bg_flush_scheduled_); - flush_status = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, job_context, log_buffer); + ColumnFamilyData* cfd = nullptr; + while (!flush_queue_.empty()) { + // This cfd is already referenced + cfd = PopFirstFromFlushQueue(); + + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + if (cfd->Unref()) { + delete cfd; + } + continue; } - if (call_status.ok() && !flush_status.ok()) { - call_status = flush_status; + + // found a flush! + break; + } + + Status status; + if (cfd != nullptr) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + LogToBuffer( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available %d", + cfd->GetName().c_str(), + db_options_.max_background_flushes - bg_flush_scheduled_, + db_options_.max_background_compactions - bg_compaction_scheduled_); + status = FlushMemTableToOutputFile(cfd, mutable_cf_options, madeProgress, + job_context, log_buffer); + if (cfd->Unref()) { + delete cfd; } - cfd->Unref(); } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - return call_status; + return status; } void DBImpl::BackgroundCallFlush() { @@ -1829,13 +1892,8 @@ void DBImpl::BackgroundCallFlush() { } bg_flush_scheduled_--; - // Any time the mutex is released After finding the work to do, another - // thread might execute MaybeScheduleFlushOrCompaction(). It is possible - // that there is a pending job but it is not scheduled because of the - // max thread limit. - if (madeProgress || bg_schedule_needed_) { - MaybeScheduleFlushOrCompaction(); - } + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); RecordFlushIOStats(); bg_cv_.SignalAll(); // IMPORTANT: there should be no code after calling SignalAll. This call may @@ -1909,17 +1967,8 @@ void DBImpl::BackgroundCallCompaction() { versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // Previous compaction may have produced too many files in a level, - // So reschedule another compaction if we made progress in the - // last compaction. - // - // Also, any time the mutex is released After finding the work to do, - // another thread might execute MaybeScheduleFlushOrCompaction(). It is - // possible that there is a pending job but it is not scheduled because of - // the max thread limit. 
- if (madeProgress || bg_schedule_needed_) { - MaybeScheduleFlushOrCompaction(); - } + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { // signal if // * madeProgress -- need to wakeup DelayWrite @@ -1964,35 +2013,28 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, } // FLUSH preempts compaction - Status flush_stat; - for (auto cfd : *versions_->GetColumnFamilySet()) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); - while (cfd->imm()->IsFlushPending()) { - LogToBuffer( - log_buffer, - "BackgroundCompaction doing FlushMemTableToOutputFile, " - "compaction slots available %d", - db_options_.max_background_compactions - bg_compaction_scheduled_); - cfd->Ref(); - flush_stat = FlushMemTableToOutputFile( - cfd, mutable_cf_options, madeProgress, job_context, log_buffer); - cfd->Unref(); - if (!flush_stat.ok()) { - if (is_manual) { - manual_compaction_->status = flush_stat; - manual_compaction_->done = true; - manual_compaction_->in_progress = false; - manual_compaction_ = nullptr; - } - return flush_stat; + // TODO(icanadi) we should only do this if max_background_flushes == 0 + // BackgroundFlush() will only execute a single flush. We keep calling it as + // long as there's more flushes to be done + while (!flush_queue_.empty()) { + LogToBuffer( + log_buffer, + "BackgroundCompaction calling BackgroundFlush. flush slots available " + "%d, compaction slots available %d", + db_options_.max_background_flushes - bg_flush_scheduled_, + db_options_.max_background_compactions - bg_compaction_scheduled_); + auto flush_status = BackgroundFlush(madeProgress, job_context, log_buffer); + if (!flush_status.ok()) { + if (is_manual) { + manual_compaction_->status = flush_status; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; } + return flush_status; } } - // Compaction makes a copy of the latest MutableCFOptions. It should be used - // throughout the compaction procedure to make sure consistency. It will - // eventually be installed into SuperVersion unique_ptr c; InternalKey manual_end_storage; InternalKey* manual_end = &manual_end_storage; @@ -2014,22 +2056,53 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, ((m->done || manual_end == nullptr) ? "(end)" : manual_end->DebugString().c_str())); - } else { - // no need to refcount in iteration since it's always under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - // Pick up latest mutable CF Options and use it throughout the - // compaction job - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - if (!mutable_cf_options->disable_auto_compactions) { - // NOTE: try to avoid unnecessary copy of MutableCFOptions if - // compaction is not necessary. Need to make sure mutex is held - // until we make a copy in the following code - c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); - if (c != nullptr) { - // update statistics - MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); - break; + } else if (!compaction_queue_.empty()) { + // cfd is referenced here + auto cfd = PopFirstFromCompactionQueue(); + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). 
+ // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->Unref()) { + delete cfd; + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. Need to make sure mutex is held + // until we make a copy in the following code + c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + if (c != nullptr) { + // update statistics + MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersion() + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersion(), because this is when the new options take + // effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add it + // to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); } } } @@ -2085,8 +2158,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, c->ReleaseCompactionFiles(status); *madeProgress = true; } else { - MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. - auto yield_callback = [&]() { return CallFlushDuringCompaction(c->column_family_data(), *c->mutable_cf_options(), job_context, @@ -2275,7 +2346,7 @@ void DBImpl::InstallSuperVersionBackground( SuperVersion* DBImpl::InstallSuperVersion( ColumnFamilyData* cfd, SuperVersion* new_sv, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, bool dont_schedule_bg_work) { mutex_.AssertHeld(); // Update max_total_in_memory_state_ @@ -2289,10 +2360,15 @@ SuperVersion* DBImpl::InstallSuperVersion( auto* old = cfd->InstallSuperVersion( new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options); - // We want to schedule potential flush or compactions since new options may - // have been picked up in this new version. New options may cause flush - // compaction trigger condition to change. - MaybeScheduleFlushOrCompaction(); + // Whenever we install new SuperVersion, we might need to issue new flushes or + // compactions. dont_schedule_bg_work is true when scheduling from write + // thread and we don't want to add additional overhead. 
Callers promise to + // call SchedulePendingFlush() and MaybeScheduleFlushOrCompaction() eventually + if (!dont_schedule_bg_work) { + SchedulePendingFlush(cfd); + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + } // Update max_total_in_memory_state_ max_total_in_memory_state_ = @@ -2848,9 +2924,10 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { break; } cfd->imm()->FlushRequested(); + SchedulePendingFlush(cfd); + context.schedule_bg_work_ = true; } } - MaybeScheduleFlushOrCompaction(); } else if (UNLIKELY(write_buffer_.ShouldFlush())) { Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Flushing all column families. Write buffer is using %" PRIu64 @@ -2865,6 +2942,8 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { break; } cfd->imm()->FlushRequested(); + SchedulePendingFlush(cfd); + context.schedule_bg_work_ = true; } } MaybeScheduleFlushOrCompaction(); @@ -2986,6 +3065,10 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } write_thread_.ExitWriteThread(&w, last_writer, status); + + if (context.schedule_bg_work_) { + MaybeScheduleFlushOrCompaction(); + } mutex_.Unlock(); if (status.IsTimedOut()) { @@ -3023,11 +3106,11 @@ Status DBImpl::DelayWrite(uint64_t expiration_time) { } Status DBImpl::ScheduleFlushes(WriteContext* context) { - bool schedule_bg_work = false; ColumnFamilyData* cfd; while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { - schedule_bg_work = true; auto status = SetNewMemtableAndNewLogFile(cfd, context); + SchedulePendingFlush(cfd); + context->schedule_bg_work_ = true; if (cfd->Unref()) { delete cfd; } @@ -3035,9 +3118,6 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { return status; } } - if (schedule_bg_work) { - MaybeScheduleFlushOrCompaction(); - } return Status::OK(); } @@ -3113,7 +3193,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, new_mem->Ref(); cfd->SetMemtable(new_mem); context->superversions_to_free_.push_back( - InstallSuperVersion(cfd, new_superversion, mutable_cf_options)); + InstallSuperVersion(cfd, new_superversion, mutable_cf_options, true)); return s; } @@ -3380,12 +3460,6 @@ Status DBImpl::DeleteFile(std::string name) { PurgeObsoleteFiles(job_context); } job_context.Clean(); - { - MutexLock l(&mutex_); - // schedule flush if file deletion means we freed the space for flushes to - // continue - MaybeScheduleFlushOrCompaction(); - } return status; } @@ -3620,7 +3694,6 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); impl->DeleteObsoleteFiles(); - impl->MaybeScheduleFlushOrCompaction(); s = impl->db_directory_->Fsync(); } } diff --git a/db/db_impl.h b/db/db_impl.h index 5e27df2c6..7a3a7984d 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -362,6 +362,8 @@ class DBImpl : public DB { ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); void MaybeScheduleFlushOrCompaction(); + void SchedulePendingFlush(ColumnFamilyData* cfd); + void SchedulePendingCompaction(ColumnFamilyData* cfd); static void BGWorkCompaction(void* db); static void BGWorkFlush(void* db); void BackgroundCallCompaction(); @@ -393,6 +395,12 @@ class DBImpl : public DB { // hold the data set. 
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); + // helper functions for adding and removing from flush & compaction queues + void AddToCompactionQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromCompactionQueue(); + void AddToFlushQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromFlushQueue(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -460,9 +468,32 @@ class DBImpl : public DB { // State is protected with db mutex. std::list pending_outputs_; - // At least one compaction or flush job is pending but not yet scheduled - // because of the max background thread limit. - bool bg_schedule_needed_; + // flush_queue_ and compaction_queue_ hold column families that we need to + // flush and compact, respectively. + // A column family is inserted into flush_queue_ when it satisfies condition + // cfd->imm()->IsFlushPending() + // A column family is inserted into compaction_queue_ when it satisfied + // condition cfd->NeedsCompaction() + // Column families in this list are all Ref()-erenced + // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will + // do RAII on ColumnFamilyData + // Column families are in this queue when they need to be flushed or + // compacted. Consumers of these queues are flush and compaction threads. When + // column family is put on this queue, we increase unscheduled_flushes_ and + // unscheduled_compactions_. When these variables are bigger than zero, that + // means we need to schedule background threads for compaction and thread. + // Once the background threads are scheduled, we decrease unscheduled_flushes_ + // and unscheduled_compactions_. That way we keep track of number of + // compaction and flush threads we need to schedule. This scheduling is done + // in MaybeScheduleFlushOrCompaction() + // invariant(column family present in flush_queue_ <==> + // ColumnFamilyData::pending_flush_ == true) + std::deque flush_queue_; + // invariant(column family present in compaction_queue_ <==> + // ColumnFamilyData::pending_compaction_ == true) + std::deque compaction_queue_; + int unscheduled_flushes_; + int unscheduled_compactions_; // count how many background compactions are running or have been scheduled int bg_compaction_scheduled_; @@ -553,9 +584,17 @@ class DBImpl : public DB { ColumnFamilyData* cfd, JobContext* job_context, const MutableCFOptions& mutable_cf_options); - SuperVersion* InstallSuperVersion( - ColumnFamilyData* cfd, SuperVersion* new_sv, - const MutableCFOptions& mutable_cf_options); + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + // If dont_schedule_bg_work == true, then caller asks us to not schedule flush + // or compaction here, but it also promises to schedule needed background + // work. We use this to scheduling background compactions when we are in the + // write thread, which is very performance critical. Caller schedules + // background work as soon as it exits the write thread + SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv, + const MutableCFOptions& mutable_cf_options, + bool dont_schedule_bg_work = false); // Find Super version and reference it. Based on options, it might return // the thread local cached one. 
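As a side note on the queue invariants documented above (a column family is present in flush_queue_ if and only if its pending_flush_ flag is set, and queued entries are Ref()-erenced), the following stand-alone sketch uses made-up simplified types, no DB mutex, to show how the add/pop helpers keep the flag and the reference count in step; it is not the real ColumnFamilyData:

  #include <cassert>
  #include <deque>

  struct CFD {
    int refs = 0;
    bool pending_flush = false;
    void Ref() { ++refs; }
    bool Unref() { return --refs == 0; }   // true => caller must delete
  };

  std::deque<CFD*> flush_queue_;

  void AddToFlushQueue(CFD* cfd) {
    assert(!cfd->pending_flush);   // a CF is never queued twice
    cfd->Ref();                    // the queue owns one reference
    flush_queue_.push_back(cfd);
    cfd->pending_flush = true;
  }

  CFD* PopFirstFromFlushQueue() {
    assert(!flush_queue_.empty());
    CFD* cfd = flush_queue_.front();
    flush_queue_.pop_front();
    assert(cfd->pending_flush);
    cfd->pending_flush = false;    // invariant: queued <==> pending_flush
    return cfd;                    // still Ref()-ed; the caller must Unref()
  }

  int main() {
    CFD cfd;
    cfd.Ref();                                // reference held by the "DB"
    AddToFlushQueue(&cfd);
    CFD* picked = PopFirstFromFlushQueue();
    picked->Unref();                          // drop the queue's reference
    assert(!cfd.pending_flush && cfd.refs == 1);
    return 0;
  }
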
From ade4034a9d4d42caed8d9e635a905fdf1152b361 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sat, 20 Dec 2014 12:46:37 +0100 Subject: [PATCH 637/829] MultiGet for DBWithTTL Summary: This is a feature request from rocksdb's user. I didn't even realize we don't support multigets on TTL DB :) Test Plan: added a unit test Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30561 --- utilities/ttl/db_ttl_impl.cc | 14 ++++++++++++-- utilities/ttl/ttl_test.cc | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 4d2d8406e..622e668b1 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -202,8 +202,18 @@ std::vector DBWithTTLImpl::MultiGet( const ReadOptions& options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - return std::vector( - keys.size(), Status::NotSupported("MultiGet not supported with TTL")); + auto statuses = db_->MultiGet(options, column_family, keys, values); + for (size_t i = 0; i < keys.size(); ++i) { + if (!statuses[i].ok()) { + continue; + } + statuses[i] = SanityCheckTimestamp((*values)[i]); + if (!statuses[i].ok()) { + continue; + } + statuses[i] = StripTS(&(*values)[i]); + } + return statuses; } bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options, diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 66c8db50c..73756a704 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -194,6 +194,25 @@ class TtlTest { } } + // checks the whole kvmap_ to return correct values using MultiGet + void SimpleMultiGetTest() { + static ReadOptions ropts; + std::vector keys; + std::vector values; + + for (auto& kv : kvmap_) { + keys.emplace_back(kv.first); + } + + auto statuses = db_ttl_->MultiGet(ropts, keys, &values); + size_t i = 0; + for (auto& kv : kvmap_) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], kv.second); + ++i; + } + } + // Sleeps for slp_tim then runs a manual compaction // Checks span starting from st_pos from kvmap_ in the db and // Gets should return true if check is true and false otherwise @@ -533,6 +552,17 @@ TEST(TtlTest, KeyMayExist) { CloseTtl(); } +TEST(TtlTest, MultiGetTest) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleMultiGetTest(); + + CloseTtl(); +} + TEST(TtlTest, ColumnFamiliesTest) { DB* db; Options options; From f8999fcf31be758d99fac3f64bc4ca0717b7f576 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Sun, 21 Dec 2014 00:23:28 -0800 Subject: [PATCH 638/829] Fix a SIGSEGV in BackgroundFlush Summary: This one wasn't easy to find :) What happens is we go through all cfds on flush_queue_ and find no cfds to flush, *but* the cfd is set to the last CF we looped through and following code assumes we want it flushed. BTW @sdong do you think we should also make BackgroundFlush() only check a single cfd for flushing instead of doing this `while (!flush_queue_.empty())`? 
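In other words, using stripped-down hypothetical loops rather than the real code: the buggy shape assigns the loop variable before checking whether the entry qualifies, so when the queue drains without finding work the pointer still aims at the last rejected entry, while the fixed shape (see the diff below) only publishes the pointer once a flushable entry is found.

  #include <deque>

  struct CFD { bool flushable = false; };

  // Buggy shape: may return a CF that was actually rejected.
  CFD* PickBuggy(std::deque<CFD*>& q) {
    CFD* cfd = nullptr;
    while (!q.empty()) {
      cfd = q.front();                // assigned before the check
      q.pop_front();
      if (!cfd->flushable) continue;  // rejected, but cfd still points at it
      break;
    }
    return cfd;                       // non-null even if nothing qualified
  }

  // Fixed shape: publish the pointer only once the entry qualifies.
  CFD* PickFixed(std::deque<CFD*>& q) {
    CFD* cfd = nullptr;
    while (!q.empty()) {
      CFD* first = q.front();
      q.pop_front();
      if (!first->flushable) continue;
      cfd = first;                    // found a flush
      break;
    }
    return cfd;
  }

  int main() {
    CFD rejected;                     // flushable == false
    std::deque<CFD*> q1{&rejected}, q2{&rejected};
    CFD* buggy = PickBuggy(q1);       // wrongly non-null
    CFD* fixed = PickFixed(q2);       // correctly nullptr
    return (buggy != nullptr && fixed == nullptr) ? 0 : 1;
  }
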
Test Plan: regression test no longer fails Reviewers: sdong, rven, yhchiang Reviewed By: yhchiang Subscribers: sdong, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30591 --- db/db_impl.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index cb5dcc59c..aff68ed45 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1801,17 +1801,18 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context, ColumnFamilyData* cfd = nullptr; while (!flush_queue_.empty()) { // This cfd is already referenced - cfd = PopFirstFromFlushQueue(); + auto first_cfd = PopFirstFromFlushQueue(); - if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { // can't flush this CF, try next one - if (cfd->Unref()) { - delete cfd; + if (first_cfd->Unref()) { + delete first_cfd; } continue; } // found a flush! + cfd = first_cfd; break; } From 949bd71fd0f1dac2cec92e21815b5163e897624e Mon Sep 17 00:00:00 2001 From: alabid Date: Mon, 22 Dec 2014 00:36:16 -0500 Subject: [PATCH 639/829] fix really trivial typo --- examples/column_families_example.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc index 2bdf6ec42..3ffac064d 100644 --- a/examples/column_families_example.cc +++ b/examples/column_families_example.cc @@ -33,7 +33,7 @@ int main() { // open DB with two column families std::vector column_families; - // have to open default column familiy + // have to open default column family column_families.push_back(ColumnFamilyDescriptor( kDefaultColumnFamilyName, ColumnFamilyOptions())); // open the new one, too From 0acc7388101c7f0c043d1dc961238d1d44ee9971 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 22 Dec 2014 12:04:45 +0100 Subject: [PATCH 640/829] Speed up FindObsoleteFiles() Summary: There are two versions of FindObsoleteFiles(): * full scan, which is executed every 6 hours (and it's terribly slow) * no full scan, which is executed every time a background process finishes and iterator is deleted This diff is optimizing the second case (no full scan). Here's what we do before the diff: * Get the list of obsolete files (files with ref==0). Some files in obsolete_files set might actually be live. * Get the list of live files to avoid deleting files that are live. * Delete files that are in obsolete_files and not in live_files. After this diff: * The only files with ref==0 that are still live are files that have been part of move compaction. Don't include moved files in obsolete_files. * Get the list of obsolete files (which exclude moved files). * No need to get the list of live files, since all files in obsolete_files need to be deleted. I'll post the benchmark results, but you can get the feel of it here: https://reviews.facebook.net/D30123 This depends on D30123. P.S. We should do full scan only in failure scenarios, not every 6 hours. I'll do this in a follow-up diff. Test Plan: One new unit test. Made sure that unit test fails if we don't have a `if (!f->moved)` safeguard in ~Version. 
make check Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30249 --- HISTORY.md | 1 + db/db_bench.cc | 7 ++----- db/db_impl.cc | 48 ++++++++++++++++++++++++------------------- db/db_impl.h | 4 ++-- db/db_test.cc | 32 ++++++++++++++++++++++++++++- db/job_context.h | 8 +++++--- db/version_builder.cc | 1 + db/version_edit.h | 7 ++++++- db/version_set.cc | 8 +++++++- 9 files changed, 82 insertions(+), 34 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index dede7580a..e24cad8e8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ specifying them in db_paths along with the target_size. Lower numbered levels will be placed earlier in the db_paths and higher numbered levels will be placed later in the db_paths vector. +* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) ### 3.9.0 (12/8/2014) diff --git a/db/db_bench.cc b/db/db_bench.cc index 34531cc3e..8562d04aa 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -367,9 +367,8 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletepercent), so deletepercent must be smaller than (100 - " "FLAGS_readwritepercent)"); -DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete " - "obsolete files periodically. 0 means that obsolete files are" - " deleted after every compaction run."); +DEFINE_uint64(delete_obsolete_files_period_micros, 0, + "Ignored. Left here for backward compatibility"); namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { @@ -2008,8 +2007,6 @@ class Benchmark { options.compression_per_level[i] = FLAGS_compression_type_e; } } - options.delete_obsolete_files_period_micros = - FLAGS_delete_obsolete_files_period_micros; options.soft_rate_limit = FLAGS_soft_rate_limit; options.hard_rate_limit = FLAGS_hard_rate_limit; options.rate_limit_delay_max_milliseconds = diff --git a/db/db_impl.cc b/db/db_impl.cc index aff68ed45..0c3bd778d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -213,7 +213,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) bg_flush_scheduled_(0), manual_compaction_(nullptr), disable_delete_obsolete_files_(0), - delete_obsolete_files_last_run_(options.env->NowMicros()), + delete_obsolete_files_next_run_( + options.env->NowMicros() + + db_options_.delete_obsolete_files_period_micros), last_stats_dump_time_microsec_(0), flush_on_destroy_(false), env_options_(options), @@ -421,14 +423,17 @@ void DBImpl::MaybeDumpStats() { } } -// Returns the list of live files in 'sst_live' and the list -// of all files in the filesystem in 'candidate_files'. +// If it's doing full scan: +// * Returns the list of live files in 'full_scan_sst_live' and the list +// of all files in the filesystem in 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. 
// no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every // db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan) { + // TODO(icanadi) clean up FindObsoleteFiles, no need to do full scans anymore mutex_.AssertHeld(); // if deletion is disabled, do nothing @@ -445,10 +450,10 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); - if (delete_obsolete_files_last_run_ + - db_options_.delete_obsolete_files_period_micros < now_micros) { + if (delete_obsolete_files_next_run_ < now_micros) { doing_the_full_scan = true; - delete_obsolete_files_last_run_ = now_micros; + delete_obsolete_files_next_run_ = + now_micros + db_options_.delete_obsolete_files_period_micros; } } @@ -462,13 +467,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->log_number = versions_->MinLogNumber(); job_context->prev_log_number = versions_->prev_log_number(); - if (!doing_the_full_scan && !job_context->HaveSomethingToDelete()) { - // avoid filling up sst_live if we're sure that we - // are not going to do the full scan and that we don't have - // anything to delete at the moment - return; - } - // don't delete live files if (pending_outputs_.size()) { job_context->min_pending_output = *pending_outputs_.begin(); @@ -476,11 +474,16 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // delete all of them job_context->min_pending_output = std::numeric_limits::max(); } - versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; - path_id < db_options_.db_paths.size(); path_id++) { + // Here we find all files in the DB directory and all the live files. In the + // DeleteObsoleteFiles(), we will calculate a set difference (all_files - + // live_files) and delete all files in that difference. If we're not doing + // the full scan we don't need to get live files, because all files returned + // by GetObsoleteFiles() will be dead (and need to be deleted) + versions_->AddLiveFiles(&job_context->full_scan_sst_live); + for (uint32_t path_id = 0; path_id < db_options_.db_paths.size(); + path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. 
std::vector files; @@ -488,7 +491,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, &files); // Ignore errors for (std::string file : files) { // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes - job_context->candidate_files.emplace_back("/" + file, path_id); + job_context->full_scan_candidate_files.emplace_back("/" + file, + path_id); } } @@ -497,7 +501,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, std::vector log_files; env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { - job_context->candidate_files.emplace_back(log_file, 0); + job_context->full_scan_candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir @@ -506,7 +510,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Ignore errors env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { - job_context->candidate_files.emplace_back(log_file, 0); + job_context->full_scan_candidate_files.emplace_back(log_file, 0); } } } @@ -543,11 +547,11 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // Now, convert live list to an unordered map, WITHOUT mutex held; // set is slow. std::unordered_map sst_live_map; - for (const FileDescriptor& fd : state.sst_live) { + for (const FileDescriptor& fd : state.full_scan_sst_live) { sst_live_map[fd.GetNumber()] = &fd; } - auto candidate_files = state.candidate_files; + auto candidate_files = state.full_scan_candidate_files; candidate_files.reserve(candidate_files.size() + state.sst_delete_files.size() + state.log_delete_files.size()); @@ -1491,6 +1495,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { + f->moved = true; edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, @@ -2137,6 +2142,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); + f->moved = true; c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, diff --git a/db/db_impl.h b/db/db_impl.h index 7a3a7984d..de834a0fa 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -532,8 +532,8 @@ class DBImpl : public DB { // without any synchronization int disable_delete_obsolete_files_; - // last time when DeleteObsoleteFiles was invoked - uint64_t delete_obsolete_files_last_run_; + // next time when we should run DeleteObsoleteFiles with full scan + uint64_t delete_obsolete_files_next_run_; // last time stats were dumped to LOG std::atomic last_stats_dump_time_microsec_; diff --git a/db/db_test.cc b/db/db_test.cc index 6c995e7a0..7feb98808 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -85,7 +85,7 @@ static bool LZ4HCCompressionSupported(const CompressionOptions &options) { return port::LZ4HC_Compress(options, in.data(), in.size(), &out); } -static std::string RandomString(Random *rnd, int len) { +static std::string RandomString(Random* rnd, int len) { std::string r; test::RandomString(rnd, len, &r); return r; @@ -9993,6 +9993,36 @@ TEST(DBTest, DontDeletePendingOutputs) { Compact("a", "b"); } +TEST(DBTest, 
DontDeleteMovedFile) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // If the moved file is actually deleted (the move-safeguard in + // ~Version::Version() is not there), we get this failure: + // Corruption: Can't access /000009.sst + Reopen(options); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/job_context.h b/db/job_context.h index 9b14d5995..01c868c03 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -20,7 +20,7 @@ class MemTable; struct JobContext { inline bool HaveSomethingToDelete() const { - return candidate_files.size() || sst_delete_files.size() || + return full_scan_candidate_files.size() || sst_delete_files.size() || log_delete_files.size() || new_superversion != nullptr || superversions_to_free.size() > 0 || memtables_to_free.size() > 0; } @@ -39,10 +39,12 @@ struct JobContext { // a list of all files that we'll consider deleting // (every once in a while this is filled up with all files // in the DB directory) - std::vector candidate_files; + // (filled only if we're doing full scan) + std::vector full_scan_candidate_files; // the list of all live sst files that cannot be deleted - std::vector sst_live; + // (filled only if we're doing full scan) + std::vector full_scan_sst_live; // a list of sst files that we need to delete std::vector sst_delete_files; diff --git a/db/version_builder.cc b/db/version_builder.cc index ec7bb176a..e282e670c 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -206,6 +206,7 @@ class VersionBuilder::Rep { const int level = new_file.first; FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; + f->moved = false; assert(levels_[level].added_files.find(f->fd.GetNumber()) == levels_[level].added_files.end()); diff --git a/db/version_edit.h b/db/version_edit.h index 86e315c11..35b894954 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -85,6 +85,10 @@ struct FileMetaData { bool init_stats_from_file; // true if the data-entry stats of this file // has initialized from file. + // Always false for new files. Set to true if the file was part of move + // compaction. 
Can only be mutated from the compaction process, under DB mutex + bool moved; + FileMetaData() : refs(0), being_compacted(false), @@ -94,7 +98,8 @@ struct FileMetaData { num_deletions(0), raw_key_size(0), raw_value_size(0), - init_stats_from_file(false) {} + init_stats_from_file(false), + moved(false) {} }; // A compressed copy of file meta data that just contain diff --git a/db/version_set.cc b/db/version_set.cc index f138c8232..0dbac7667 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -309,7 +309,13 @@ Version::~Version() { cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } - vset_->obsolete_files_.push_back(f); + if (!f->moved) { + vset_->obsolete_files_.push_back(f); + } else { + // moved! + // TODO(icanadi) delete this outside of mutex + delete f; + } } } } From 4fd26f287ca1b1453a5af9dc178cc7cba93594f6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 22 Dec 2014 12:05:14 +0100 Subject: [PATCH 641/829] Only execute flush from compaction if max_background_flushes = 0 Summary: As title. We shouldn't need to execute flush from compaction if there are dedicated threads doing flushes. Test Plan: make check Reviewers: rven, yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30579 --- db/db_impl.cc | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 0c3bd778d..96778e44a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2018,26 +2018,29 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, return Status::OK(); } - // FLUSH preempts compaction - // TODO(icanadi) we should only do this if max_background_flushes == 0 - // BackgroundFlush() will only execute a single flush. We keep calling it as - // long as there's more flushes to be done - while (!flush_queue_.empty()) { - LogToBuffer( - log_buffer, - "BackgroundCompaction calling BackgroundFlush. flush slots available " - "%d, compaction slots available %d", - db_options_.max_background_flushes - bg_flush_scheduled_, - db_options_.max_background_compactions - bg_compaction_scheduled_); - auto flush_status = BackgroundFlush(madeProgress, job_context, log_buffer); - if (!flush_status.ok()) { - if (is_manual) { - manual_compaction_->status = flush_status; - manual_compaction_->done = true; - manual_compaction_->in_progress = false; - manual_compaction_ = nullptr; + // If there are no flush threads, then compaction thread needs to execute the + // flushes + if (db_options_.max_background_flushes == 0) { + // BackgroundFlush() will only execute a single flush. We keep calling it as + // long as there's more flushes to be done + while (!flush_queue_.empty()) { + LogToBuffer( + log_buffer, + "BackgroundCompaction calling BackgroundFlush. 
flush slots available " + "%d, compaction slots available %d", + db_options_.max_background_flushes - bg_flush_scheduled_, + db_options_.max_background_compactions - bg_compaction_scheduled_); + auto flush_status = + BackgroundFlush(madeProgress, job_context, log_buffer); + if (!flush_status.ok()) { + if (is_manual) { + manual_compaction_->status = flush_status; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return flush_status; } - return flush_status; } } From 45bab305f98d2233b66546f6de78d7a1dad7bc44 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 22 Dec 2014 12:20:17 -0800 Subject: [PATCH 642/829] Move GetThreadList() feature under Env. Summary: GetThreadList() feature depends on the thread creation and destruction, which is currently handled under Env. This patch moves GetThreadList() feature under Env to better manage the dependency of GetThreadList() feature on thread creation and destruction. Renamed ThreadStatusImpl to ThreadStatusUpdater. Add ThreadStatusUtil, which is a static class contains utility functions for ThreadStatusUpdater. Test Plan: run db_test, thread_list_test and db_bench and verify the life cycle of Env and ThreadStatusUpdater is properly managed. Reviewers: igor, sdong Reviewed By: sdong Subscribers: ljin, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30057 --- db/db_impl.cc | 14 ++- db/db_impl_readonly.cc | 1 - db/db_test.cc | 18 ++-- include/rocksdb/db.h | 6 -- include/rocksdb/env.h | 36 ++++++- util/env_posix.cc | 45 ++++++-- util/thread_list_test.cc | 14 +-- ...tatus_impl.cc => thread_status_updater.cc} | 57 ++++------ ..._status_impl.h => thread_status_updater.h} | 45 ++++---- ...ebug.cc => thread_status_updater_debug.cc} | 4 +- util/thread_status_util.cc | 102 ++++++++++++++++++ util/thread_status_util.h | 93 ++++++++++++++++ utilities/compacted_db/compacted_db_impl.cc | 1 - 13 files changed, 338 insertions(+), 98 deletions(-) rename util/{thread_status_impl.cc => thread_status_updater.cc} (74%) rename util/{thread_status_impl.h => thread_status_updater.h} (84%) rename util/{thread_status_impl_debug.cc => thread_status_updater_debug.cc} (91%) create mode 100644 util/thread_status_util.cc create mode 100644 util/thread_status_util.h diff --git a/db/db_impl.cc b/db/db_impl.cc index 96778e44a..2bafc8f81 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -78,7 +78,8 @@ #include "util/stop_watch.h" #include "util/sync_point.h" #include "util/string_util.h" -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" namespace rocksdb { @@ -3844,30 +3845,27 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } #if ROCKSDB_USING_THREAD_STATUS + void DBImpl::NewThreadStatusCfInfo( ColumnFamilyData* cfd) const { if (db_options_.enable_thread_tracking) { - ThreadStatusImpl::NewColumnFamilyInfo( - this, GetName(), cfd, cfd->GetName()); + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd); } } void DBImpl::EraseThreadStatusCfInfo( ColumnFamilyData* cfd) const { if (db_options_.enable_thread_tracking) { - ThreadStatusImpl::EraseColumnFamilyInfo(cfd); + ThreadStatusUtil::EraseColumnFamilyInfo(cfd); } } void DBImpl::EraseThreadStatusDbInfo() const { if (db_options_.enable_thread_tracking) { - ThreadStatusImpl::EraseDatabaseInfo(this); + ThreadStatusUtil::EraseDatabaseInfo(this); } } -Status GetThreadList(std::vector* thread_list) { - return thread_local_status.GetThreadList(thread_list); -} #else 
void DBImpl::NewThreadStatusCfInfo( ColumnFamilyData* cfd) const { diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 8b0beb7e0..c1d61e377 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -10,7 +10,6 @@ #include "db/merge_context.h" #include "db/db_iter.h" #include "util/perf_context_imp.h" -#include "util/thread_status_impl.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 7feb98808..cb2458954 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -51,7 +51,7 @@ #include "util/testutil.h" #include "util/mock_env.h" #include "util/string_util.h" -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" namespace rocksdb { @@ -9418,7 +9418,7 @@ TEST(DBTest, GetThreadList) { TryReopen(options); std::vector thread_list; - Status s = GetThreadList(&thread_list); + Status s = env_->GetThreadList(&thread_list); for (int i = 0; i < 2; ++i) { // repeat the test with differet number of high / low priority threads @@ -9431,7 +9431,7 @@ TEST(DBTest, GetThreadList) { env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); // Wait to ensure the all threads has been registered env_->SleepForMicroseconds(100000); - s = GetThreadList(&thread_list); + s = env_->GetThreadList(&thread_list); ASSERT_OK(s); unsigned int thread_type_counts[ThreadStatus::ThreadType::TOTAL]; memset(thread_type_counts, 0, sizeof(thread_type_counts)); @@ -9455,15 +9455,18 @@ TEST(DBTest, GetThreadList) { if (i == 0) { // repeat the test with multiple column families CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); } } db_->DropColumnFamily(handles_[2]); delete handles_[2]; handles_.erase(handles_.begin() + 2); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); Close(); - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, true); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); } TEST(DBTest, DisableThreadList) { @@ -9473,7 +9476,8 @@ TEST(DBTest, DisableThreadList) { TryReopen(options); CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); // Verify non of the column family info exists - ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap(handles_, false); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, false); } #endif // ROCKSDB_USING_THREAD_STATUS diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 72878ff57..a8cb694b4 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -548,12 +548,6 @@ Status DestroyDB(const std::string& name, const Options& options); Status RepairDB(const std::string& dbname, const Options& options); #endif -#if ROCKSDB_USING_THREAD_STATUS -// Obtain the status of all rocksdb-related threads. 
-Status GetThreadList(std::vector* thread_list); -#endif - - } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index aded546ca..8a96ef1e1 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -24,6 +24,7 @@ #include #include #include "rocksdb/status.h" +#include "rocksdb/thread_status.h" namespace rocksdb { @@ -37,6 +38,7 @@ class RandomRWFile; class Directory; struct DBOptions; class RateLimiter; +class ThreadStatusUpdater; using std::unique_ptr; using std::shared_ptr; @@ -83,7 +85,8 @@ struct EnvOptions { class Env { public: - Env() { } + Env() : thread_status_updater_(nullptr) {} + virtual ~Env(); // Return a default environment suitable for the current operating @@ -302,12 +305,34 @@ class Env { virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const; + // Returns the status of all threads that belong to the current Env. + virtual Status GetThreadList(std::vector* thread_list) { + return Status::NotSupported("Not supported."); + } + + // Returns the pointer to ThreadStatusUpdater. This function will be + // used in RocksDB internally to update thread status and supports + // GetThreadList(). + virtual ThreadStatusUpdater* GetThreadStatusUpdater() const { + return thread_status_updater_; + } + + protected: + // The pointer to an internal structure that will update the + // status of each thread. + ThreadStatusUpdater* thread_status_updater_; + private: // No copying allowed Env(const Env&); void operator=(const Env&); }; +// The factory function to construct a ThreadStatusUpdater. Any Env +// that supports GetThreadList() feature should call this function in its +// constructor to initialize thread_status_updater_. +ThreadStatusUpdater* CreateThreadStatusUpdater(); + // A file abstraction for reading sequentially through a file class SequentialFile { public: @@ -805,10 +830,19 @@ class EnvWrapper : public Env { void LowerThreadPoolIOPriority(Priority pool = LOW) override { target_->LowerThreadPoolIOPriority(pool); } + std::string TimeToString(uint64_t time) { return target_->TimeToString(time); } + Status GetThreadList(std::vector* thread_list) { + return target_->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_->GetThreadStatusUpdater(); + } + private: Env* target_; }; diff --git a/util/env_posix.cc b/util/env_posix.cc index da090ddf5..5bad58466 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -42,7 +42,8 @@ #include "util/random.h" #include "util/iostats_context_imp.h" #include "util/rate_limiter.h" -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" // Get nano time for mach systems #ifdef __MACH__ @@ -76,10 +77,6 @@ int rocksdb_kill_odds = 0; namespace rocksdb { -#if ROCKSDB_USING_THREAD_STATUS -extern ThreadStatusImpl thread_local_status; -#endif - namespace { // A wrapper for fadvise, if the platform doesn't support fadvise, @@ -92,6 +89,10 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) { #endif } +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + // list of pathnames that are locked static std::set lockedFiles; static port::Mutex mutex_lockedFiles; @@ -1076,10 +1077,16 @@ class PosixEnv : public Env { public: PosixEnv(); - virtual ~PosixEnv(){ + virtual ~PosixEnv() { for (const auto tid : threads_to_join_) { pthread_join(tid, nullptr); } + for (int pool_id = 0; pool_id < 
Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // All threads must be joined before the deletion of + // thread_status_updater_. + delete thread_status_updater_; } void SetFD_CLOEXEC(int fd, const EnvOptions* options) { @@ -1356,6 +1363,12 @@ class PosixEnv : public Env { return Status::OK(); } + virtual Status GetThreadList( + std::vector* thread_list) override { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); + } + static uint64_t gettid(pthread_t tid) { uint64_t thread_id = 0; memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); @@ -1534,12 +1547,17 @@ class PosixEnv : public Env { queue_(), queue_len_(0), exit_all_threads_(false), - low_io_priority_(false) { + low_io_priority_(false), + env_(nullptr) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr)); } ~ThreadPool() { + assert(bgthreads_.size() == 0U); + } + + void JoinAllThreads() { PthreadCall("lock", pthread_mutex_lock(&mu_)); assert(!exit_all_threads_); exit_all_threads_ = true; @@ -1548,6 +1566,11 @@ class PosixEnv : public Env { for (const auto tid : bgthreads_) { pthread_join(tid, nullptr); } + bgthreads_.clear(); + } + + void SetHostEnv(Env* env) { + env_ = env; } void LowerIOPriority() { @@ -1669,7 +1692,7 @@ class PosixEnv : public Env { ThreadPool* tp = meta->thread_pool_; #if ROCKSDB_USING_THREAD_STATUS // for thread-status - thread_local_status.SetThreadType( + ThreadStatusUtil::SetThreadType(tp->env_, (tp->GetThreadPriority() == Env::Priority::HIGH ? ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY : ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY)); @@ -1677,7 +1700,7 @@ class PosixEnv : public Env { delete meta; tp->BGThread(thread_id); #if ROCKSDB_USING_THREAD_STATUS - thread_local_status.UnregisterThread(); + ThreadStatusUtil::UnregisterThread(); #endif return nullptr; } @@ -1779,6 +1802,7 @@ class PosixEnv : public Env { bool exit_all_threads_; bool low_io_priority_; Env::Priority priority_; + Env* env_; }; std::vector thread_pools_; @@ -1796,7 +1820,10 @@ PosixEnv::PosixEnv() : checkedDiskForMmap_(false), for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. 
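A minimal sketch (not from the patch itself) of the new call pattern this commit introduces: thread status is now asked of the Env that owns the threads rather than a free rocksdb::GetThreadList(). Env::Default() below stands in for whichever Env the DB was opened with, and the field names used are the ones exercised in the db_test/thread_list_test hunks.

#include <vector>
#include "rocksdb/env.h"
#include "rocksdb/thread_status.h"

// Count the high-priority (flush) threads currently known to the default Env.
int CountHighPriThreads() {
  std::vector<rocksdb::ThreadStatus> thread_list;
  rocksdb::Status s = rocksdb::Env::Default()->GetThreadList(&thread_list);
  if (!s.ok()) {
    return 0;  // e.g. Status::NotSupported for an Env without an updater
  }
  int count = 0;
  for (const auto& ts : thread_list) {
    if (ts.thread_type ==
        rocksdb::ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY) {
      ++count;  // ts.cf_name names the column family being worked on, if any
    }
  }
  return count;
}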
+ thread_pools_[pool_id].SetHostEnv(this); } + thread_status_updater_ = CreateThreadStatusUpdater(); } void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) { diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index b5ff60cc7..12ad14719 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -6,7 +6,7 @@ #include #include -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" #include "util/testharness.h" #include "rocksdb/db.h" @@ -21,16 +21,16 @@ class SleepingBackgroundTask { : db_key_(db_key), db_name_(db_name), cf_key_(cf_key), cf_name_(cf_name), should_sleep_(true), sleeping_count_(0) { - ThreadStatusImpl::NewColumnFamilyInfo( + Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( db_key_, db_name_, cf_key_, cf_name_); } ~SleepingBackgroundTask() { - ThreadStatusImpl::EraseDatabaseInfo(db_key_); + Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); } void DoSleep() { - thread_local_status.SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); std::unique_lock l(mutex_); sleeping_count_++; while (should_sleep_) { @@ -38,7 +38,7 @@ class SleepingBackgroundTask { } sleeping_count_--; bg_cv_.notify_all(); - thread_local_status.SetColumnFamilyInfoKey(0); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(0); } void WakeUp() { std::unique_lock l(mutex_); @@ -101,7 +101,7 @@ TEST(ThreadListTest, SimpleColumnFamilyInfoTest) { std::vector thread_list; // Verify the number of sleeping threads in each pool. - GetThreadList(&thread_list); + env->GetThreadList(&thread_list); int sleeping_count[ThreadStatus::ThreadType::TOTAL] = {0}; for (auto thread_status : thread_list) { if (thread_status.cf_name == "pikachu" && @@ -122,7 +122,7 @@ TEST(ThreadListTest, SimpleColumnFamilyInfoTest) { sleeping_task.WaitUntilDone(); // Verify none of the threads are sleeping - GetThreadList(&thread_list); + env->GetThreadList(&thread_list); for (int i = 0; i < ThreadStatus::ThreadType::TOTAL; ++i) { sleeping_count[i] = 0; } diff --git a/util/thread_status_impl.cc b/util/thread_status_updater.cc similarity index 74% rename from util/thread_status_impl.cc rename to util/thread_status_updater.cc index 35dc181e2..0a4336251 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_updater.cc @@ -5,26 +5,15 @@ #include "port/likely.h" #include "util/mutexlock.h" -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" namespace rocksdb { #if ROCKSDB_USING_THREAD_STATUS -__thread ThreadStatusData* ThreadStatusImpl::thread_status_data_ = nullptr; -std::mutex ThreadStatusImpl::thread_list_mutex_; -std::unordered_set ThreadStatusImpl::thread_data_set_; -std::unordered_map> - ThreadStatusImpl::cf_info_map_; -std::unordered_map> - ThreadStatusImpl::db_key_map_; -ThreadStatusImpl thread_local_status; +__thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr; -ThreadStatusImpl::~ThreadStatusImpl() { - assert(thread_data_set_.size() == 0); -} - -void ThreadStatusImpl::UnregisterThread() { +void ThreadStatusUpdater::UnregisterThread() { if (thread_status_data_ != nullptr) { std::lock_guard lck(thread_list_mutex_); thread_data_set_.erase(thread_status_data_); @@ -33,26 +22,26 @@ void ThreadStatusImpl::UnregisterThread() { } } -void ThreadStatusImpl::SetThreadType( +void ThreadStatusUpdater::SetThreadType( ThreadStatus::ThreadType ttype) { auto* data = InitAndGet(); data->thread_type.store(ttype, 
std::memory_order_relaxed); } -void ThreadStatusImpl::SetColumnFamilyInfoKey( +void ThreadStatusUpdater::SetColumnFamilyInfoKey( const void* cf_key) { auto* data = InitAndGet(); data->cf_key.store(cf_key, std::memory_order_relaxed); } -void ThreadStatusImpl::SetEventInfoPtr( +void ThreadStatusUpdater::SetEventInfoPtr( const ThreadEventInfo* event_info) { auto* data = InitAndGet(); data->event_info.store(event_info, std::memory_order_relaxed); } -Status ThreadStatusImpl::GetThreadList( - std::vector* thread_list) const { +Status ThreadStatusUpdater::GetThreadList( + std::vector* thread_list) { thread_list->clear(); std::vector> valid_list; @@ -90,7 +79,7 @@ Status ThreadStatusImpl::GetThreadList( return Status::OK(); } -ThreadStatusData* ThreadStatusImpl::InitAndGet() { +ThreadStatusData* ThreadStatusUpdater::InitAndGet() { if (UNLIKELY(thread_status_data_ == nullptr)) { thread_status_data_ = new ThreadStatusData(); thread_status_data_->thread_id = reinterpret_cast( @@ -101,7 +90,7 @@ ThreadStatusData* ThreadStatusImpl::InitAndGet() { return thread_status_data_; } -void ThreadStatusImpl::NewColumnFamilyInfo( +void ThreadStatusUpdater::NewColumnFamilyInfo( const void* db_key, const std::string& db_name, const void* cf_key, const std::string& cf_name) { std::lock_guard lck(thread_list_mutex_); @@ -111,7 +100,7 @@ void ThreadStatusImpl::NewColumnFamilyInfo( db_key_map_[db_key].insert(cf_key); } -void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { std::lock_guard lck(thread_list_mutex_); auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); @@ -132,7 +121,7 @@ void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { assert(result); } -void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { +void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { std::lock_guard lck(thread_list_mutex_); auto db_pair = db_key_map_.find(db_key); if (UNLIKELY(db_pair == db_key_map_.end())) { @@ -154,41 +143,37 @@ void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { #else -ThreadStatusImpl::~ThreadStatusImpl() { -} - -void ThreadStatusImpl::UnregisterThread() { +void ThreadStatusUpdater::UnregisterThread() { } -void ThreadStatusImpl::SetThreadType( +void ThreadStatusUpdater::SetThreadType( ThreadStatus::ThreadType ttype) { } -void ThreadStatusImpl::SetColumnFamilyInfoKey( +void ThreadStatusUpdater::SetColumnFamilyInfoKey( const void* cf_key) { } -void ThreadStatusImpl::SetEventInfoPtr( +void ThreadStatusUpdater::SetEventInfoPtr( const ThreadEventInfo* event_info) { } -Status ThreadStatusImpl::GetThreadList( - std::vector* thread_list) const { +Status ThreadStatusUpdater::GetThreadList( + std::vector* thread_list) { return Status::NotSupported( "GetThreadList is not supported in the current running environment."); } -void ThreadStatusImpl::NewColumnFamilyInfo( +void ThreadStatusUpdater::NewColumnFamilyInfo( const void* db_key, const std::string& db_name, const void* cf_key, const std::string& cf_name) { } -void ThreadStatusImpl::EraseColumnFamilyInfo(const void* cf_key) { +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { } -void ThreadStatusImpl::EraseDatabaseInfo(const void* db_key) { +void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { } -ThreadStatusImpl thread_local_status; #endif // ROCKSDB_USING_THREAD_STATUS } // namespace rocksdb diff --git a/util/thread_status_impl.h b/util/thread_status_updater.h similarity index 84% rename 
from util/thread_status_impl.h rename to util/thread_status_updater.h index a6e9a7e5b..e0434cd21 100644 --- a/util/thread_status_impl.h +++ b/util/thread_status_updater.h @@ -3,8 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -// The implementation of ThreadStatus. It is implemented via combination -// of macros and thread-local variables. +// The implementation of ThreadStatus. // // Note that we make get and set access to ThreadStatusData lockless. // As a result, ThreadStatusData as a whole is not atomic. However, @@ -43,10 +42,7 @@ namespace rocksdb { class ColumnFamilyHandle; -// The mutable version of ThreadStatus. It has a static set maintaining -// the set of current registered threades. -// -// Note that it is suggested to call the above macros. +// The structure that keeps constant information about a column family. struct ConstantColumnFamilyInfo { #if ROCKSDB_USING_THREAD_STATUS public: @@ -61,6 +57,7 @@ struct ConstantColumnFamilyInfo { #endif // ROCKSDB_USING_THREAD_STATUS }; +// The structure that describes an event. struct ThreadEventInfo { #if ROCKSDB_USING_THREAD_STATUS public: @@ -84,13 +81,22 @@ struct ThreadStatusData { #endif // ROCKSDB_USING_THREAD_STATUS }; -class ThreadStatusImpl { +// The class that stores and updates the status of the current thread +// using a thread-local ThreadStatusData. +// +// In most of the case, you should use ThreadStatusUtil to update +// the status of the current thread instead of using ThreadSatusUpdater +// directly. +// +// @see ThreadStatusUtil +class ThreadStatusUpdater { public: - ThreadStatusImpl() {} + ThreadStatusUpdater() {} // Releases all ThreadStatusData of all active threads. - ~ThreadStatusImpl(); + virtual ~ThreadStatusUpdater() {} + // Unregister the current thread. void UnregisterThread(); // Set the thread type of the current thread. @@ -104,29 +110,30 @@ class ThreadStatusImpl { // its thread-local pointer of ThreadEventInfo to the correct entry. void SetEventInfoPtr(const ThreadEventInfo* event_info); + // Obtain the status of all active registered threads. Status GetThreadList( - std::vector* thread_list) const; + std::vector* thread_list); // Create an entry in the global ColumnFamilyInfo table for the // specified column family. This function should be called only // when the current thread does not hold db_mutex. - static void NewColumnFamilyInfo( + void NewColumnFamilyInfo( const void* db_key, const std::string& db_name, const void* cf_key, const std::string& cf_name); // Erase all ConstantColumnFamilyInfo that is associated with the // specified db instance. This function should be called only when // the current thread does not hold db_mutex. - static void EraseDatabaseInfo(const void* db_key); + void EraseDatabaseInfo(const void* db_key); // Erase the ConstantColumnFamilyInfo that is associated with the // specified ColumnFamilyData. This function should be called only // when the current thread does not hold db_mutex. - static void EraseColumnFamilyInfo(const void* cf_key); + void EraseColumnFamilyInfo(const void* cf_key); // Verifies whether the input ColumnFamilyHandles matches // the information stored in the current cf_info_map. 
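A minimal sketch (not from the patch) mirroring SleepingBackgroundTask in the thread_list_test.cc hunk above: a background task registers a column family with the Env's updater and tags its own thread while it runs. The db/cf keys are opaque const void* identifiers; plain local tags are used here purely for illustration.

#include "rocksdb/env.h"
#include "util/thread_status_updater.h"

void RunTaggedWork() {
  static int db_tag, cf_tag;
  rocksdb::ThreadStatusUpdater* updater =
      rocksdb::Env::Default()->GetThreadStatusUpdater();
  if (updater == nullptr) {
    return;  // thread tracking not compiled in
  }
  updater->NewColumnFamilyInfo(&db_tag, "db", &cf_tag, "pikachu");
  updater->SetColumnFamilyInfoKey(&cf_tag);
  // ... do the work; GetThreadList() now reports cf_name == "pikachu" ...
  updater->SetColumnFamilyInfoKey(nullptr);
  updater->EraseDatabaseInfo(&db_tag);
}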
- static void TEST_VerifyColumnFamilyInfoMap( + void TEST_VerifyColumnFamilyInfoMap( const std::vector& handles, bool check_exist); @@ -141,27 +148,25 @@ class ThreadStatusImpl { ThreadStatusData* InitAndGet(); // The mutex that protects cf_info_map and db_key_map. - static std::mutex thread_list_mutex_; + std::mutex thread_list_mutex_; // The current status data of all active threads. - static std::unordered_set thread_data_set_; + std::unordered_set thread_data_set_; // A global map that keeps the column family information. It is stored // globally instead of inside DB is to avoid the situation where DB is // closing while GetThreadList function already get the pointer to its // CopnstantColumnFamilyInfo. - static std::unordered_map< + std::unordered_map< const void*, std::unique_ptr> cf_info_map_; // A db_key to cf_key map that allows erasing elements in cf_info_map // associated to the same db_key faster. - static std::unordered_map< + std::unordered_map< const void*, std::unordered_set> db_key_map_; #else static ThreadStatusData* thread_status_data_; #endif // ROCKSDB_USING_THREAD_STATUS }; - -extern ThreadStatusImpl thread_local_status; } // namespace rocksdb diff --git a/util/thread_status_impl_debug.cc b/util/thread_status_updater_debug.cc similarity index 91% rename from util/thread_status_impl_debug.cc rename to util/thread_status_updater_debug.cc index 5489499d3..1f53e5fc1 100644 --- a/util/thread_status_impl_debug.cc +++ b/util/thread_status_updater_debug.cc @@ -5,12 +5,12 @@ #include -#include "util/thread_status_impl.h" +#include "util/thread_status_updater.h" #include "db/column_family.h" #if ROCKSDB_USING_THREAD_STATUS namespace rocksdb { -void ThreadStatusImpl::TEST_VerifyColumnFamilyInfoMap( +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( const std::vector& handles, bool check_exist) { std::unique_lock lock(thread_list_mutex_); diff --git a/util/thread_status_util.cc b/util/thread_status_util.cc new file mode 100644 index 000000000..c8767d9a8 --- /dev/null +++ b/util/thread_status_util.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
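A minimal sketch (not from the patch) of the static-helper layer added below: ThreadStatusUtil is what RocksDB's own code calls instead of touching the updater directly, as seen in this patch's BGThreadWrapper and DBImpl hunks. The 'env' and 'cfd' parameters are placeholders for the owning Env and an internal ColumnFamilyData.

#include "rocksdb/env.h"
#include "util/thread_status_util.h"

void TagBackgroundThread(const rocksdb::Env* env,
                         const rocksdb::ColumnFamilyData* cfd) {
  rocksdb::ThreadStatusUtil::SetThreadType(
      env, rocksdb::ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY);
  rocksdb::ThreadStatusUtil::SetColumnFamily(cfd);  // while working on cfd
  // ... run the background job ...
  rocksdb::ThreadStatusUtil::UnregisterThread();    // on thread exit
}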
+ +#include "rocksdb/env.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS +__thread ThreadStatusUpdater* + ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +__thread bool ThreadStatusUtil::thread_updater_initialized_ = false; + +void ThreadStatusUtil::SetThreadType( + const Env* env, ThreadStatus::ThreadType thread_type) { + if (!MaybeInitThreadLocalUpdater(env)) { + return; + } + assert(thread_updater_local_cache_); + thread_updater_local_cache_->SetThreadType(thread_type); +} + +void ThreadStatusUtil::UnregisterThread() { + thread_updater_initialized_ = false; + if (thread_updater_local_cache_ != nullptr) { + thread_updater_local_cache_->UnregisterThread(); + thread_updater_local_cache_ = nullptr; + } +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { + return; + } + assert(thread_updater_local_cache_); + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); +} + +void ThreadStatusUtil::NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd) { + if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { + return; + } + assert(thread_updater_local_cache_); + if (thread_updater_local_cache_) { + thread_updater_local_cache_->NewColumnFamilyInfo( + db, db->GetName(), cfd, cfd->GetName()); + } +} + +void ThreadStatusUtil::EraseColumnFamilyInfo( + const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->EraseColumnFamilyInfo(cfd); +} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->EraseDatabaseInfo(db); +} + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { + if (!thread_updater_initialized_ && env != nullptr) { + thread_updater_initialized_ = true; + thread_updater_local_cache_ = env->GetThreadStatusUpdater(); + } + return (thread_updater_local_cache_ != nullptr); +} + +#else + +ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +bool ThreadStatusUtil::thread_updater_initialized_ = false; + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { + return false; +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::EraseColumnFamilyInfo( + const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { +} + +#endif // ROCKSDB_USING_THREAD_STATUS + +} // namespace rocksdb diff --git a/util/thread_status_util.h b/util/thread_status_util.h new file mode 100644 index 000000000..c583d5a5d --- /dev/null +++ b/util/thread_status_util.h @@ -0,0 +1,93 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "db/column_family.h" +#include "rocksdb/env.h" +#include "rocksdb/thread_status.h" +#include "util/thread_status_updater.h" + +namespace rocksdb { + +// The static utility class for updating thread-local status. +// +// The thread-local status is updated via the thread-local cached +// pointer thread_updater_local_cache_. 
During each function call, +// when ThreadStatusUtil finds thread_updater_local_cache_ is +// left uninitialized (determined by thread_updater_initialized_), +// it will tries to initialize it using the return value of +// Env::GetThreadStatusUpdater(). When thread_updater_local_cache_ +// is initialized by a non-null pointer, each function call will +// then update the status of the current thread. Otherwise, +// all function calls to ThreadStatusUtil will be no-op. +class ThreadStatusUtil { + public: + // Set the thread type of the current thread. + static void SetThreadType( + const Env* env, ThreadStatus::ThreadType thread_type); + + // Unregister the current thread. + static void UnregisterThread(); + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. + static void NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + static void EraseColumnFamilyInfo(const ColumnFamilyData* cfd); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + static void EraseDatabaseInfo(const DB* db); + + // Update the thread status to indicate the current thread is doing + // something related to the specified column family. + static void SetColumnFamily(const ColumnFamilyData* cfd); + + protected: + // Initialize the thread-local ThreadStatusUpdater when it finds + // the cached value is nullptr. Returns true if it has cached + // a non-null pointer. + static bool MaybeInitThreadLocalUpdater(const Env* env); + +#if ROCKSDB_USING_THREAD_STATUS + // A boolean flag indicating whether thread_updater_local_cache_ + // is initialized. It is set to true when an Env uses any + // ThreadStatusUtil functions using the current thread other + // than UnregisterThread(). It will be set to false when + // UnregisterThread() is called. + // + // When this variable is set to true, thread_updater_local_cache_ + // will not be updated until this variable is again set to false + // in UnregisterThread(). + static __thread bool thread_updater_initialized_; + + // The thread-local cached ThreadStatusUpdater that caches the + // thread_status_updater_ of the first Env that uses any ThreadStatusUtil + // function other than UnregisterThread(). This variable will + // be cleared when UnregisterThread() is called. + // + // When this variable is set to a non-null pointer, then the status + // of the current thread will be updated when a function of + // ThreadStatusUtil is called. Otherwise, all functions of + // ThreadStatusUtil will be no-op. + // + // When thread_updater_initialized_ is set to true, this variable + // will not be updated until this thread_updater_initialized_ is + // again set to false in UnregisterThread(). 
+ static __thread ThreadStatusUpdater* thread_updater_local_cache_; +#else + static bool thread_updater_initialized_; + static ThreadStatusUpdater* thread_updater_local_cache_; +#endif +}; + +} // namespace rocksdb diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index fd35698b4..3bd27e46a 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -8,7 +8,6 @@ #include "db/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" -#include "util/thread_status_impl.h" namespace rocksdb { From d232cb156bf541db5105cc15319316e23bdef5d9 Mon Sep 17 00:00:00 2001 From: Chris BeHanna Date: Mon, 22 Dec 2014 16:56:27 -0600 Subject: [PATCH 643/829] Fix the build with -DNDEBUG. Dike out the body of VerifyCompactionResult. With assert() compiled out, the loop index variable in the inner loop was unused, breaking the build when -Werror is enabled. --- db/db_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index cb2458954..a371cfd9e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8896,12 +8896,14 @@ namespace { void VerifyCompactionResult( const ColumnFamilyMetaData& cf_meta, const std::set& overlapping_file_numbers) { +#ifndef NDEBUG for (auto& level : cf_meta.levels) { for (auto& file : level.files) { assert(overlapping_file_numbers.find(file.name) == overlapping_file_numbers.end()); } } +#endif } const SstFileMetaData* PickFileRandomly( From 5045c439441ccdce394fdb4b0e3e886bf1074575 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 22 Dec 2014 13:18:57 -0800 Subject: [PATCH 644/829] add support for nested BlockBasedTableOptions in config string Summary: Add support to allow nested config for block-based table factory. The format looks like this: "write_buffer_size=1024;block_based_table_factory={block_size=4k};max_write_buffer_num=2" Test Plan: unit test Reviewers: yhchiang, rven, igor, ljin, jonahcohen Reviewed By: jonahcohen Subscribers: jonahcohen, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29223 --- HISTORY.md | 2 + Makefile | 4 +- include/rocksdb/utilities/convenience.h | 24 +- java/rocksjni/options.cc | 8 +- util/options_helper.cc | 260 ++++++++++++++--- util/options_test.cc | 356 +++++++++++++++++++++--- util/testharness.h | 9 + 7 files changed, 563 insertions(+), 100 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e24cad8e8..49fc56df8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,8 @@ * Add rocksdb::GetThreadList(), which in the future will return the current status of all rocksdb-related threads. We will have more code instruments in the following RocksDB releases. +* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean. + Also add support for nested options in convert function ### Public API changes * New API to create a checkpoint added. 
Given a directory name, creates a new diff --git a/Makefile b/Makefile index 1ca41f8fe..a500e5b60 100644 --- a/Makefile +++ b/Makefile @@ -175,7 +175,7 @@ TOOLS = \ db_stress \ ldb \ db_repl_stress \ - options_test \ + options_test \ blob_store_bench PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test $(TOOLS) @@ -536,7 +536,7 @@ thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) +options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(MEMENVLIBRARY) : $(MEMENVOBJECTS) diff --git a/include/rocksdb/utilities/convenience.h b/include/rocksdb/utilities/convenience.h index 77913c254..bf3942aae 100644 --- a/include/rocksdb/utilities/convenience.h +++ b/include/rocksdb/utilities/convenience.h @@ -8,35 +8,51 @@ #include #include #include "rocksdb/options.h" +#include "rocksdb/table.h" namespace rocksdb { #ifndef ROCKSDB_LITE // Take a map of option name and option value, apply them into the // base_options, and return the new options as a result -bool GetColumnFamilyOptionsFromMap( +Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, ColumnFamilyOptions* new_options); -bool GetDBOptionsFromMap( +Status GetDBOptionsFromMap( const DBOptions& base_options, const std::unordered_map& opts_map, DBOptions* new_options); +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options); + // Take a string representation of option names and values, apply them into the // base_options, and return the new options as a result. The string has the // following format: // "write_buffer_size=1024;max_write_buffer_number=2" -bool GetColumnFamilyOptionsFromString( +// Nested options config is also possible. 
For example, you can define +// BlockBasedTableOptions as part of the string for block-based table factory: +// "write_buffer_size=1024;block_based_table_factory={block_size=4k};" +// "max_write_buffer_num=2" +Status GetColumnFamilyOptionsFromString( const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options); -bool GetDBOptionsFromString( +Status GetDBOptionsFromString( const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options); + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, + const std::string& opts_str, + BlockBasedTableOptions* new_table_options); + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 667d74508..9f0875b32 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1801,11 +1801,11 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( rocksdb::ColumnFamilyOptions* cf_options = new rocksdb::ColumnFamilyOptions(); const char* opt_string = env->GetStringUTFChars(jopt_string, 0); - bool status = rocksdb::GetColumnFamilyOptionsFromString( + rocksdb::Status status = rocksdb::GetColumnFamilyOptionsFromString( rocksdb::ColumnFamilyOptions(), opt_string, cf_options); env->ReleaseStringUTFChars(jopt_string, opt_string); // Check if ColumnFamilyOptions creation was possible. - if (status) { + if (status.ok()) { ret_value = reinterpret_cast(cf_options); } else { // if operation failed the ColumnFamilyOptions need to be deleted @@ -2803,11 +2803,11 @@ jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( rocksdb::DBOptions* db_options = new rocksdb::DBOptions(); const char* opt_string = env->GetStringUTFChars(jopt_string, 0); - bool status = rocksdb::GetDBOptionsFromString( + rocksdb::Status status = rocksdb::GetDBOptionsFromString( rocksdb::DBOptions(), opt_string, db_options); env->ReleaseStringUTFChars(jopt_string, opt_string); // Check if DBOptions creation was possible. 
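A minimal sketch (not from the patch) of the nested-options string form documented above; every option name used here also appears in the options_test.cc cases added below.

#include "rocksdb/options.h"
#include "rocksdb/utilities/convenience.h"

rocksdb::Status BuildCfOptions(rocksdb::ColumnFamilyOptions* cf_opt) {
  return rocksdb::GetColumnFamilyOptionsFromString(
      rocksdb::ColumnFamilyOptions(),
      "write_buffer_size=10;max_write_buffer_number=16;"
      "block_based_table_factory={block_cache=1M;block_size=4;"
      "filter_policy=bloomfilter:10:true};arena_block_size=1024",
      cf_opt);
}

On failure the returned Status now names the offending option (InvalidArgument, or NotSupported for options that cannot yet be parsed) instead of the old bare false.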
- if (status) { + if (status.ok()) { ret_value = reinterpret_cast(db_options); } else { // if operation failed the DBOptions need to be deleted diff --git a/util/options_helper.cc b/util/options_helper.cc index c2bd3cb83..4a169ce3f 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -6,7 +6,10 @@ #include #include #include +#include "rocksdb/cache.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "rocksdb/utilities/convenience.h" #include "util/options_helper.h" @@ -29,19 +32,40 @@ CompressionType ParseCompressionType(const std::string& type) { } else if (type == "kLZ4HCCompression") { return kLZ4HCCompression; } else { - throw "unknown compression type: " + type; + throw std::invalid_argument("Unknown compression type: " + type); } return kNoCompression; } +BlockBasedTableOptions::IndexType ParseBlockBasedTableIndexType( + const std::string& type) { + if (type == "kBinarySearch") { + return BlockBasedTableOptions::kBinarySearch; + } else if (type == "kHashSearch") { + return BlockBasedTableOptions::kHashSearch; + } + throw std::invalid_argument("Unknown index type: " + type); +} + +ChecksumType ParseBlockBasedTableChecksumType( + const std::string& type) { + if (type == "kNoChecksum") { + return kNoChecksum; + } else if (type == "kCRC32c") { + return kCRC32c; + } else if (type == "kxxHash") { + return kxxHash; + } + throw std::invalid_argument("Unknown checksum type: " + type); +} + bool ParseBoolean(const std::string& type, const std::string& value) { if (value == "true" || value == "1") { return true; } else if (value == "false" || value == "0") { return false; - } else { - throw type; } + throw std::invalid_argument(type); } uint64_t ParseUint64(const std::string& value) { @@ -105,7 +129,7 @@ CompactionStyle ParseCompactionStyle(const std::string& type) { } else if (type == "kCompactionStyleFIFO") { return kCompactionStyleFIFO; } else { - throw "unknown compaction style: " + type; + throw std::invalid_argument("unknown compaction style: " + type); } return kCompactionStyleLevel; } @@ -172,7 +196,7 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value, new_options->max_bytes_for_level_multiplier_additional.clear(); size_t start = 0; while (true) { - size_t end = value.find_first_of(':', start); + size_t end = value.find(':', start); if (end == std::string::npos) { new_options->max_bytes_for_level_multiplier_additional.push_back( ParseInt(value.substr(start))); @@ -210,8 +234,8 @@ Status GetMutableOptionsFromStrings( MutableCFOptions* new_options) { assert(new_options); *new_options = base_options; - try { - for (const auto& o : options_map) { + for (const auto& o : options_map) { + try { if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { } else if (ParseMiscOptions(o.first, o.second, new_options)) { @@ -219,9 +243,10 @@ Status GetMutableOptionsFromStrings( return Status::InvalidArgument( "unsupported dynamic option: " + o.first); } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); } - } catch (std::exception& e) { - return Status::InvalidArgument("error parsing " + std::string(e.what())); } return Status::OK(); } @@ -243,38 +268,165 @@ std::string trim(const std::string& str) { return std::string(); } -bool StringToMap(const std::string& opts_str, - std::unordered_map* opts_map) { +} // anonymous namespace + +Status StringToMap(const 
std::string& opts_str, + std::unordered_map* opts_map) { assert(opts_map); // Example: - // opts_str = "write_buffer_size=1024;max_write_buffer_number=2" + // opts_str = "write_buffer_size=1024;max_write_buffer_number=2;" + // "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100" size_t pos = 0; - std::string opts = trim(opts_str); while (pos < opts.size()) { size_t eq_pos = opts.find('=', pos); if (eq_pos == std::string::npos) { - return false; + return Status::InvalidArgument("Mismatched key value pair, '=' expected"); } std::string key = trim(opts.substr(pos, eq_pos - pos)); + if (key.empty()) { + return Status::InvalidArgument("Empty key found"); + } - size_t sc_pos = opts.find(';', eq_pos + 1); - if (sc_pos == std::string::npos) { - (*opts_map)[key] = trim(opts.substr(eq_pos + 1)); - // It either ends with a trailing semi-colon or the last key-value pair + // skip space after '=' and look for '{' for possible nested options + pos = eq_pos + 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + // Empty value at the end + if (pos >= opts.size()) { + (*opts_map)[key] = ""; break; + } + if (opts[pos] == '{') { + int count = 1; + size_t brace_pos = pos + 1; + while (brace_pos < opts.size()) { + if (opts[brace_pos] == '{') { + ++count; + } else if (opts[brace_pos] == '}') { + --count; + if (count == 0) { + break; + } + } + ++brace_pos; + } + // found the matching closing brace + if (count == 0) { + (*opts_map)[key] = trim(opts.substr(pos + 1, brace_pos - pos - 1)); + // skip all whitespace and move to the next ';' + // brace_pos points to the next position after the matching '}' + pos = brace_pos + 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + if (pos < opts.size() && opts[pos] != ';') { + return Status::InvalidArgument( + "Unexpected chars after nested options"); + } + ++pos; + } else { + return Status::InvalidArgument( + "Mismatched curly braces for nested options"); + } } else { - (*opts_map)[key] = trim(opts.substr(eq_pos + 1, sc_pos - eq_pos - 1)); + size_t sc_pos = opts.find(';', pos); + if (sc_pos == std::string::npos) { + (*opts_map)[key] = trim(opts.substr(pos)); + // It either ends with a trailing semi-colon or the last key-value pair + break; + } else { + (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos)); + } + pos = sc_pos + 1; } - pos = sc_pos + 1; } - return true; + return Status::OK(); } -} // anonymous namespace -bool GetColumnFamilyOptionsFromMap( +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options) { + + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + try { + if (o.first == "cache_index_and_filter_blocks") { + new_table_options->cache_index_and_filter_blocks = + ParseBoolean(o.first, o.second); + } else if (o.first == "index_type") { + new_table_options->index_type = ParseBlockBasedTableIndexType(o.second); + } else if (o.first == "hash_index_allow_collision") { + new_table_options->hash_index_allow_collision = + ParseBoolean(o.first, o.second); + } else if (o.first == "checksum") { + new_table_options->checksum = + ParseBlockBasedTableChecksumType(o.second); + } else if (o.first == "no_block_cache") { + new_table_options->no_block_cache = ParseBoolean(o.first, o.second); + } else if (o.first == "block_cache") { + new_table_options->block_cache = NewLRUCache(ParseSizeT(o.second)); + } else if (o.first == "block_cache_compressed") { + 
new_table_options->block_cache_compressed = + NewLRUCache(ParseSizeT(o.second)); + } else if (o.first == "block_size") { + new_table_options->block_size = ParseSizeT(o.second); + } else if (o.first == "block_size_deviation") { + new_table_options->block_size_deviation = ParseInt(o.second); + } else if (o.first == "block_restart_interval") { + new_table_options->block_restart_interval = ParseInt(o.second); + } else if (o.first == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (o.second.compare(0, kName.size(), kName) != 0) { + return Status::InvalidArgument("Invalid filter policy name"); + } + size_t pos = o.second.find(':', kName.size()); + if (pos == std::string::npos) { + return Status::InvalidArgument("Invalid filter policy config, " + "missing bits_per_key"); + } + int bits_per_key = ParseInt( + trim(o.second.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", + trim(o.second.substr(pos + 1))); + new_table_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + } else if (o.first == "whole_key_filtering") { + new_table_options->whole_key_filtering = + ParseBoolean(o.first, o.second); + } else { + return Status::InvalidArgument("Unrecognized option: " + o.first); + } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); + } + } + return Status::OK(); +} + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, + const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, ColumnFamilyOptions* new_options) { @@ -285,6 +437,15 @@ bool GetColumnFamilyOptionsFromMap( if (ParseMemtableOptions(o.first, o.second, new_options)) { } else if (ParseCompactionOptions(o.first, o.second, new_options)) { } else if (ParseMiscOptions(o.first, o.second, new_options)) { + } else if (o.first == "block_based_table_factory") { + // Nested options + BlockBasedTableOptions table_opt; + Status table_opt_s = GetBlockBasedTableOptionsFromString( + BlockBasedTableOptions(), o.second, &table_opt); + if (!table_opt_s.ok()) { + return table_opt_s; + } + new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt)); } else if (o.first == "min_write_buffer_number_to_merge") { new_options->min_write_buffer_number_to_merge = ParseInt(o.second); } else if (o.first == "compression") { @@ -293,7 +454,7 @@ bool GetColumnFamilyOptionsFromMap( new_options->compression_per_level.clear(); size_t start = 0; while (true) { - size_t end = o.second.find_first_of(':', start); + size_t end = o.second.find(':', start); if (end == std::string::npos) { new_options->compression_per_level.push_back( ParseCompressionType(o.second.substr(start))); @@ -306,22 +467,25 @@ bool GetColumnFamilyOptionsFromMap( } } else if (o.first == "compression_opts") { size_t start = 0; - size_t end = o.second.find_first_of(':'); + size_t end = o.second.find(':'); if (end == std::string::npos) { - throw o.first; + return Status::InvalidArgument("invalid config value for: " + + o.first); } new_options->compression_opts.window_bits = 
ParseInt(o.second.substr(start, end - start)); start = end + 1; - end = o.second.find_first_of(':', start); + end = o.second.find(':', start); if (end == std::string::npos) { - throw o.first; + return Status::InvalidArgument("invalid config value for: " + + o.first); } new_options->compression_opts.level = ParseInt(o.second.substr(start, end - start)); start = end + 1; if (start >= o.second.size()) { - throw o.first; + return Status::InvalidArgument("invalid config value for: " + + o.first); } new_options->compression_opts.strategy = ParseInt(o.second.substr(start, o.second.size() - start)); @@ -334,7 +498,7 @@ bool GetColumnFamilyOptionsFromMap( new_options->compaction_style = ParseCompactionStyle(o.second); } else if (o.first == "compaction_options_universal") { // TODO(ljin): add support - throw o.first; + return Status::NotSupported("Not supported: " + o.first); } else if (o.first == "compaction_options_fifo") { new_options->compaction_options_fifo.max_table_files_size = ParseUint64(o.second); @@ -345,27 +509,29 @@ bool GetColumnFamilyOptionsFromMap( } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else { - return false; + return Status::InvalidArgument("Unrecognized option: " + o.first); } - } catch (std::exception) { - return false; + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); } } - return true; + return Status::OK(); } -bool GetColumnFamilyOptionsFromString( +Status GetColumnFamilyOptionsFromString( const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options) { std::unordered_map opts_map; - if (!StringToMap(opts_str, &opts_map)) { - return false; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; } return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options); } -bool GetDBOptionsFromMap( +Status GetDBOptionsFromMap( const DBOptions& base_options, const std::unordered_map& opts_map, DBOptions* new_options) { @@ -392,7 +558,7 @@ bool GetDBOptionsFromMap( new_options->use_fsync = ParseBoolean(o.first, o.second); } else if (o.first == "db_paths") { // TODO(ljin): add support - throw o.first; + return Status::NotSupported("Not supported: " + o.first); } else if (o.first == "db_log_dir") { new_options->db_log_dir = o.second; } else if (o.first == "wal_dir") { @@ -444,22 +610,24 @@ bool GetDBOptionsFromMap( } else if (o.first == "bytes_per_sync") { new_options->bytes_per_sync = ParseUint64(o.second); } else { - return false; + return Status::InvalidArgument("Unrecognized option: " + o.first); } - } catch (std::exception) { - return false; + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); } } - return true; + return Status::OK(); } -bool GetDBOptionsFromString( +Status GetDBOptionsFromString( const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options) { std::unordered_map opts_map; - if (!StringToMap(opts_str, &opts_map)) { - return false; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; } return GetDBOptionsFromMap(base_options, opts_map, new_options); } diff --git a/util/options_test.cc b/util/options_test.cc index 4d6746ec2..a9e609f4f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -14,13 +14,13 @@ #include #include +#include "rocksdb/cache.h" #include "rocksdb/options.h" #include "rocksdb/table.h" +#include 
"rocksdb/utilities/convenience.h" +#include "rocksdb/utilities/leveldb_options.h" #include "table/block_based_table_factory.h" #include "util/testharness.h" -#include "rocksdb/cache.h" -#include "rocksdb/utilities/leveldb_options.h" -#include "rocksdb/utilities/convenience.h" #ifndef GFLAGS bool FLAGS_enable_print = false; @@ -168,8 +168,8 @@ TEST(OptionsTest, GetOptionsFromMapTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; - ASSERT_TRUE(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); @@ -222,18 +222,18 @@ TEST(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.min_partial_merge_operands, 31U); cf_options_map["write_buffer_size"] = "hello"; - ASSERT_TRUE(!GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); cf_options_map["write_buffer_size"] = "1"; - ASSERT_TRUE(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); cf_options_map["unknown_option"] = "1"; - ASSERT_TRUE(!GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); DBOptions base_db_opt; DBOptions new_db_opt; - ASSERT_TRUE(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); @@ -271,63 +271,331 @@ TEST(OptionsTest, GetOptionsFromMapTest) { TEST(OptionsTest, GetOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=5", &new_cf_opt)); + base_cf_opt.table_factory.reset(); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=5", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=6;", &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory == nullptr); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=6;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 7 ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 8 ; ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + 
"write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=11; max_write_buffer_number = 12 ;", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); // Wrong name "max_write_buffer_number_" - ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number_=14;", + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", &new_cf_opt)); // Wrong key/value pair - ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); // Error Paring value - ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); // Missing option name - ASSERT_TRUE(!GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13; =100;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13; =100;", &new_cf_opt)); // Units (k) - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL*1024UL); ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024); // Units (m) - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "max_write_buffer_number=16m;inplace_update_num_locks=17M", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL); // Units (g) - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL); ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL); // Units (t) - ASSERT_TRUE(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024UL*1024UL*1024UL*1024UL); ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL); + + // Nested block based table options + // Emtpy + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Non-empty + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Last one + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Mismatch curly braces + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + // Unexpected chars after closing curly brace + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + // Invalid block based table option + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); +} + +TEST(OptionsTest, GetBlockBasedTableOptionsFromString) { + BlockBasedTableOptions table_opt; + BlockBasedTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "filter_policy=bloomfilter:4:true;whole_key_filtering=1", + &new_opt)); + ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); + ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); + ASSERT_TRUE(new_opt.hash_index_allow_collision); + ASSERT_TRUE(new_opt.no_block_cache); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); + ASSERT_EQ(new_opt.block_size, 1024UL); + ASSERT_EQ(new_opt.block_size_deviation, 8); + ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + + // unknown option + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt)); + + // unrecognized index type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", + &new_opt)); + + // unrecognized checksum type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", + &new_opt)); + + // unrecognized filter policy name + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + 
"filter_policy=bloomfilterxx:4:true", + &new_opt)); + // unrecognized filter policy config + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt)); +} + +Status StringToMap( + const std::string& opts_str, + std::unordered_map* opts_map); + +TEST(OptionsTest, StringToMapTest) { + std::unordered_map opts_map; + // Regular options + ASSERT_OK(StringToMap("k1=v1;k2=v2;k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "v2"); + ASSERT_EQ(opts_map["k3"], "v3"); + // Value with '=' + opts_map.clear(); + ASSERT_OK(StringToMap("k1==v1;k2=v2=;", &opts_map)); + ASSERT_EQ(opts_map["k1"], "=v1"); + ASSERT_EQ(opts_map["k2"], "v2="); + // Overwrriten option + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k1=v2;k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v2"); + ASSERT_EQ(opts_map["k3"], "v3"); + // Empty value + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_EQ(opts_map["k3"], "v3"); + ASSERT_TRUE(opts_map.find("k4") != opts_map.end()); + ASSERT_EQ(opts_map["k4"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4= ", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_EQ(opts_map["k3"], "v3"); + ASSERT_TRUE(opts_map.find("k4") != opts_map.end()); + ASSERT_EQ(opts_map["k4"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_TRUE(opts_map.find("k3") != opts_map.end()); + ASSERT_EQ(opts_map["k3"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=;", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_TRUE(opts_map.find("k3") != opts_map.end()); + ASSERT_EQ(opts_map["k3"], ""); + // Regular nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2=nv2};k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2=nv2"); + ASSERT_EQ(opts_map["k3"], "v3"); + // Multi-level nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2={nnk1=nnk2}};" + "k3={nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}};k4=v4", + &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2={nnk1=nnk2}"); + ASSERT_EQ(opts_map["k3"], "nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}"); + ASSERT_EQ(opts_map["k4"], "v4"); + // Garbage inside curly braces + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={dfad=};k3={=};k4=v4", + &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "dfad="); + ASSERT_EQ(opts_map["k3"], "="); + ASSERT_EQ(opts_map["k4"], "v4"); + // Empty nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={};", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={{{{}}}{}{}};", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "{{{}}}{}{}"); + // With random spaces + opts_map.clear(); + ASSERT_OK(StringToMap(" k1 = v1 ; k2= {nk1=nv1; nk2={nnk1=nnk2}} ; " + "k3={ { } }; k4= v4 ", + &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + 
ASSERT_EQ(opts_map["k2"], "nk1=nv1; nk2={nnk1=nnk2}"); + ASSERT_EQ(opts_map["k3"], "{ }"); + ASSERT_EQ(opts_map["k4"], "v4"); + + // Empty key + ASSERT_NOK(StringToMap("k1=v1;k2=v2;=", &opts_map)); + ASSERT_NOK(StringToMap("=v1;k2=v2", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2v2;", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2=v2;fadfa", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2=v2;;", &opts_map)); + // Mismatch curly braces + ASSERT_NOK(StringToMap("k1=v1;k2={;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={}};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}{}}};k3=v3", &opts_map)); + // However this is valid! + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=};k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "}"); + ASSERT_EQ(opts_map["k3"], "v3"); + + // Invalid chars after closing curly brace + ASSERT_NOK(StringToMap("k1=v1;k2={{}}{};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}}cfda;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}} cfda;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}} cfda", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map)); +} + +TEST(OptionsTest, StringToMapRandomTest) { + std::unordered_map opts_map; + // Make sure segfault is not hit by semi-random strings + + std::vector bases = { + "a={aa={};tt={xxx={}}};c=defff", + "a={aa={};tt={xxx={}}};c=defff;d={{}yxx{}3{xx}}", + "abc={{}{}{}{{{}}}{{}{}{}{}{}{}{}"}; + + for (std::string base : bases) { + for (int rand_seed = 301; rand_seed < 401; rand_seed++) { + Random rnd(rand_seed); + for (int attempt = 0; attempt < 10; attempt++) { + std::string str = base; + // Replace random position to space + size_t pos = static_cast( + rnd.Uniform(static_cast(base.size()))); + str[pos] = ' '; + Status s = StringToMap(str, &opts_map); + ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + opts_map.clear(); + } + } + } + + // Random Construct a string + std::vector chars = {'{', '}', ' ', '=', ';', 'c'}; + for (int rand_seed = 301; rand_seed < 1301; rand_seed++) { + Random rnd(rand_seed); + int len = rnd.Uniform(30); + std::string str = ""; + for (int attempt = 0; attempt < len; attempt++) { + // Add a random character + size_t pos = static_cast( + rnd.Uniform(static_cast(chars.size()))); + str.append(1, chars[pos]); + } + Status s = StringToMap(str, &opts_map); + ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + s = StringToMap("name=" + str, &opts_map); + ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + opts_map.clear(); + } } TEST(OptionsTest, ConvertOptionsTest) { diff --git a/util/testharness.h b/util/testharness.h index ae2570889..e57b98a6f 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -84,6 +84,14 @@ class Tester { return *this; } + Tester& IsNotOk(const Status& s) { + if (s.ok()) { + ss_ << " Error status expected"; + ok_ = false; + } + return *this; + } + #define BINARY_OP(name,op) \ template \ Tester& name(const X& x, const Y& y) { \ @@ -114,6 +122,7 @@ class Tester { #define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c) #define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_NOK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNotOk((s)) #define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) #define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) #define 
ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) From 98490bccf605478f98cb8101b6faf1f4d857b87f Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 23 Dec 2014 14:22:56 +0000 Subject: [PATCH 645/829] Fix the build on Mac OS X --- java/rocksjni/slice.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index c92ca5ec6..e4b7cf03b 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -28,7 +28,7 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( JNIEnv* env, jobject jobj, jstring jstr) { const auto* str = env->GetStringUTFChars(jstr, 0); - const int len = strlen(str); + const size_t len = strlen(str); char* buf = new char[len]; memcpy(buf, str, len); env->ReleaseStringUTFChars(jstr, str); From b623009619c2ab08e8a212c8a79a3bea4d82dbe9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 23 Dec 2014 17:14:44 +0100 Subject: [PATCH 646/829] Fix compile of compact_file_example --- examples/compact_files_example.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index bf6cec262..3e7638b7e 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -156,14 +156,14 @@ int main() { // if background compaction is not working, write will stall // because of options.level0_stop_writes_trigger for (int i = 1000; i < 99999; ++i) { - db->Put(WriteOptions(), ToString(i), + db->Put(WriteOptions(), std::to_string(i), std::string(500, 'a' + (i % 26))); } // verify the values are still there std::string value; for (int i = 1000; i < 99999; ++i) { - db->Get(ReadOptions(), ToString(i), + db->Get(ReadOptions(), std::to_string(i), &value); assert(value == std::string(500, 'a' + (i % 26))); } From ae508df90e070e676b2f7533cd77a0ec85483480 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 23 Dec 2014 17:32:30 +0100 Subject: [PATCH 647/829] Clean up compile for c_simple_example --- examples/Makefile | 11 +++++++---- examples/{simple_example.c => c_simple_example.c} | 7 +++---- 2 files changed, 10 insertions(+), 8 deletions(-) rename examples/{simple_example.c => c_simple_example.c} (88%) diff --git a/examples/Makefile b/examples/Makefile index 96c8bc3cf..efc5fe30e 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ include ../build_config.mk .PHONY: clean -all: simple_example column_families_example compact_files_example simple_example-c +all: simple_example column_families_example compact_files_example c_simple_example simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -13,8 +13,11 @@ column_families_example: column_families_example.cc compact_files_example: compact_files_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -simple_example-c: simple_example.c - $(CXX) -xc -I../include simple_example.c -o$@ -L.. 
-lrocksdb -pthread -lsnappy -lbz2 -lz -lrt +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ -I../include + +c_simple_example: c_simple_example.o + $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example ./simple_example-c + rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o diff --git a/examples/simple_example.c b/examples/c_simple_example.c similarity index 88% rename from examples/simple_example.c rename to examples/c_simple_example.c index e982cce89..8c6f89e39 100644 --- a/examples/simple_example.c +++ b/examples/c_simple_example.c @@ -14,8 +14,8 @@ int main(int argc, char **argv) { rocksdb_options_t *options = rocksdb_options_create(); // Optimize RocksDB. This is the easiest way to // get RocksDB to perform well - int cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores - rocksdb_options_increase_parallelism(options, cpus); + long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + rocksdb_options_increase_parallelism(options, (int)(cpus)); rocksdb_options_optimize_level_style_compaction(options, 0); // create the DB if it's not already present rocksdb_options_set_create_if_missing(options, 1); @@ -28,7 +28,7 @@ int main(int argc, char **argv) { // Put key-value rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); const char key[] = "key"; - char *value = "value"; + const char *value = "value"; rocksdb_put(db, writeoptions, key, strlen (key), value, \ strlen (value), &err); assert(!err); @@ -38,7 +38,6 @@ int main(int argc, char **argv) { value = rocksdb_get(db, readoptions, key, strlen (key), &len, &err); assert(!err); assert(strcmp(value, "value") == 0); - free(value); // cleanup rocksdb_writeoptions_destroy(writeoptions); From 7ea7bdf04d62523978005b9b7217df4122e0317a Mon Sep 17 00:00:00 2001 From: Manish Patil Date: Tue, 23 Dec 2014 13:24:07 -0800 Subject: [PATCH 648/829] Dump routine to BlockBasedTableReader Summary: Added necessary routines for dumping block based SST with block filter Test Plan: Added "raw" mode to utility sst_dump Reviewers: sdong, rven Reviewed By: rven Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D29679 --- Makefile | 6 +- include/rocksdb/sst_dump_tool.h | 2 +- table/block_based_filter_block.cc | 54 ++++++++ table/block_based_filter_block.h | 3 + table/block_based_table_reader.cc | 213 ++++++++++++++++++++++++++++++ table/block_based_table_reader.h | 7 + table/filter_block.h | 6 + table/format.cc | 40 ++++++ table/format.h | 6 + table/table_reader.h | 5 + util/sst_dump_test.cc | 152 +++++++++++++++++++++ util/sst_dump_tool.cc | 35 ++++- util/sst_dump_tool_imp.h | 3 + 13 files changed, 528 insertions(+), 4 deletions(-) create mode 100644 util/sst_dump_test.cc diff --git a/Makefile b/Makefile index a500e5b60..d84eb4fa0 100644 --- a/Makefile +++ b/Makefile @@ -165,7 +165,8 @@ TESTS = \ wal_manager_test \ listener_test \ compaction_job_test \ - thread_list_test + thread_list_test \ + sst_dump_test SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) @@ -539,6 +540,9 @@ compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNES options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) 
util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + $(MEMENVLIBRARY) : $(MEMENVOBJECTS) rm -f $@ $(AR) -rs $@ $(MEMENVOBJECTS) diff --git a/include/rocksdb/sst_dump_tool.h b/include/rocksdb/sst_dump_tool.h index e3ee2a9c8..39bfb519b 100644 --- a/include/rocksdb/sst_dump_tool.h +++ b/include/rocksdb/sst_dump_tool.h @@ -9,7 +9,7 @@ namespace rocksdb { class SSTDumpTool { public: - void Run(int argc, char** argv); + int Run(int argc, char** argv); }; } // namespace rocksdb diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index 647fc020c..7037d85bc 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include #include "table/block_based_filter_block.h" #include "db/dbformat.h" @@ -29,6 +30,38 @@ bool SamePrefix(const SliceTransform* prefix_extractor, prefix_extractor->Transform(key2)); } } + +void AppendItem(std::string* props, const std::string& key, + const std::string& value) { + char cspace = ' '; + std::string value_str(""); + size_t i = 0; + const size_t dataLength = 64; + const size_t tabLength = 2; + const size_t offLength = 16; + + value_str.append(&value[i], std::min(size_t(dataLength), value.size())); + i += dataLength; + while (i < value.size()) { + value_str.append("\n"); + value_str.append(offLength, cspace); + value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i)); + i += dataLength; + } + + std::string result(""); + if (key.size() < (offLength - tabLength)) + result.append(size_t((offLength - tabLength)) - key.size(), cspace); + result.append(key); + + props->append(result + ": " + value_str + "\n"); +} + +template +void AppendItem(std::string* props, const TKey& key, const std::string& value) { + std::string key_str = std::to_string(key); + AppendItem(props, key_str, value); +} } // namespace @@ -196,4 +229,25 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { return num_ * 4 + 5 + (offset_ - data_); } + +std::string BlockBasedFilterBlockReader::ToString() const { + std::string result, filter_meta; + result.reserve(1024); + + std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); + AppendItem(&result, s_fb, std::to_string(num_)); + AppendItem(&result, s_bo, s_hd); + + for (size_t index = 0; index < num_; index++) { + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + + if (start != limit) { + result.append(" filter block # " + std::to_string(index + 1) + "\n"); + Slice filter = Slice(data_ + start, limit - start); + AppendItem(&result, start, filter.ToString(true)); + } + } + return result; } +} // namespace rocksdb diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h index 9621425e3..cf8c1b47c 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based_filter_block.h @@ -82,6 +82,9 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { uint64_t block_offset = kNotValid) override; virtual size_t ApproximateMemoryUsage() const override; + // convert this object to a human readable form + std::string ToString() const override; + private: const FilterPolicy* policy_; const SliceTransform* prefix_extractor_; diff --git a/table/block_based_table_reader.cc 
b/table/block_based_table_reader.cc index d60ba3d21..1e4da1e1f 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -1312,4 +1312,217 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const { return rep_->index_reader != nullptr; } +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr meta; + std::unique_ptr meta_iter; + Status s = ReadMetaBlock(rep_, &meta, &meta_iter); + if (s.ok()) { + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + s = meta_iter->status(); + if (!s.ok()) { + return s; + } + if (meta_iter->key() == rocksdb::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(meta_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const rocksdb::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as off now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if (table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + if (ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(), + handle, &block, rep_->ioptions.env, false).ok()) { + rep_->filter.reset( + new BlockBasedFilterBlockReader(rep_->ioptions.prefix_extractor, + table_options, std::move(block))); + } + } + } + } + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + "--------------------------------------\n"); + + std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = 
blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + out_file->Append("Data Block # "); + out_file->Append(std::to_string(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr datablock_iter; + datablock_iter.reset( + NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value())); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + break; + } + Slice key = datablock_iter->key(); + Slice value = datablock_iter->value(); + InternalKey ikey, iValue; + ikey.DecodeFrom(key); + iValue.DecodeFrom(value); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(iValue.user_key().ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = iValue.user_key().ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + res_value.append(&str_value[i], 1); + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 0b89edd3f..ae849ad6c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -100,6 +100,9 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; + // convert SST file to a human readable form + Status DumpTable(WritableFile* out_file) override; + ~BlockBasedTable(); bool TEST_filter_block_preloaded() const; @@ -204,6 +207,10 @@ class BlockBasedTable : public TableReader { // For Posix files the unique ID is 
three varints. static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; + // Helper functions for DumpTable() + Status DumpIndexBlock(WritableFile* out_file); + Status DumpDataBlocks(WritableFile* out_file); + // No copying allowed explicit BlockBasedTable(const TableReader&) = delete; void operator=(const TableReader&) = delete; diff --git a/table/filter_block.h b/table/filter_block.h index 197676827..855a23169 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -75,6 +75,12 @@ class FilterBlockReader { uint64_t block_offset = kNotValid) = 0; virtual size_t ApproximateMemoryUsage() const = 0; + // convert this object to a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } + private: // No copying allowed FilterBlockReader(const FilterBlockReader&); diff --git a/table/format.cc b/table/format.cc index 90d7ac8dc..227090bb2 100644 --- a/table/format.cc +++ b/table/format.cc @@ -51,6 +51,25 @@ Status BlockHandle::DecodeFrom(Slice* input) { return Status::Corruption("bad block handle"); } } + +// Return a string that contains the copy of handle. +std::string BlockHandle::ToString(bool hex) const { + std::string handle_str; + EncodeTo(&handle_str); + if (hex) { + std::string result; + char buf[10]; + for (size_t i = 0; i < handle_str.size(); i++) { + snprintf(buf, sizeof(buf), "%02X", + static_cast(handle_str[i])); + result += buf; + } + return result; + } else { + return handle_str; + } +} + const BlockHandle BlockHandle::kNullBlockHandle(0, 0); // legacy footer format: @@ -179,6 +198,27 @@ Status Footer::DecodeFrom(Slice* input) { return result; } +std::string Footer::ToString() const { + std::string result, handle_; + result.reserve(1024); + + bool legacy = IsLegacyFooterFormat(table_magic_number_); + if (legacy) { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } else { + result.append("checksum: " + std::to_string(checksum_) + "\n "); + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("footer version: " + std::to_string(version_) + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } + return result; +} + Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, Footer* footer) { diff --git a/table/format.h b/table/format.h index 1df32bcf1..e8586c986 100644 --- a/table/format.h +++ b/table/format.h @@ -42,6 +42,9 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + // Return a string that contains the copy of handle. + std::string ToString(bool hex = true) const; + // if the block handle's offset and size are both "0", we will view it // as a null block handle that points to no where. bool IsNull() const { @@ -129,6 +132,9 @@ class Footer { static const uint64_t kInvalidTableMagicNumber = 0; + // convert this object to a human readable form + std::string ToString() const; + private: // REQUIRES: magic number wasn't initialized. 
void set_table_magic_number(uint64_t magic_number) { diff --git a/table/table_reader.h b/table/table_reader.h index 2f6360ad1..d3801442e 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -67,6 +67,11 @@ class TableReader { // key is the key to search for virtual Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context) = 0; + + // convert db file to a human readable form + virtual Status DumpTable(WritableFile* out_file) { + return Status::NotSupported("DumpTable() not supported"); + } }; } // namespace rocksdb diff --git a/util/sst_dump_test.cc b/util/sst_dump_test.cc new file mode 100644 index 000000000..6b980ddb4 --- /dev/null +++ b/util/sst_dump_test.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include "rocksdb/sst_dump_tool.h" + +#include "rocksdb/filter_policy.h" +#include "table/block_based_table_factory.h" +#include "table/table_builder.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +const uint32_t optLength = 100; + +namespace { +static std::string MakeKey(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "k_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +static std::string MakeValue(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "v_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +void createSST(const std::string& file_name, + const BlockBasedTableOptions& table_options) { + std::shared_ptr tf; + tf.reset(new rocksdb::BlockBasedTableFactory(table_options)); + + unique_ptr file; + Env* env = Env::Default(); + EnvOptions env_options; + ReadOptions read_options; + Options opts; + const ImmutableCFOptions imoptions(opts); + rocksdb::InternalKeyComparator ikc(opts.comparator); + TableBuilder* tb = nullptr; + + env->NewWritableFile(file_name, &file, env_options); + opts.table_factory = tf; + tb = opts.table_factory->NewTableBuilder(imoptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions()); + + // Populate slightly more than 1K keys + uint32_t num_keys = 1024; + for (uint32_t i = 0; i < num_keys; i++) { + tb->Add(MakeKey(i), MakeValue(i)); + } + tb->Finish(); + file->Close(); +} + +void cleanup(const std::string& file_name) { + Env* env = Env::Default(); + env->DeleteFile(file_name); + std::string outfile_name = file_name.substr(0, file_name.length() - 4); + outfile_name.append("_dump.txt"); + env->DeleteFile(outfile_name); +} +} // namespace + +// Test for sst dump tool "raw" mode +class SSTDumpToolTest { + public: + BlockBasedTableOptions table_options_; + + SSTDumpToolTest() {} + + ~SSTDumpToolTest() {} +}; + +TEST(SSTDumpToolTest, EmptyFilter) { + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], 
optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST(SSTDumpToolTest, FilterBlock) { + table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST(SSTDumpToolTest, FullFilterBlock) { + table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index d0bef3e36..1d0270c72 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -73,6 +73,15 @@ Status SstFileReader::NewTableReader(const std::string& file_path) { return s; } +Status SstFileReader::DumpTable(const std::string& out_filename) { + unique_ptr out_file; + Env* env = Env::Default(); + env->NewWritableFile(out_filename, &out_file, soptions_); + Status s = table_reader_->DumpTable(out_file.get()); + out_file->Close(); + return s; +} + Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, RandomAccessFile* file, uint64_t file_size) { @@ -206,7 +215,7 @@ namespace { void print_help() { fprintf(stderr, - "sst_dump [--command=check|scan|none] [--verify_checksum] " + "sst_dump [--command=check|scan|none|raw] [--verify_checksum] " "--file=data_dir_OR_sst_file" " [--output_hex]" " [--input_key_hex]" @@ -235,7 +244,7 @@ string HexToString(const string& str) { } // namespace -void SSTDumpTool::Run(int argc, char** argv) { +int SSTDumpTool::Run(int argc, char** argv) { const char* dir_or_file = nullptr; uint64_t read_num = -1; std::string command; @@ -318,8 +327,29 @@ void SSTDumpTool::Run(int argc, char** argv) { if (dir) { filename = std::string(dir_or_file) + "/" + filename; } + rocksdb::SstFileReader reader(filename, verify_checksum, output_hex); + if (!reader.getStatus().ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + reader.getStatus().ToString().c_str()); + exit(1); + } + + if (command == "raw") { + std::string out_filename = filename.substr(0, filename.length() - 4); + out_filename.append("_dump.txt"); + + st = reader.DumpTable(out_filename); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + exit(1); + } else { + fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]); + } + continue; + } + // scan all files in give file path. 
if (command == "" || command == "scan" || command == "check") { st = reader.ReadSequential(command != "check", @@ -360,6 +390,7 @@ void SSTDumpTool::Run(int argc, char** argv) { } } } + return 0; } } // namespace rocksdb diff --git a/util/sst_dump_tool_imp.h b/util/sst_dump_tool_imp.h index 833f62a42..7e975a534 100644 --- a/util/sst_dump_tool_imp.h +++ b/util/sst_dump_tool_imp.h @@ -49,6 +49,9 @@ class SstFileReader { uint64_t GetReadNumber() { return read_num_; } TableProperties* GetInitTableProperties() { return table_properties_.get(); } + Status DumpTable(const std::string& out_filename); + Status getStatus() { return init_result_; } + private: Status NewTableReader(const std::string& file_path); Status ReadTableProperties(uint64_t table_magic_number, From a944afd3565e807eee455d6fb02acd822c63dca4 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 23 Dec 2014 16:17:53 -0800 Subject: [PATCH 649/829] Fixed a compile error in db/db_impl.cc on ROCKSDB_LITE --- db/db_impl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index 2bafc8f81..2764c8cfd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1131,6 +1131,7 @@ Status DBImpl::FlushMemTableToOutputFile( void DBImpl::NotifyOnFlushCompleted( ColumnFamilyData* cfd, uint64_t file_number, const MutableCFOptions& mutable_cf_options) { +#ifndef ROCKSDB_LITE if (cfd->ioptions()->listeners.size() == 0U) { return; } @@ -1157,6 +1158,7 @@ void DBImpl::NotifyOnFlushCompleted( assert(notifying_events_ >= 0); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. +#endif // ROCKSDB_LITE } Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, From ddc81440d5fb35a7ae1d0652bd81240cc0aff446 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 22 Dec 2014 18:39:28 -0800 Subject: [PATCH 650/829] db_bench to add an option as number of hot column families to add to Summary: Add option --num_hot_column_families in db_bench. If it is set, write options will first write to that number of column families, and then move on to next set of hot column families. The working set of column families can be smaller than total number of CFs. It is to test how RocksDB can handle cold column families Test Plan: Run db_bench with --num_hot_column_families set and not set. Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30663 --- db/db_bench.cc | 138 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 111 insertions(+), 27 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 8562d04aa..56e649add 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -153,6 +153,13 @@ DEFINE_int64(merge_keys, -1, "If negative, there will be FLAGS_num keys."); DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); +DEFINE_int32( + num_hot_column_families, 8, + "Number of Hot Column Families. If more than 0, only write to this " + "number of column families. After finishing all the writes to them, " + "create new set of column families and insert to them. Only used " + "when num_column_families > 1."); + DEFINE_int64(reads, -1, "Number of read operations to do. 
" "If negative, do FLAGS_num reads."); @@ -390,6 +397,16 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); return rocksdb::kSnappyCompression; //default value } + +std::string ColumnFamilyName(int i) { + if (i == 0) { + return rocksdb::kDefaultColumnFamilyName; + } else { + char name[100]; + snprintf(name, sizeof(name), "column_family_name_%06d", i); + return std::string(name); + } +} } // namespace DEFINE_string(compression_type, "snappy", @@ -475,6 +492,7 @@ DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for" DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files" " in MB."); +DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer, "Allow buffered io using OS buffers"); @@ -779,9 +797,50 @@ static void AppendWithSpace(std::string* str, Slice msg) { struct DBWithColumnFamilies { std::vector cfh; DB* db; + std::atomic num_created; // Need to be updated after all the + // new entries in cfh are set. + size_t num_hot; // Number of column families to be queried at each moment. + // After each CreateNewCf(), another num_hot number of new + // Column families will be created and used to be queried. + port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() + DBWithColumnFamilies() : db(nullptr) { cfh.clear(); } + + DBWithColumnFamilies(const DBWithColumnFamilies& other) + : cfh(other.cfh), + db(other.db), + num_created(other.num_created.load()), + num_hot(other.num_hot) {} + + ColumnFamilyHandle* GetCfh(int64_t rand_num) { + assert(num_hot > 0); + return cfh[num_created.load(std::memory_order_acquire) - num_hot + + rand_num % num_hot]; + } + + // stage: assume CF from 0 to stage * num_hot has be created. Need to create + // stage * num_hot + 1 to stage * (num_hot + 1). + void CreateNewCf(ColumnFamilyOptions options, int64_t stage) { + MutexLock l(&create_cf_mutex); + if ((stage + 1) * num_hot <= num_created) { + // Already created. + return; + } + auto new_num_created = num_created + num_hot; + assert(new_num_created <= cfh.size()); + for (size_t i = num_created; i < new_num_created; i++) { + Status s = + db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i])); + if (!s.ok()) { + fprintf(stderr, "create column family error: %s\n", + s.ToString().c_str()); + abort(); + } + } + num_created.store(new_num_created, std::memory_order_release); + } }; class Stats { @@ -888,8 +947,8 @@ class Stats { if (FLAGS_stats_per_interval) { std::string stats; - if (db_with_cfh && db_with_cfh->cfh.size()) { - for (size_t i = 0; i < db_with_cfh->cfh.size(); ++i) { + if (db_with_cfh && db_with_cfh->num_created.load()) { + for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) { if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", &stats)) fprintf(stderr, "%s\n", stats.c_str()); @@ -994,13 +1053,16 @@ struct ThreadState { class Duration { public: - Duration(int max_seconds, int64_t max_ops) { + Duration(int max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) { max_seconds_ = max_seconds; max_ops_= max_ops; + ops_per_stage_ = (ops_per_stage > 0) ? 
ops_per_stage : max_ops; ops_ = 0; start_at_ = FLAGS_env->NowMicros(); } + int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } + bool Done(int64_t increment) { if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops ops_ += increment; @@ -1021,6 +1083,7 @@ class Duration { private: int max_seconds_; int64_t max_ops_; + int64_t ops_per_stage_; int64_t ops_; double start_at_; }; @@ -1040,6 +1103,7 @@ class Benchmark { int64_t keys_per_prefix_; int64_t entries_per_batch_; WriteOptions write_options_; + Options open_options_; // keep options around to properly destroy db later int64_t reads_; int64_t writes_; int64_t readwrites_; @@ -1355,24 +1419,12 @@ class Benchmark { return base_name + ToString(id); } - std::string ColumnFamilyName(int i) { - if (i == 0) { - return kDefaultColumnFamilyName; - } else { - char name[100]; - snprintf(name, sizeof(name), "column_family_name_%06d", i); - return std::string(name); - } - } - void Run() { - Options open_options; // keep options around to properly destroy db later - if (!SanityCheck()) { exit(1); } PrintHeader(); - Open(&open_options); + Open(&open_options_); const char* benchmarks = FLAGS_benchmarks.c_str(); while (benchmarks != nullptr) { const char* sep = strchr(benchmarks, ','); @@ -1533,15 +1585,15 @@ class Benchmark { delete db_.db; db_.db = nullptr; db_.cfh.clear(); - DestroyDB(FLAGS_db, open_options); + DestroyDB(FLAGS_db, open_options_); } for (size_t i = 0; i < multi_dbs_.size(); i++) { delete multi_dbs_[i].db; - DestroyDB(GetDbNameForMultiple(FLAGS_db, i), open_options); + DestroyDB(GetDbNameForMultiple(FLAGS_db, i), open_options_); } multi_dbs_.clear(); } - Open(&open_options); // use open_options for the last accessed + Open(&open_options_); // use open_options for the last accessed } if (method != nullptr) { @@ -1996,6 +2048,8 @@ class Benchmark { options.compression_opts.level = FLAGS_compression_level; options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + options.max_total_wal_size = FLAGS_max_total_wal_size; + if (FLAGS_min_level_to_compress >= 0) { assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); options.compression_per_level.resize(FLAGS_num_levels); @@ -2077,9 +2131,15 @@ class Benchmark { Status s; // Open with column families if necessary. 
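    // With --num_hot_column_families set (see the DEFINE above), only the
    // first num_hot column families are opened eagerly below; cfh is still
    // sized to FLAGS_num_column_families and the remaining handles are filled
    // in lazily by CreateNewCf() as the write loop advances through stages.
    // A sketch of an invocation exercising this path (flag values are
    // illustrative only):
    //   ./db_bench --benchmarks=fillrandom --num_column_families=64 \
    //              --num_hot_column_families=8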
if (FLAGS_num_column_families > 1) { - db->cfh.resize(FLAGS_num_column_families); + size_t num_hot = FLAGS_num_column_families; + if (FLAGS_num_hot_column_families > 0 && + FLAGS_num_hot_column_families < FLAGS_num_column_families) { + num_hot = FLAGS_num_hot_column_families; + } else { + FLAGS_num_hot_column_families = FLAGS_num_column_families; + } std::vector column_families; - for (int i = 0; i < FLAGS_num_column_families; i++) { + for (size_t i = 0; i < num_hot; i++) { column_families.push_back(ColumnFamilyDescriptor( ColumnFamilyName(i), ColumnFamilyOptions(options))); } @@ -2089,6 +2149,10 @@ class Benchmark { } else { s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); } + db->cfh.resize(FLAGS_num_column_families); + db->num_created = num_hot; + db->num_hot = num_hot; + } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); } else { @@ -2185,9 +2249,18 @@ class Benchmark { num_key_gens = multi_dbs_.size(); } std::vector> key_gens(num_key_gens); - Duration duration(test_duration, num_ops * num_key_gens); + int64_t max_ops = num_ops * num_key_gens; + int64_t ops_per_stage = max_ops; + if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) { + ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families / + FLAGS_num_hot_column_families) + + 1; + } + + Duration duration(test_duration, max_ops, ops_per_stage); for (size_t i = 0; i < num_key_gens; i++) { - key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops)); + key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops, + ops_per_stage)); } if (num_ != FLAGS_num) { @@ -2203,7 +2276,18 @@ class Benchmark { Slice key = AllocateKey(); std::unique_ptr key_guard(key.data()); + int64_t stage = 0; while (!duration.Done(entries_per_batch_)) { + if (duration.GetStage() != stage) { + stage = duration.GetStage(); + if (db_.db != nullptr) { + db_.CreateNewCf(open_options_, stage); + } else { + for (auto& db : multi_dbs_) { + db.CreateNewCf(open_options_, stage); + } + } + } size_t id = thread->rand.Next() % num_key_gens; DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id); batch.Clear(); @@ -2216,8 +2300,8 @@ class Benchmark { // We use same rand_num as seed for key and column family so that we // can deterministically find the cfh corresponding to a particular // key while reading the key. 
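        // GetCfh() (defined on DBWithColumnFamilies above) maps rand_num onto
        // the currently hot handles:
        //   cfh[num_created - num_hot + rand_num % num_hot]
        // e.g. with num_hot = 4 and num_created = 12 the write below lands in
        // one of cfh[8..11].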
- batch.Put(db_with_cfh->cfh[rand_num % db_with_cfh->cfh.size()], - key, gen.Generate(value_size_)); + batch.Put(db_with_cfh->GetCfh(rand_num), key, + gen.Generate(value_size_)); } bytes += value_size_ + key_size_; } @@ -2343,8 +2427,8 @@ class Benchmark { read++; Status s; if (FLAGS_num_column_families > 1) { - s = db_with_cfh->db->Get(options, - db_with_cfh->cfh[key_rand % db_with_cfh->cfh.size()], key, &value); + s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key, + &value); } else { s = db_with_cfh->db->Get(options, key, &value); } From 2067058a608be3bb070523f598d124deb39b18aa Mon Sep 17 00:00:00 2001 From: Manish Patil Date: Sun, 30 Nov 2014 19:00:31 -0800 Subject: [PATCH 651/829] Dump routine to BlockBasedTableReader (valgrind) Summary: Fixed valgrind issue Test Plan: valgrind check done Reviewers: rven, sdong Reviewed By: sdong Subscribers: sdong, dhruba Differential Revision: https://reviews.facebook.net/D30699 --- util/sst_dump_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/sst_dump_test.cc b/util/sst_dump_test.cc index 6b980ddb4..f3fa1664d 100644 --- a/util/sst_dump_test.cc +++ b/util/sst_dump_test.cc @@ -47,13 +47,13 @@ void createSST(const std::string& file_name, Options opts; const ImmutableCFOptions imoptions(opts); rocksdb::InternalKeyComparator ikc(opts.comparator); - TableBuilder* tb = nullptr; + unique_ptr tb; env->NewWritableFile(file_name, &file, env_options); opts.table_factory = tf; - tb = opts.table_factory->NewTableBuilder(imoptions, ikc, file.get(), - CompressionType::kNoCompression, - CompressionOptions()); + tb.reset(opts.table_factory->NewTableBuilder(imoptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions())); // Populate slightly more than 1K keys uint32_t num_keys = 1024; From a801c1fb099167cf48a714483163061062e3dcb7 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 24 Dec 2014 08:54:28 -0800 Subject: [PATCH 652/829] db_bench --num_hot_column_families to be default off Summary: Having --num_hot_column_families default on fails some existing regression tests. By default turn it off Test Plan: Run db_bench to make sure it is default off. Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D30705 --- db/db_bench.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 56e649add..5e4bbd9d1 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -154,7 +154,7 @@ DEFINE_int64(merge_keys, -1, DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); DEFINE_int32( - num_hot_column_families, 8, + num_hot_column_families, 0, "Number of Hot Column Families. If more than 0, only write to this " "number of column families. After finishing all the writes to them, " "create new set of column families and insert to them. Only used " From bf287b76e0e7b5998de49e3ceaa2b34d1f3c13ae Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 30 Dec 2014 10:39:13 -0800 Subject: [PATCH 653/829] Add structures for exposing thread events and operations. Summary: Add structures for exposing events and operations. Event describes high-level action about a thread such as doing compaciton or doing flush, while an operation describes lower-level action of a thread such as reading / writing a SST table, waiting for mutex. Events and operations are designed to be independent. One thread would typically involve in one event and one operation. 
Code instrument will be in a separate diff. Test Plan: Add unit-tests in thread_list_test make dbg -j32 ./thread_list_test export ROCKSDB_TESTS=ThreadList ./db_test Reviewers: ljin, igor, sdong Reviewed By: sdong Subscribers: rven, jonahcohen, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D29781 --- db/db_test.cc | 8 +- include/rocksdb/thread_status.h | 57 +++++-- util/env_posix.cc | 4 +- util/thread_list_test.cc | 283 ++++++++++++++++++++++++++------ util/thread_operation.h | 68 ++++++++ util/thread_status_updater.cc | 61 +++++-- util/thread_status_updater.h | 35 ++-- 7 files changed, 417 insertions(+), 99 deletions(-) create mode 100644 util/thread_operation.h diff --git a/db/db_test.cc b/db/db_test.cc index a371cfd9e..455d6cb7e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9435,10 +9435,10 @@ TEST(DBTest, GetThreadList) { env_->SleepForMicroseconds(100000); s = env_->GetThreadList(&thread_list); ASSERT_OK(s); - unsigned int thread_type_counts[ThreadStatus::ThreadType::TOTAL]; + unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; memset(thread_type_counts, 0, sizeof(thread_type_counts)); for (auto thread : thread_list) { - ASSERT_LT(thread.thread_type, ThreadStatus::ThreadType::TOTAL); + ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); thread_type_counts[thread.thread_type]++; } // Verify the total number of threades @@ -9447,11 +9447,11 @@ TEST(DBTest, GetThreadList) { kHighPriCounts[test] + kLowPriCounts[test]); // Verify the number of high-priority threads ASSERT_EQ( - thread_type_counts[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], + thread_type_counts[ThreadStatus::HIGH_PRIORITY], kHighPriCounts[test]); // Verify the number of low-priority threads ASSERT_EQ( - thread_type_counts[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], + thread_type_counts[ThreadStatus::LOW_PRIORITY], kLowPriCounts[test]); } if (i == 0) { diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index f622aa405..57a87a21a 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -2,6 +2,14 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines the structures for exposing run-time status of any +// rocksdb-related thread. Such run-time status can be obtained via +// GetThreadList() API. +// +// Note that all thread-status features are still under-development, and +// thus APIs and class definitions might subject to change at this point. +// Will remove this comment once the APIs have been finalized. #pragma once @@ -22,29 +30,48 @@ namespace rocksdb { // The status of active threads can be fetched using // rocksdb::GetThreadList(). struct ThreadStatus { - enum ThreadType { - ROCKSDB_HIGH_PRIORITY = 0x0, - ROCKSDB_LOW_PRIORITY = 0x1, - USER_THREAD = 0x2, - TOTAL = 0x3 + // The type of a thread. + enum ThreadType : int { + HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + NUM_THREAD_TYPES + }; + + // The type used to refer to a thread operation. + // A thread operation describes high-level action of a thread. + // Examples include compaction and flush. 
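  // A minimal sketch of how a caller might consume these fields through the
  // Env::GetThreadList() API used in the tests below (local variable names
  // are illustrative only):
  //
  //   std::vector<ThreadStatus> thread_list;
  //   if (Env::Default()->GetThreadList(&thread_list).ok()) {
  //     for (const auto& ts : thread_list) {
  //       if (ts.operation_type == ThreadStatus::OP_COMPACTION) {
  //         // e.g. report which db_name / cf_name the compaction touches
  //       }
  //     }
  //   }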
+ enum OperationType : int { + OP_UNKNOWN = 0, + OP_COMPACTION, + OP_FLUSH, + NUM_OP_TYPES + }; + + // The type used to refer to a thread state. + // A state describes lower-level action of a thread + // such as reading / writing a file or waiting for a mutex. + enum StateType : int { + STATE_UNKNOWN = 0, + NUM_STATE_TYPES }; -#if ROCKSDB_USING_THREAD_STATUS ThreadStatus(const uint64_t _id, const ThreadType _thread_type, const std::string& _db_name, const std::string& _cf_name, - const std::string& _event) : + const OperationType _operation_type, + const StateType _state_type) : thread_id(_id), thread_type(_thread_type), db_name(_db_name), cf_name(_cf_name), - event(_event) {} + operation_type(_operation_type), state_type(_state_type) {} // An unique ID for the thread. const uint64_t thread_id; - // The type of the thread, it could be ROCKSDB_HIGH_PRIORITY, - // ROCKSDB_LOW_PRIORITY, and USER_THREAD + // The type of the thread, it could be HIGH_PRIORITY, + // LOW_PRIORITY, and USER const ThreadType thread_type; // The name of the DB instance where the thread is currently @@ -57,11 +84,11 @@ struct ThreadStatus { // in any column family. const std::string cf_name; - // The event that the current thread is involved. - // It would be set to empty string if the information about event - // is not currently available. - const std::string event; -#endif // ROCKSDB_USING_THREAD_STATUS + // The operation (high-level action) that the current thread is involved. + const OperationType operation_type; + + // The state (lower-level action) that the current thread is involved. + const StateType state_type; }; } // namespace rocksdb diff --git a/util/env_posix.cc b/util/env_posix.cc index 5bad58466..9e1e4da5b 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1694,8 +1694,8 @@ class PosixEnv : public Env { // for thread-status ThreadStatusUtil::SetThreadType(tp->env_, (tp->GetThreadPriority() == Env::Priority::HIGH ? 
- ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY : - ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY)); + ThreadStatus::HIGH_PRIORITY : + ThreadStatus::LOW_PRIORITY)); #endif delete meta; tp->BGThread(thread_id); diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index 12ad14719..86ce1c4d9 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -14,46 +14,65 @@ namespace rocksdb { -class SleepingBackgroundTask { +class SimulatedBackgroundTask { public: - SleepingBackgroundTask(const void* db_key, const std::string& db_name, - const void* cf_key, const std::string& cf_name) + SimulatedBackgroundTask( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name, + const ThreadStatus::OperationType operation_type = + ThreadStatus::OP_UNKNOWN, + const ThreadStatus::StateType state_type = + ThreadStatus::STATE_UNKNOWN) : db_key_(db_key), db_name_(db_name), cf_key_(cf_key), cf_name_(cf_name), - should_sleep_(true), sleeping_count_(0) { + operation_type_(operation_type), state_type_(state_type), + should_run_(true), running_count_(0) { Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( db_key_, db_name_, cf_key_, cf_name_); } - ~SleepingBackgroundTask() { + ~SimulatedBackgroundTask() { Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); } - void DoSleep() { - Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + void Run() { std::unique_lock l(mutex_); - sleeping_count_++; - while (should_sleep_) { + running_count_++; + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( + operation_type_); + Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_); + while (should_run_) { bg_cv_.wait(l); } - sleeping_count_--; - bg_cv_.notify_all(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(0); + running_count_--; + bg_cv_.notify_all(); } - void WakeUp() { + + void FinishAllTasks() { std::unique_lock l(mutex_); - should_sleep_ = false; + should_run_ = false; bg_cv_.notify_all(); } + + void WaitUntilScheduled(int job_count, Env* env) { + while (running_count_ < job_count) { + env->SleepForMicroseconds(1000); + } + } + void WaitUntilDone() { std::unique_lock l(mutex_); - while (sleeping_count_ > 0) { + while (running_count_ > 0) { bg_cv_.wait(l); } } - static void DoSleepTask(void* arg) { - reinterpret_cast(arg)->DoSleep(); + static void DoSimulatedTask(void* arg) { + reinterpret_cast(arg)->Run(); } private: @@ -61,10 +80,12 @@ class SleepingBackgroundTask { const std::string db_name_; const void* cf_key_; const std::string cf_name_; + const ThreadStatus::OperationType operation_type_; + const ThreadStatus::StateType state_type_; std::mutex mutex_; std::condition_variable bg_cv_; - bool should_sleep_; - std::atomic sleeping_count_; + bool should_run_; + std::atomic running_count_; }; class ThreadListTest { @@ -73,72 +94,232 @@ class ThreadListTest { } }; +TEST(ThreadListTest, EventTables) { + // verify the global tables for operations and states are properly indexed. 
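  // global_operation_table and global_state_table are presumably defined in
  // the new util/thread_operation.h added by this patch; the checks below
  // rely on entry i of each table describing enum value i, so the tables can
  // be indexed directly by OperationType / StateType.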
+ for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { + ASSERT_EQ(global_operation_table[type].type, type); + } + + for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { + ASSERT_EQ(global_state_table[type].type, type); + } +} + TEST(ThreadListTest, SimpleColumnFamilyInfoTest) { Env* env = Env::Default(); const int kHighPriorityThreads = 3; const int kLowPriorityThreads = 5; - const int kSleepingHighPriThreads = kHighPriorityThreads - 1; - const int kSleepingLowPriThreads = kLowPriorityThreads / 3; + const int kSimulatedHighPriThreads = kHighPriorityThreads - 1; + const int kSimulatedLowPriThreads = kLowPriorityThreads / 3; env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); - SleepingBackgroundTask sleeping_task( - reinterpret_cast(1234), "sleeping", + SimulatedBackgroundTask running_task( + reinterpret_cast(1234), "running", reinterpret_cast(5678), "pikachu"); - for (int test = 0; test < kSleepingHighPriThreads; ++test) { - env->Schedule(&SleepingBackgroundTask::DoSleepTask, - &sleeping_task, Env::Priority::HIGH); + for (int test = 0; test < kSimulatedHighPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::HIGH); } - for (int test = 0; test < kSleepingLowPriThreads; ++test) { - env->Schedule(&SleepingBackgroundTask::DoSleepTask, - &sleeping_task, Env::Priority::LOW); + for (int test = 0; test < kSimulatedLowPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::LOW); } - - // make sure everything is scheduled. - env->SleepForMicroseconds(10000); + running_task.WaitUntilScheduled( + kSimulatedHighPriThreads + kSimulatedLowPriThreads, env); std::vector thread_list; - // Verify the number of sleeping threads in each pool. + // Verify the number of running threads in each pool. 
env->GetThreadList(&thread_list); - int sleeping_count[ThreadStatus::ThreadType::TOTAL] = {0}; + int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; for (auto thread_status : thread_list) { if (thread_status.cf_name == "pikachu" && - thread_status.db_name == "sleeping") { - sleeping_count[thread_status.thread_type]++; + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; } } ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], - kSleepingHighPriThreads); + running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], - kSleepingLowPriThreads); + running_count[ThreadStatus::LOW_PRIORITY], + kSimulatedLowPriThreads); ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::USER_THREAD], 0); + running_count[ThreadStatus::USER], 0); - sleeping_task.WakeUp(); - sleeping_task.WaitUntilDone(); + running_task.FinishAllTasks(); + running_task.WaitUntilDone(); - // Verify none of the threads are sleeping + // Verify none of the threads are running env->GetThreadList(&thread_list); - for (int i = 0; i < ThreadStatus::ThreadType::TOTAL; ++i) { - sleeping_count[i] = 0; - } + for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { + running_count[i] = 0; + } for (auto thread_status : thread_list) { if (thread_status.cf_name == "pikachu" && - thread_status.db_name == "sleeping") { - sleeping_count[thread_status.thread_type]++; + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; } } + ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::ROCKSDB_HIGH_PRIORITY], 0); + running_count[ThreadStatus::HIGH_PRIORITY], 0); ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::ROCKSDB_LOW_PRIORITY], 0); + running_count[ThreadStatus::LOW_PRIORITY], 0); ASSERT_EQ( - sleeping_count[ThreadStatus::ThreadType::USER_THREAD], 0); + running_count[ThreadStatus::USER], 0); +} + +namespace { + void UpdateStatusCounts( + const std::vector& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; + } + } + + void VerifyAndResetCounts( + const int correct_counts[], int collected_counts[], int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; + } + } + + void UpdateCount( + int operation_counts[], int from_event, int to_event, int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; + } +} // namespace + +TEST(ThreadListTest, SimpleEventTest) { + Env* env = Env::Default(); + + // simulated tasks + const int kFlushWriteTasks = 3; + SimulatedBackgroundTask flush_write_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_FLUSH); + + const int kCompactionWriteTasks = 4; + SimulatedBackgroundTask compaction_write_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionReadTasks = 5; + SimulatedBackgroundTask compaction_read_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionWaitTasks = 6; + SimulatedBackgroundTask compaction_wait_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + // setup right answers + int 
correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + correct_operation_counts[ThreadStatus::OP_FLUSH] = + kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_COMPACTION] = + kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; + + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH); + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW); + + // schedule the simulated tasks + for (int t = 0; t < kFlushWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &flush_write_task, Env::Priority::HIGH); + } + flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env); + + for (int t = 0; t < kCompactionWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_write_task, Env::Priority::LOW); + } + compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env); + + for (int t = 0; t < kCompactionReadTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_read_task, Env::Priority::LOW); + } + compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env); + + for (int t = 0; t < kCompactionWaitTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_wait_task, Env::Priority::LOW); + } + compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env); + + // verify the thread-status + int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0}; + + std::vector thread_list; + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-wait tasks and see if the thread-status + // reflects this update + compaction_wait_task.FinishAllTasks(); + compaction_wait_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate flush-write tasks and see if the thread-status + // reflects this update + flush_write_task.FinishAllTasks(); + flush_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH, + ThreadStatus::OP_UNKNOWN, kFlushWriteTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-write tasks and see if the thread-status + // reflects this update + compaction_write_task.FinishAllTasks(); + compaction_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-write tasks and see if the thread-status + // reflects this update + compaction_read_task.FinishAllTasks(); + compaction_read_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionReadTasks); 
+ + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); } } // namespace rocksdb diff --git a/util/thread_operation.h b/util/thread_operation.h new file mode 100644 index 000000000..b4326f5bd --- /dev/null +++ b/util/thread_operation.h @@ -0,0 +1,68 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines the structures for thread operation and state. +// Thread operations are used to describe high level action of a +// thread such as doing compaction or flush, while thread state +// are used to describe lower-level action such as reading / +// writing a file or waiting for a mutex. Operations and states +// are designed to be independent. Typically, a thread usually involves +// in one operation and one state at any specific point in time. + +#pragma once + +#include "include/rocksdb/thread_status.h" + +#include + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS + +// The structure that describes a major thread operation. +struct OperationInfo { + const ThreadStatus::OperationType type; + const std::string name; +}; + +// The global operation table. +// +// When updating a status of a thread, the pointer of the OperationInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +// +// Note that it's not designed to be constant as in the future we +// might consider adding global count to the OperationInfo. +static OperationInfo global_operation_table[] = { + {ThreadStatus::OP_UNKNOWN, ""}, + {ThreadStatus::OP_COMPACTION, "Compaction"}, + {ThreadStatus::OP_FLUSH, "Flush"} +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. 
+static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, +}; + +#else + +struct OperationInfo { +}; + +struct StateInfo { +}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_updater.cc b/util/thread_status_updater.cc index 0a4336251..119174db5 100644 --- a/util/thread_status_updater.cc +++ b/util/thread_status_updater.cc @@ -34,10 +34,28 @@ void ThreadStatusUpdater::SetColumnFamilyInfoKey( data->cf_key.store(cf_key, std::memory_order_relaxed); } -void ThreadStatusUpdater::SetEventInfoPtr( - const ThreadEventInfo* event_info) { +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType type) { auto* data = InitAndGet(); - data->event_info.store(event_info, std::memory_order_relaxed); + data->operation_type.store(type, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadOperation() { + auto* data = InitAndGet(); + data->operation_type.store( + ThreadStatus::OP_UNKNOWN, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadState( + const ThreadStatus::StateType type) { + auto* data = InitAndGet(); + data->state_type.store(type, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadState() { + auto* data = InitAndGet(); + data->state_type.store( + ThreadStatus::STATE_UNKNOWN, std::memory_order_relaxed); } Status ThreadStatusUpdater::GetThreadList( @@ -50,30 +68,35 @@ Status ThreadStatusUpdater::GetThreadList( assert(thread_data); auto thread_type = thread_data->thread_type.load( std::memory_order_relaxed); + // Since any change to cf_info_map requires thread_list_mutex, + // which is currently held by GetThreadList(), here we can safely + // use "memory_order_relaxed" to load the cf_key. auto cf_key = thread_data->cf_key.load( std::memory_order_relaxed); auto iter = cf_info_map_.find(cf_key); assert(cf_key == 0 || iter != cf_info_map_.end()); auto* cf_info = iter != cf_info_map_.end() ? iter->second.get() : nullptr; - auto* event_info = thread_data->event_info.load( - std::memory_order_relaxed); const std::string* db_name = nullptr; const std::string* cf_name = nullptr; - const std::string* event_name = nullptr; + ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN; + ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN; if (cf_info != nullptr) { db_name = &cf_info->db_name; cf_name = &cf_info->cf_name; + op_type = thread_data->operation_type.load( + std::memory_order_relaxed); // display lower-level info only when higher-level info is available. - if (event_info != nullptr) { - event_name = &event_info->event_name; + if (op_type != ThreadStatus::OP_UNKNOWN) { + state_type = thread_data->state_type.load( + std::memory_order_relaxed); } } thread_list->emplace_back( thread_data->thread_id, thread_type, db_name ? *db_name : "", cf_name ? *cf_name : "", - event_name ? *event_name : ""); + op_type, state_type); } return Status::OK(); @@ -93,6 +116,8 @@ ThreadStatusData* ThreadStatusUpdater::InitAndGet() { void ThreadStatusUpdater::NewColumnFamilyInfo( const void* db_key, const std::string& db_name, const void* cf_key, const std::string& cf_name) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). 
std::lock_guard lck(thread_list_mutex_); cf_info_map_[cf_key].reset( @@ -101,6 +126,8 @@ void ThreadStatusUpdater::NewColumnFamilyInfo( } void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). std::lock_guard lck(thread_list_mutex_); auto cf_pair = cf_info_map_.find(cf_key); assert(cf_pair != cf_info_map_.end()); @@ -122,6 +149,8 @@ void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { } void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). std::lock_guard lck(thread_list_mutex_); auto db_pair = db_key_map_.find(db_key); if (UNLIKELY(db_pair == db_key_map_.end())) { @@ -154,8 +183,18 @@ void ThreadStatusUpdater::SetColumnFamilyInfoKey( const void* cf_key) { } -void ThreadStatusUpdater::SetEventInfoPtr( - const ThreadEventInfo* event_info) { +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType type) { +} + +void ThreadStatusUpdater::ClearThreadOperation() { +} + +void ThreadStatusUpdater::SetThreadState( + const ThreadStatus::StateType type) { +} + +void ThreadStatusUpdater::ClearThreadState() { } Status ThreadStatusUpdater::GetThreadList( diff --git a/util/thread_status_updater.h b/util/thread_status_updater.h index e0434cd21..8cb80022f 100644 --- a/util/thread_status_updater.h +++ b/util/thread_status_updater.h @@ -22,7 +22,7 @@ // should be ignored. // // The high to low level information would be: -// thread_id > thread_type > db > cf > event > event_count > event_details +// thread_id > thread_type > db > cf > operation > state // // This means user might not always get full information, but whenever // returned by the GetThreadList() is guaranteed to be consistent. @@ -37,6 +37,7 @@ #include "rocksdb/status.h" #include "rocksdb/thread_status.h" #include "port/port_posix.h" +#include "util/thread_operation.h" namespace rocksdb { @@ -57,27 +58,21 @@ struct ConstantColumnFamilyInfo { #endif // ROCKSDB_USING_THREAD_STATUS }; -// The structure that describes an event. -struct ThreadEventInfo { -#if ROCKSDB_USING_THREAD_STATUS - public: - const std::string event_name; -#endif // ROCKSDB_USING_THREAD_STATUS -}; - // the internal data-structure that is used to reflect the current // status of a thread using a set of atomic pointers. struct ThreadStatusData { #if ROCKSDB_USING_THREAD_STATUS explicit ThreadStatusData() : thread_id(0) { - thread_type.store(ThreadStatus::ThreadType::USER_THREAD); + thread_type.store(ThreadStatus::USER); cf_key.store(0); - event_info.store(nullptr); + operation_type.store(ThreadStatus::OP_UNKNOWN); + state_type.store(ThreadStatus::STATE_UNKNOWN); } uint64_t thread_id; std::atomic thread_type; std::atomic cf_key; - std::atomic event_info; + std::atomic operation_type; + std::atomic state_type; #endif // ROCKSDB_USING_THREAD_STATUS }; @@ -103,12 +98,20 @@ class ThreadStatusUpdater { void SetThreadType(ThreadStatus::ThreadType ttype); // Update the column-family info of the current thread by setting - // its thread-local pointer of ThreadEventInfo to the correct entry. + // its thread-local pointer of ThreadStateInfo to the correct entry. void SetColumnFamilyInfoKey(const void* cf_key); - // Update the event info of the current thread by setting - // its thread-local pointer of ThreadEventInfo to the correct entry. 
- void SetEventInfoPtr(const ThreadEventInfo* event_info); + // Update the thread operation of the current thread. + void SetThreadOperation(const ThreadStatus::OperationType type); + + // Clear thread operation of the current thread. + void ClearThreadOperation(); + + // Update the thread state of the current thread. + void SetThreadState(const ThreadStatus::StateType type); + + // Clear the thread state of the current thread. + void ClearThreadState(); // Obtain the status of all active registered threads. Status GetThreadList( From e9ca3581579c03799856edad5b4413ad430f8263 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 30 Dec 2014 18:33:35 -0800 Subject: [PATCH 654/829] Fix CLANG build for db_bench Summary: CLANG was broken for a recent change in db_bench. Fix it. Test Plan: Build db_bench using CLANG. Reviewers: rven, igor, yhchiang Reviewed By: yhchiang Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30801 --- db/db_bench.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 5e4bbd9d1..3fc17d7b7 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -398,12 +398,12 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { return rocksdb::kSnappyCompression; //default value } -std::string ColumnFamilyName(int i) { +std::string ColumnFamilyName(size_t i) { if (i == 0) { return rocksdb::kDefaultColumnFamilyName; } else { char name[100]; - snprintf(name, sizeof(name), "column_family_name_%06d", i); + snprintf(name, sizeof(name), "column_family_name_%06zu", i); return std::string(name); } } From caa1fd0e0e26b520ff4fd2da25224a3372935a7d Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 4 Jan 2015 12:02:52 +0800 Subject: [PATCH 655/829] Improve performance when loading BackupMeta. * Use strtoul() and strtoull() instead of sscanf(). glibc's sscanf() will do an implicit strlen(). * Move implicit construction of Slice("crc32 ") out of loop.
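For illustration, a minimal standalone sketch of the parsing pattern this patch adopts (hypothetical names, not the actual backupable_db.cc code): strtoull() stops at the first non-numeric character and reports that position through its endptr argument, so each header field is parsed in time proportional to its own length, with no hidden strlen() over the rest of the buffer the way glibc's sscanf() incurs.

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    // Walk a "<timestamp>\n<sequence>\n<num_files>\n..." header with strtoull(),
    // advancing past each terminating '\n'. Header/ParseHeader are illustrative.
    struct Header {
      uint64_t timestamp;
      uint64_t sequence;
      uint32_t num_files;
    };

    static const char* ParseHeader(const char* p, Header* out) {
      char* next = nullptr;
      out->timestamp = strtoull(p, &next, 10);  // stops at the first non-digit
      p = next + 1;                             // +1 skips the '\n'
      out->sequence = strtoull(p, &next, 10);
      p = next + 1;
      out->num_files = static_cast<uint32_t>(strtoul(p, &next, 10));
      return next + 1;                          // first byte of the file list
    }

    int main() {
      const char buf[] = "1420416172\n12345\n3\nfile1.sst crc32 123\n";
      Header h;
      const char* rest = ParseHeader(buf, &h);
      assert(h.timestamp == 1420416172ULL && h.sequence == 12345ULL && h.num_files == 3U);
      assert(strncmp(rest, "file1.sst", 9) == 0);
      return 0;
    }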
--- utilities/backupable/backupable_db.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 269e9e9f1..ca7521ff3 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -20,6 +20,7 @@ #endif #include +#include #include #include #include @@ -1163,16 +1164,18 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( buf[data.size()] = 0; uint32_t num_files = 0; - int bytes_read = 0; - sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' - sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' - sscanf(data.data(), "%u%n", &num_files, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' + char *next; + timestamp_ = strtoull(data.data(), &next, 10); + data.remove_prefix(next - data.data() + 1); // +1 for '\n' + sequence_number_ = strtoull(data.data(), &next, 10); + data.remove_prefix(next - data.data() + 1); // +1 for '\n' + num_files = strtoul(data.data(), &next, 10); + data.remove_prefix(next - data.data() + 1); // +1 for '\n' std::vector files; + Slice checksum_prefix("crc32 "); + for (uint32_t i = 0; s.ok() && i < num_files; ++i) { auto line = GetSliceUntil(&data, '\n'); std::string filename = GetSliceUntil(&line, ' ').ToString(); @@ -1188,9 +1191,9 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( } uint32_t checksum_value = 0; - if (line.starts_with("crc32 ")) { - line.remove_prefix(6); - sscanf(line.data(), "%u", &checksum_value); + if (line.starts_with(checksum_prefix)) { + line.remove_prefix(checksum_prefix.size()); + checksum_value = strtoul(line.data(), nullptr, 10); if (memcmp(line.data(), std::to_string(checksum_value).c_str(), line.size() - 1) != 0) { return Status::Corruption("Invalid checksum value"); From a8c5564a9de2014306887d8121328caeffb6480d Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 4 Jan 2015 12:06:59 +0800 Subject: [PATCH 656/829] Do not issue extra GetFileSize() calls when loading BackupMeta. --- utilities/backupable/backupable_db.cc | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index ca7521ff3..2e04488cb 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -185,6 +185,13 @@ class BackupEngineImpl : public BackupEngine { return files_.empty(); } + const FileInfo* GetFile(const std::string& filename) const { + auto it = file_infos_->find(filename); + if (it == file_infos_->end()) + return nullptr; + return &it->second; + } + const std::vector& GetFiles() { return files_; } @@ -1181,9 +1188,14 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( std::string filename = GetSliceUntil(&line, ' ').ToString(); uint64_t size; - s = env_->GetFileSize(backup_dir + "/" + filename, &size); - if (!s.ok()) { - return s; + const FileInfo* file_info = GetFile(filename); + if (file_info != nullptr) { + size = file_info->size; + } else { + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + if (!s.ok()) { + return s; + } } if (line.empty()) { From 49376bfe87cb4bb8aa7667d1258a5248054d54a2 Mon Sep 17 00:00:00 2001 From: Robert Date: Mon, 5 Jan 2015 21:20:06 +0800 Subject: [PATCH 657/829] Fix errors when using -Wshorten-64-to-32. 
--- utilities/backupable/backupable_db.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 2e04488cb..2a526c940 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -1176,7 +1176,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( data.remove_prefix(next - data.data() + 1); // +1 for '\n' sequence_number_ = strtoull(data.data(), &next, 10); data.remove_prefix(next - data.data() + 1); // +1 for '\n' - num_files = strtoul(data.data(), &next, 10); + num_files = static_cast<uint32_t>(strtoul(data.data(), &next, 10)); data.remove_prefix(next - data.data() + 1); // +1 for '\n' std::vector<FileInfo> files; @@ -1205,7 +1205,8 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( uint32_t checksum_value = 0; if (line.starts_with(checksum_prefix)) { line.remove_prefix(checksum_prefix.size()); - checksum_value = strtoul(line.data(), nullptr, 10); + checksum_value = static_cast<uint32_t>( + strtoul(line.data(), nullptr, 10)); if (memcmp(line.data(), std::to_string(checksum_value).c_str(), line.size() - 1) != 0) { return Status::Corruption("Invalid checksum value"); From d7b4bb62a74f7da155c6da39309b78bc87160fe4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 5 Jan 2015 10:26:34 -0800 Subject: [PATCH 658/829] Fail DB::Open() on WAL corruption Summary: This is a serious bug. If paranoid_checks == true and WAL is corrupted, we don't fail DB::Open(). I tried going into history and it seems we've been doing this for a long long time. I found this when investigating t5852041. Test Plan: Added unit test to verify correct behavior. Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30597 --- db/corruption_test.cc | 4 +++- db/db_impl.cc | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/db/corruption_test.cc b/db/corruption_test.cc index e73725a63..2cea9da65 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -231,7 +231,9 @@ TEST(CorruptionTest, Recovery) { Check(100, 100); Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); + ASSERT_TRUE(!TryReopen().ok()); + options_.paranoid_checks = false; + Reopen(&options_); // The 64 records in the first two log blocks are completely lost.
Check(36, 36); diff --git a/db/db_impl.cc b/db/db_impl.cc index 2764c8cfd..f381fd3ef 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -905,7 +905,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, std::string scratch; Slice record; WriteBatch batch; - while (reader.ReadRecord(&record, &scratch)) { + while (reader.ReadRecord(&record, &scratch) && status.ok()) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); From fa0b126c0ca60e593eee4a8e833be8b724a3499b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 5 Jan 2015 10:49:41 -0800 Subject: [PATCH 659/829] Fix corruption_test -- if status is not OK, return status -- during recovery --- db/db_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index f381fd3ef..e529db3c7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -955,6 +955,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } } + if (!status.ok()) { + return status; + } + flush_scheduler_.Clear(); if (versions_->LastSequence() < *max_sequence) { versions_->SetLastSequence(*max_sequence); From 62ad0a9b19f0be4cefa70b6b32876e764b7f3c11 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 5 Jan 2015 13:35:56 -0800 Subject: [PATCH 660/829] Deprecating skip_log_error_on_recovery Summary: Since https://reviews.facebook.net/D16119, we ignore partial tailing writes. Because of that, we no longer need skip_log_error_on_recovery. The documentation says "Skip log corruption error on recovery (If client is ok with losing most recent changes)", while the option actually ignores any corruption of the WAL (not only just the most recent changes). This is very dangerous and can lead to DB inconsistencies. This was originally set up to ignore partial tailing writes, which we now do automatically (after D16119). I have digged up old task t2416297 which confirms my findings. Test Plan: There was actually no tests that verified correct behavior of skip_log_error_on_recovery. Reviewers: yhchiang, rven, dhruba, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30603 --- HISTORY.md | 3 +++ db/db_impl.cc | 8 ++------ include/rocksdb/options.h | 4 +--- util/options.cc | 2 -- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 49fc56df8..245f4ec61 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,9 @@ numbered levels will be placed later in the db_paths vector. 
* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) +### Public API changes +* Deprecated skip_log_error_on_recovery option + ### 3.9.0 (12/8/2014) ### New Features diff --git a/db/db_impl.cc b/db/db_impl.cc index e529db3c7..dfe66eeab 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -842,8 +842,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, Env* env; Logger* info_log; const char* fname; - Status* status; // nullptr if db_options_.paranoid_checks==false or - // db_options_.skip_log_error_on_recovery==true + Status* status; // nullptr if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { Log(InfoLogLevel::WARN_LEVEL, info_log, "%s%s: dropping %d bytes; %s", @@ -888,10 +887,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, reporter.env = env_; reporter.info_log = db_options_.info_log.get(); reporter.fname = fname.c_str(); - reporter.status = - (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery - ? &status - : nullptr); + reporter.status = (db_options_.paranoid_checks) ? &status : nullptr; // We intentially make log::Reader do checksumming even if // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 298ec6aee..75625abcc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -850,9 +850,7 @@ struct DBOptions { // Disable child process inherit open files. Default: true bool is_fd_close_on_exec; - // Skip log corruption error on recovery (If client is ok with - // losing most recent changes) - // Default: false + // DEPRECATED -- this options is no longer used bool skip_log_error_on_recovery; // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec diff --git a/util/options.cc b/util/options.cc index 085df053d..75307f13f 100644 --- a/util/options.cc +++ b/util/options.cc @@ -333,8 +333,6 @@ void DBOptions::Dump(Logger* log) const { allow_mmap_writes); Log(log, " Options.is_fd_close_on_exec: %d", is_fd_close_on_exec); - Log(log, " Options.skip_log_error_on_recovery: %d", - skip_log_error_on_recovery); Log(log, " Options.stats_dump_period_sec: %u", stats_dump_period_sec); Log(log, " Options.advise_random_on_open: %d", From 9d5bd411be284c36d2758cf7d1ec731a783d029e Mon Sep 17 00:00:00 2001 From: Leonidas Galanis Date: Mon, 5 Jan 2015 15:36:47 -0800 Subject: [PATCH 661/829] benchmark.sh won't run through all tests properly if one specifies wal_dir to be different than db directory. Summary: A command line like this to run all the tests: source benchmark.config.sh && nohup ./benchmark.sh 'bulkload,fillseq,overwrite,filluniquerandom,readrandom,readwhilewriting' where benchmark.config.sh is: export DB_DIR=/data/mysql/rocksdata export WAL_DIR=/txlogs/rockswal export OUTPUT_DIR=/root/rocks_benchmarking/output Will fail for the tests that need a new DB . Also 1) set disable_data_sync=0 and 2) add debug mode to run through all the tests more quickly Test Plan: run ./benchmark.sh 'debug,bulkload,fillseq,overwrite,filluniquerandom,readrandom,readwhilewriting' and verify that there are no complaints about WAL dir not being empty. 
Reviewers: sdong, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D30909 --- db/db_bench.cc | 6 +++++- tools/benchmark.sh | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 3fc17d7b7..8e5d07a59 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1353,7 +1353,11 @@ class Benchmark { } } if (!FLAGS_use_existing_db) { - DestroyDB(FLAGS_db, Options()); + Options options; + if (!FLAGS_wal_dir.empty()) { + options.wal_dir = FLAGS_wal_dir; + } + DestroyDB(FLAGS_db, options); } } diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 2ea300d32..135209384 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -93,7 +93,7 @@ function run_bulkload { --num=$num_keys \ --disable_auto_compactions=1 \ --sync=0 \ - --disable_data_sync=1 \ + --disable_data_sync=0 \ --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log eval $cmd @@ -103,7 +103,7 @@ function run_bulkload { --num=$num_keys \ --disable_auto_compactions=1 \ --sync=0 \ - --disable_data_sync=1 \ + --disable_data_sync=0 \ --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" echo $cmd | tee $output_dir/benchmark_bulkload_compact.log eval $cmd @@ -197,7 +197,11 @@ echo "===== Benchmark =====" # Run!!! IFS=',' read -a jobs <<< $1 for job in ${jobs[@]}; do - echo "Start $job at `date`" | tee -a $report + + if [ $job != debug ]; then + echo "Start $job at `date`" | tee -a $report + fi + start=$(now) if [ $job = bulkload ]; then run_bulkload @@ -213,13 +217,19 @@ for job in ${jobs[@]}; do run_readwhilewriting elif [ $job = rangescanwhilewriting ]; then run_rangescanwhilewriting + elif [ $job = debug ]; then + num_keys=10000; # debug + echo "Setting num_keys to $num_keys" else echo "unknown job $job" exit fi end=$(now) - echo "Complete $job in $((end-start)) seconds" | tee -a $report + if [ $job != debug ]; then + echo "Complete $job in $((end-start)) seconds" | tee -a $report + fi + if [[ $job = readrandom || $job = readwhilewriting || $job == rangescanwhilewriting ]]; then lat=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $3}') qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') From 07aa4e0e35831a29a68e82747b271cca1d5058f6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 5 Jan 2015 17:32:49 -0800 Subject: [PATCH 662/829] Fix compaction summary log for trivial move Summary: When trivial move commit is done, we log the summary of the input version instead of current. This is inconsistent with other log messages and confusing. 
Test Plan: compiles Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30939 --- db/db_impl.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index dfe66eeab..7350d5729 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2163,11 +2163,12 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, VersionStorageInfo::LevelSummaryStorage tmp; c->column_family_data()->internal_stats()->IncBytesMoved( c->level() + 1, f->fd.GetFileSize()); - LogToBuffer(log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 - " bytes %s: %s\n", - c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), - c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), - c->input_version()->storage_info()->LevelSummary(&tmp)); + LogToBuffer( + log_buffer, + "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), + c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; } else { From 7731d51c824e70c316eed3a07677163cddf47b61 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 6 Jan 2015 12:44:21 -0800 Subject: [PATCH 663/829] Simplify column family concurrency Summary: This patch changes concurrency guarantees around ColumnFamilySet::column_families_ and ColumnFamilySet::column_families_data_. Before: * When mutating: lock DB mutex and spin lock * When reading: lock DB mutex OR spin lock After: * When mutating: lock DB mutex and be in write thread * When reading: lock DB mutex or be in write thread That way, we eliminate the spin lock that protects these hash maps and simplify concurrency. That means we don't need to lock the spin lock during writing, since writing is mutually exclusive with column family create/drop (the only operations that mutate those hash maps). With these new restrictions, I also needed to move column family create to the write thread (column family drop was already in the write thread). Even though we don't need to lock the spin lock during write, impact on performance should be minimal -- the spin lock is almost never busy, so locking it is almost free. This addresses task t5116919. 
Test Plan: make check Stress test with lots and lots of column family drop and create: time ./db_stress --threads=30 --ops_per_thread=5000000 --max_key=5000 --column_families=200 --clear_column_family_one_in=100000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress/ Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30651 --- db/column_family.cc | 45 +++++++++++++++---------------------- db/column_family.h | 54 ++++++++++++++++++++++++--------------------- db/db_impl.cc | 14 +++++++++--- db/write_batch.cc | 9 ++++++++ 4 files changed, 67 insertions(+), 55 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 8a5c4a01f..19bb09564 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -306,9 +306,10 @@ ColumnFamilyData::~ColumnFamilyData() { prev->next_ = next; next->prev_ = prev; - // it's nullptr for dummy CFD - if (column_family_set_ != nullptr) { - // remove from column_family_set + if (!dropped_ && column_family_set_ != nullptr) { + // If it's dropped, it's already removed from column family set + // If column_family_set_ == nullptr, this is dummy CFD and not in + // ColumnFamilySet column_family_set_->RemoveColumnFamily(this); } @@ -353,6 +354,16 @@ ColumnFamilyData::~ColumnFamilyData() { } } +void ColumnFamilyData::SetDropped() { + // can't drop default CF + assert(id_ != 0); + dropped_ = true; + write_controller_token_.reset(); + + // remove from column_family_set + column_family_set_->RemoveColumnFamily(this); +} + void ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { @@ -635,8 +646,7 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, env_options_(env_options), table_cache_(table_cache), write_buffer_(write_buffer), - write_controller_(write_controller), - spin_lock_(ATOMIC_FLAG_INIT) { + write_controller_(write_controller) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -693,7 +703,7 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const { return column_families_.size(); } -// under a DB mutex +// under a DB mutex AND write thread ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { @@ -702,10 +712,8 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( new ColumnFamilyData(id, name, dummy_versions, table_cache_, write_buffer_, options, db_options_, env_options_, this); - Lock(); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); - Unlock(); max_column_family_ = std::max(max_column_family_, id); // add to linked list new_cfd->next_ = dummy_cfd_; @@ -719,14 +727,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( return new_cfd; } -void ColumnFamilySet::Lock() { - // spin lock - while (spin_lock_.test_and_set(std::memory_order_acquire)) { - } -} - -void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); } - // REQUIRES: DB mutex held void ColumnFamilySet::FreeDeadColumnFamilies() { autovector to_delete; @@ -741,30 +741,21 @@ void ColumnFamilySet::FreeDeadColumnFamilies() { } } -// under a DB mutex +// under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); assert(cfd_iter != 
column_family_data_.end()); - Lock(); column_family_data_.erase(cfd_iter); column_families_.erase(cfd->GetName()); - Unlock(); } +// under a DB mutex OR from a write thread bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { if (column_family_id == 0) { // optimization for common case current_ = column_family_set_->GetDefault(); } else { - // maybe outside of db mutex, should lock - column_family_set_->Lock(); current_ = column_family_set_->GetColumnFamily(column_family_id); - column_family_set_->Unlock(); - // TODO(icanadi) Maybe remove column family from the hash table when it's - // dropped? - if (current_ != nullptr && current_->IsDropped()) { - current_ = nullptr; - } } handle_.SetCFD(current_); return current_ != nullptr; diff --git a/db/column_family.h b/db/column_family.h index 8cf66a0c0..1c987a3f0 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -123,8 +123,7 @@ extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, class ColumnFamilySet; -// This class keeps all the data that a column family needs. It's mosly dumb and -// used just to provide access to metadata. +// This class keeps all the data that a column family needs. // Most methods require DB mutex held, unless otherwise noted class ColumnFamilyData { public: @@ -145,7 +144,10 @@ class ColumnFamilyData { return --refs_ == 0; } - // This can only be called from single-threaded VersionSet::LogAndApply() + // SetDropped() can only be called under following conditions: + // 1) Holding a DB mutex, + // 2) from single-threaded write thread, AND + // 3) from single-threaded VersionSet::LogAndApply() // After dropping column family no other operation on that column family // will be executed. All the files and memory will be, however, kept around // until client drops the column family handle. That way, client can still @@ -153,17 +155,12 @@ class ColumnFamilyData { // Column family can be dropped and still alive. In that state: // *) Column family is not included in the iteration. // *) Compaction and flush is not executed on the dropped column family. - // *) Client can continue writing and reading from column family. However, all - // writes stay in the current memtable. + // *) Client can continue reading from column family. Writes will fail unless + // WriteOptions::ignore_missing_column_families is true // When the dropped column family is unreferenced, then we: // *) delete all memory associated with that column family // *) delete all the files associated with that column family - void SetDropped() { - // can't drop default CF - assert(id_ != 0); - dropped_ = true; - write_controller_token_.reset(); - } + void SetDropped(); bool IsDropped() const { return dropped_; } // thread-safe @@ -348,18 +345,21 @@ class ColumnFamilyData { }; // ColumnFamilySet has interesting thread-safety requirements -// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB -// mutex. Inside, column_family_data_ and column_families_ will be protected -// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from -// VersionSet::LogAndApply() in the normal runtime. It is also called -// during Recovery and in DumpManifest(). RemoveColumnFamily() is called -// from ColumnFamilyData destructor +// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB +// mutex AND executed in the write thread. +// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND +// single-threaded write thread. 
It is also called during Recovery and in +// DumpManifest(). +// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be +// held and it needs to be executed from the write thread. SetDropped() also +// guarantees that it will be called only from single-threaded LogAndApply(), +// but this condition is not that important. // * Iteration -- hold DB mutex, but you can release it in the body of // iteration. If you release DB mutex in body, reference the column // family before the mutex and unreference after you unlock, since the column // family might get dropped when the DB mutex is released // * GetDefault() -- thread safe -// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock() +// * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), // NumberOfColumnFamilies -- inside of DB mutex class ColumnFamilySet { @@ -410,9 +410,6 @@ class ColumnFamilySet { iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - void Lock(); - void Unlock(); - // REQUIRES: DB mutex held // Don't call while iterating over ColumnFamilySet void FreeDeadColumnFamilies(); @@ -424,9 +421,12 @@ class ColumnFamilySet { void RemoveColumnFamily(ColumnFamilyData* cfd); // column_families_ and column_family_data_ need to be protected: - // * when mutating: 1. DB mutex locked first, 2. spinlock locked second - // * when reading, either: 1. lock DB mutex, or 2. lock spinlock - // (if both, respect the ordering to avoid deadlock!) + // * when mutating both conditions have to be satisfied: + // 1. DB mutex locked + // 2. thread currently in single-threaded write thread + // * when reading, at least one condition needs to be satisfied: + // 1. DB mutex locked + // 2. 
accessed from a single-threaded write thread std::unordered_map<std::string, uint32_t> column_families_; std::unordered_map<uint32_t, ColumnFamilyData*> column_family_data_; @@ -444,7 +444,6 @@ class ColumnFamilySet { Cache* table_cache_; WriteBuffer* write_buffer_; WriteController* write_controller_; - std::atomic_flag spin_lock_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access @@ -459,17 +458,22 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist + // REQUIRES: under a DB mutex OR from a write thread bool Seek(uint32_t column_family_id) override; // Returns log number of the selected column family + // REQUIRES: under a DB mutex OR from a write thread uint64_t GetLogNumber() const override; // REQUIRES: Seek() called first + // REQUIRES: under a DB mutex OR from a write thread virtual MemTable* GetMemTable() const override; // Returns column family handle for the selected column family + // REQUIRES: under a DB mutex OR from a write thread virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + // REQUIRES: under a DB mutex OR from a write thread virtual void CheckMemtableFull() override; private: diff --git a/db/db_impl.cc b/db/db_impl.cc index 7350d5729..412146a3e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2579,9 +2579,17 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object Options opt(db_options_, cf_options); - s = versions_->LogAndApply(nullptr, - MutableCFOptions(opt, ImmutableCFOptions(opt)), - &edit, &mutex_, db_directory_.get(), false, &cf_options); + { // write thread + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + s = versions_->LogAndApply( + nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, + &mutex_, db_directory_.get(), false, &cf_options); + write_thread_.ExitWriteThread(&w, &w, s); + } if (s.ok()) { single_column_family_mode_ = false; auto* cfd = diff --git a/db/write_batch.cc b/db/write_batch.cc index 386e7ce1f..285a1b37d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -280,6 +280,8 @@ void WriteBatch::PutLogData(const Slice& blob) { } namespace { +// This class can *only* be used from a single-threaded write thread, because it +// calls ColumnFamilyMemTablesImpl::Seek() class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; @@ -305,6 +307,8 @@ class MemTableInserter : public WriteBatch::Handler { } bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { + // We are only allowed to call this from a single-threaded write thread + // (or while holding DB mutex) bool found = cf_mems_->Seek(column_family_id); if (!found) { if (ignore_missing_column_families_) { @@ -485,6 +489,11 @@ class MemTableInserter : public WriteBatch::Handler { }; } // namespace +// This function can only be called in these conditions: +// 1) During Recovery() +// 2) during Write(), in a single-threaded write thread +// The reason is that it calls ColumnFamilyMemTablesImpl::Seek(), which needs +// to be called from a single-threaded write thread (or while holding DB mutex) Status WriteBatchInternal::InsertInto(const WriteBatch* b, ColumnFamilyMemTables* memtables, bool ignore_missing_column_families, From
4d16a9a633ab4ee3c7e9dbbe3d211e1af629ae4e Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 7 Jan 2015 10:29:21 -0800 Subject: [PATCH 664/829] VersionBuilder to optimize for applying a later edit deleting files added by previous edits Summary: During recovery, VersionBuilder::Apply() was called multiple times. If the DB is open for long enough, most of files added earlier will be deleted by later deletes. In current solution, sorting added file happens first and then deletes are applied. In this patch, deletes are applied when possible inside Apply(), which can significantly reduce the sorting time in some cases. Test Plan: Add unit tests in version_builder valgrind_check Open a manifest of 50MB, with 9K live files. The manifest read time reduced from 1.6 seconds to 0.7 seconds. Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30765 --- db/version_builder.cc | 29 +++++++++----- db/version_builder_test.cc | 78 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 10 deletions(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index e282e670c..3a4143b9e 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -101,22 +101,25 @@ class VersionBuilder::Rep { for (int level = 0; level < base_vstorage_->num_levels(); level++) { const auto& added = levels_[level].added_files; for (auto& pair : added) { - FileMetaData* f = pair.second; - f->refs--; - if (f->refs <= 0) { - if (f->table_reader_handle) { - assert(table_cache_ != nullptr); - table_cache_->ReleaseHandle(f->table_reader_handle); - f->table_reader_handle = nullptr; - } - delete f; - } + UnrefFile(pair.second); } } delete[] levels_; } + void UnrefFile(FileMetaData* f) { + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + assert(table_cache_ != nullptr); + table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + delete f; + } + } + void CheckConsistency(VersionStorageInfo* vstorage) { #ifndef NDEBUG // make sure the files are sorted correctly @@ -199,6 +202,12 @@ class VersionBuilder::Rep { const auto number = del_file.second; levels_[level].deleted_files.insert(number); CheckConsistencyForDeletes(edit, number, level); + + auto exising = levels_[level].added_files.find(number); + if (exising != levels_[level].added_files.end()) { + UnrefFile(exising->second); + levels_[level].added_files.erase(number); + } } // Add new files diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 1373e2f88..5da73cbc3 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -126,6 +126,84 @@ TEST(VersionBuilderTest, ApplyAndSaveTo) { } } +TEST(VersionBuilderTest, ApplyMultipleAndSaveTo) { + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200); + + EnvOptions env_options; + + VersionBuilder version_builder(env_options, nullptr, &vstorage_); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr); + 
version_builder.Apply(&version_edit); + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2)); + + for (int i = 0; i < new_vstorage.num_levels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + +TEST(VersionBuilderTest, ApplyDeleteAndSaveTo) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200); + version_builder.Apply(&version_edit); + + VersionEdit version_edit2; + version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), + GetInternalKey("950"), 200, 200); + version_edit2.DeleteFile(2, 616); + version_edit2.DeleteFile(2, 636); + version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), + GetInternalKey("850"), 200, 200); + version_builder.Apply(&version_edit2); + + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2)); + + for (int i = 0; i < new_vstorage.num_levels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + TEST(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; From 9ef59a09a50c61e4743fc7edac6a660853cbd9fd Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 7 Jan 2015 10:43:29 -0800 Subject: [PATCH 665/829] VersionSet::AddLiveFiles() to assert current version is included. Summary: Add an extra assert to make sure current version is included in VersionSet::AddLiveFiles(). Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, hermanlee4, leveldb Differential Revision: https://reviews.facebook.net/D30819 --- db/version_set.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 0dbac7667..b206fe5b3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2636,16 +2636,21 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { live_list->reserve(live_list->size() + static_cast(total_files)); for (auto cfd : *column_family_set_) { + auto* current = cfd->current(); + bool found_current = false; Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - const auto* vstorage = v->storage_info(); - for (int level = 0; level < vstorage->num_levels(); level++) { - for (const auto& f : vstorage->LevelFiles(level)) { - live_list->push_back(f->fd); - } + v->AddLiveFiles(live_list); + if (v == current) { + found_current = true; } } + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. 
+ assert(false); + current->AddLiveFiles(live_list); + } } } From 4b57d9a820aacb38bc3be39903812bf233ad1097 Mon Sep 17 00:00:00 2001 From: stash93 Date: Thu, 8 Jan 2015 01:03:51 +0300 Subject: [PATCH 666/829] Fixed negative numbers comparison in DocumentDB --- utilities/document/document_db.cc | 7 ++-- utilities/document/document_db_test.cc | 44 ++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 6540c2d8c..04d88714b 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -312,8 +312,11 @@ bool EncodeJSONPrimitive(const JSONDocument& json, std::string* dst) { break; case JSONDocument::kInt64: dst->push_back(kInt64); - // TODO(icanadi) oops, this will not work correctly for negative numbers - PutFixed64(dst, static_cast(json.GetInt64())); + { + auto val = json.GetInt64(); + dst->push_back((val < 0) ? '0' : '1'); + PutFixed64(dst, static_cast(val)); + } break; case JSONDocument::kString: dst->push_back(kString); diff --git a/utilities/document/document_db_test.cc b/utilities/document/document_db_test.cc index 5b36a2060..bacef9a50 100644 --- a/utilities/document/document_db_test.cc +++ b/utilities/document/document_db_test.cc @@ -164,7 +164,9 @@ TEST(DocumentDBTest, ComplexQueryTest) { "{'_id': 8, 'job_name': 'rock', 'priority': 3, 'progress': 93.24}", "{'_id': 9, 'job_name': 'steady', 'priority': 3, 'progress': 9.1}", "{'_id': 10, 'job_name': 'white', 'priority': 1, 'progress': 61.4}", - "{'_id': 11, 'job_name': 'who', 'priority': 4, 'progress': 39.41}", }; + "{'_id': 11, 'job_name': 'who', 'priority': 4, 'progress': 39.41}", + "{'_id': 12, 'job_name': 'who', 'priority': -1, 'progress': 39.42}", + "{'_id': 13, 'job_name': 'who', 'priority': -2, 'progress': 39.42}", }; // add index on the fly! 
CreateIndexes({job_name_index}); @@ -185,6 +187,15 @@ TEST(DocumentDBTest, ComplexQueryTest) { AssertCursorIDs(cursor.get(), {4, 8}); } + // -1 <= priority <= 1, index priority + { + std::unique_ptr query(Parse( + "[{'$filter': {'priority': {'$lte': 1, '$gte': -1}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + AssertCursorIDs(cursor.get(), {6, 10, 12}); + } + // 2 < priority < 4 AND progress > 10.0, index progress { std::unique_ptr query(Parse( @@ -209,7 +220,7 @@ TEST(DocumentDBTest, ComplexQueryTest) { "[{'$filter': {'progress': {'$gt': 5.0, '$gte': 35.0, '$lt': 65.5}, " "'$index': 'progress'}}]")); std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); - AssertCursorIDs(cursor.get(), {2, 5, 10, 11}); + AssertCursorIDs(cursor.get(), {2, 5, 10, 11, 12, 13}); } // 2 < priority <= 4, index priority @@ -244,6 +255,35 @@ TEST(DocumentDBTest, ComplexQueryTest) { ASSERT_OK(db_->Update(ReadOptions(), WriteOptions(), *query, *update)); } + // priority < 0 + { + std::unique_ptr query( + Parse("[{'$filter': {'priority': {'$lt': 0}, '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12, 13}); + } + + // -2 < priority < 0 + { + std::unique_ptr query( + Parse("[{'$filter': {'priority': {'$gt': -2, '$lt': 0}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12}); + } + + // -2 <= priority < 0 + { + std::unique_ptr query( + Parse("[{'$filter': {'priority': {'$gte': -2, '$lt': 0}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12, 13}); + } + // 4 < priority { std::unique_ptr query( From 73ee4febab82111db135300fce0d24c013e99883 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 7 Jan 2015 11:47:32 -0800 Subject: [PATCH 667/829] Add comments about properties supported by DB::GetProperty() and DB::GetIntProperty() Summary: Add comments in db.h to help users discover their options. Test Plan: Compile Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: MarkCallaghan, yoshinorim, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31077 --- db/internal_stats.h | 2 ++ include/rocksdb/db.h | 32 +++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/db/internal_stats.h b/db/internal_stats.h index 96c13e03b..702008032 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -21,6 +21,8 @@ namespace rocksdb { class MemTableList; class DBImpl; +// IMPORTANT: If you add a new property here, also add it to the list in +// include/rocksdb/db.h enum DBPropertyType : uint32_t { kUnknown, kNumFilesAtLevel, // Number of files at a specific level diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index a8cb694b4..a519db7f6 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -304,6 +304,22 @@ class DB { // about the internal operation of the DB. // "rocksdb.sstables" - returns a multi-line string that describes all // of the sstables that make up the db contents. 
+ // "rocksdb.cfstats" + // "rocksdb.dbstats" + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" - 1 if at least one compaction is pending + // "rocksdb.background-errors" - accumulated number of background errors + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.estimate-num-keys" - estimated keys in the column family + // "rocksdb.estimate-table-readers-mem" - estimated memory used for reding + // SST tables, that is not counted as a part of block cache. + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { @@ -311,7 +327,21 @@ class DB { } // Similar to GetProperty(), but only works for a subset of properties whose - // return value is an integer. Return the value by integer. + // return value is an integer. Return the value by integer. Supported + // properties: + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" + // "rocksdb.background-errors" + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.estimate-num-keys" + // "rocksdb.estimate-table-readers-mem" + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { From 242b9769c39c378382e653c6e258af5accbfab94 Mon Sep 17 00:00:00 2001 From: Ameya Gupte Date: Wed, 7 Jan 2015 15:15:30 -0800 Subject: [PATCH 668/829] Memtablerep Benchmark Summary: Create a benchmark for testing memtablereps. This diff is a bit rough, but it should do the trick until other bootcampers can clean it up. Addressing comments Removed the mutexes Changed ReadWriteBenchmark to fix number of reads and count the number of writes we can perform in that time. Test Plan: Run it. 
Below runs pass ./memtablerep_bench --benchmarks fillrandom,readrandom --memtablerep skiplist ./memtablerep_bench --benchmarks fillseq,readseq --memtablerep skiplist ./memtablerep_bench --benchmarks readwrite,seqreadwrite --memtablerep skiplist --num_operations 200 --num_threads 5 ./memtablerep_bench --benchmarks fillrandom,readrandom --memtablerep hashskiplist ./memtablerep_bench --benchmarks fillseq,readseq --memtablerep hashskiplist --num_scans 2 ./memtablerep_bench --benchmarks fillseq,readseq --memtablerep vector Reviewers: jpaton, ikabiljo, sdong Reviewed By: sdong Subscribers: dhruba, ameyag Differential Revision: https://reviews.facebook.net/D22683 --- Makefile | 5 +- db/memtablerep_bench.cc | 695 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 699 insertions(+), 1 deletion(-) create mode 100644 db/memtablerep_bench.cc diff --git a/Makefile b/Makefile index d84eb4fa0..9dab353e3 100644 --- a/Makefile +++ b/Makefile @@ -179,7 +179,7 @@ TOOLS = \ options_test \ blob_store_bench -PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test memtablerep_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. @@ -330,6 +330,9 @@ db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/memtablerep_bench.cc b/db/memtablerep_bench.cc new file mode 100644 index 000000000..a24eca010 --- /dev/null +++ b/db/memtablerep_bench.cc @@ -0,0 +1,695 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#define __STDC_FORMAT_MACROS + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/writebuffer.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/testutil.h" + +using GFLAGS::ParseCommandLineFlags; +using GFLAGS::RegisterFlagValidator; +using GFLAGS::SetUsageMessage; + +DEFINE_string(benchmarks, "fillrandom", + "Comma-separated list of benchmarks to run. 
Options:\n" + "\tfillrandom -- write N random values\n" + "\tfillseq -- write N values in sequential order\n" + "\treadrandom -- read N values in random order\n" + "\treadseq -- scan the DB\n" + "\treadwrite -- 1 thread writes while N - 1 threads " + "do random\n" + "\t reads\n" + "\tseqreadwrite -- 1 thread writes while N - 1 threads " + "do scans\n"); + +DEFINE_string(memtablerep, "skiplist", + "Which implementation of memtablerep to use. See " + "include/memtablerep.h for\n" + " more details. Options:\n" + "\tskiplist -- backed by a skiplist\n" + "\tvector -- backed by an std::vector\n" + "\thashskiplist -- backed by a hash skip list\n" + "\thashlinklist -- backed by a hash linked list\n" + "\tcuckoo -- backed by a cuckoo hash table"); + +DEFINE_int64(bucket_count, 1000000, + "bucket_count parameter to pass into NewHashSkiplistRepFactory or " + "NewHashLinkListRepFactory"); + +DEFINE_int32( + hashskiplist_height, 4, + "skiplist_height parameter to pass into NewHashSkiplistRepFactory"); + +DEFINE_int32( + hashskiplist_branching_factor, 4, + "branching_factor parameter to pass into NewHashSkiplistRepFactory"); + +DEFINE_int32( + huge_page_tlb_size, 0, + "huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory"); + +DEFINE_int32(bucket_entries_logging_threshold, 4096, + "bucket_entries_logging_threshold parameter to pass into " + "NewHashLinkListRepFactory"); + +DEFINE_bool(if_log_bucket_dist_when_flash, true, + "if_log_bucket_dist_when_flash parameter to pass into " + "NewHashLinkListRepFactory"); + +DEFINE_int32( + threshold_use_skiplist, 256, + "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory"); + +DEFINE_int64( + write_buffer_size, 256, + "write_buffer_size parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int64( + average_data_size, 64, + "average_data_size parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int64( + hash_function_count, 4, + "hash_function_count parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int32( + num_threads, 1, + "Number of concurrent threads to run. If the benchmark includes writes,\n" + "then at most one thread will be a writer"); + +DEFINE_int32(num_operations, 1000000, + "Number of operations to do for write and random read benchmarks"); + +DEFINE_int32(num_scans, 10, + "Number of times for each thread to scan the memtablerep for " + "sequential read " + "benchmarks"); + +DEFINE_int32(item_size, 100, "Number of bytes each item should be"); + +DEFINE_int32(prefix_length, 8, + "Prefix length to pass into NewFixedPrefixTransform"); + +/* VectorRep settings */ +DEFINE_int64(vectorrep_count, 0, + "Number of entries to reserve on VectorRep initialization"); + +DEFINE_int64(seed, 0, + "Seed base for random number generators. " + "When 0 it is deterministic."); + +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +namespace rocksdb { + +namespace { +struct CallbackVerifyArgs { + bool found; + LookupKey* key; + MemTableRep* table; + InternalKeyComparator* comparator; +}; +} // namespace + +// Helper for quickly generating random data. 
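+// It fills a buffer of at least 1 MiB (or item_size, if larger) with
+// pseudo-random bytes once up front and then hands out overlapping slices
+// of it, so each generated value costs a position bump rather than a fresh
+// RNG draw per operation.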
+class RandomGenerator { + private: + std::string data_; + unsigned int pos_; + + public: + RandomGenerator() { + Random rnd(301); + auto size = (unsigned)std::max(1048576, FLAGS_item_size); + test::RandomString(&rnd, size, &data_); + pos_ = 0; + } + + Slice Generate(unsigned int len) { + assert(len <= data_.size()); + if (pos_ + len > data_.size()) { + pos_ = 0; + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM }; + +class KeyGenerator { + public: + KeyGenerator(Random64* rand, WriteMode mode, uint64_t num) + : rand_(rand), mode_(mode), num_(num), next_(0) { + if (mode_ == UNIQUE_RANDOM) { + // NOTE: if memory consumption of this approach becomes a concern, + // we can either break it into pieces and only random shuffle a section + // each time. Alternatively, use a bit map implementation + // (https://reviews.facebook.net/differential/diff/54627/) + values_.resize(num_); + for (uint64_t i = 0; i < num_; ++i) { + values_[i] = i; + } + std::shuffle(values_.begin(), values_.end(), + std::default_random_engine(FLAGS_seed)); + } + } + + uint64_t Next() { + switch (mode_) { + case SEQUENTIAL: + return next_++; + case RANDOM: + return rand_->Next() % num_; + case UNIQUE_RANDOM: + return values_[next_++]; + } + assert(false); + return std::numeric_limits::max(); + } + + private: + Random64* rand_; + WriteMode mode_; + const uint64_t num_; + uint64_t next_; + std::vector values_; +}; + +class BenchmarkThread { + public: + explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits) + : table_(table), + key_gen_(key_gen), + bytes_written_(bytes_written), + bytes_read_(bytes_read), + sequence_(sequence), + num_ops_(num_ops), + read_hits_(read_hits) {} + + virtual void operator()() = 0; + virtual ~BenchmarkThread() {} + + protected: + MemTableRep* table_; + KeyGenerator* key_gen_; + uint64_t* bytes_written_; + uint64_t* bytes_read_; + uint64_t* sequence_; + uint64_t num_ops_; + uint64_t* read_hits_; + RandomGenerator generator_; +}; + +class FillBenchmarkThread : public BenchmarkThread { + public: + FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + void FillOne() { + char* buf = nullptr; + auto internal_key_size = 16; + auto encoded_len = + FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size; + KeyHandle handle = table_->Allocate(encoded_len, &buf); + assert(buf != nullptr); + char* p = EncodeVarint32(buf, internal_key_size); + auto key = key_gen_->Next(); + EncodeFixed64(p, key); + p += 8; + EncodeFixed64(p, ++(*sequence_)); + p += 8; + Slice bytes = generator_.Generate(FLAGS_item_size); + memcpy(p, bytes.data(), FLAGS_item_size); + p += FLAGS_item_size; + assert(p == buf + encoded_len); + table_->Insert(handle); + *bytes_written_ += encoded_len; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + FillOne(); + } + } +}; + +class ConcurrentFillBenchmarkThread : public FillBenchmarkThread { + public: + ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits, + std::atomic_int* threads_done) + : 
FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + // # of read threads will be total threads - write threads (always 1). Loop + // while all reads complete. + while ((*threads_done_).load() < (FLAGS_num_threads - 1)) { + FillOne(); + } + } + + private: + std::atomic_int* threads_done_; +}; + +class ReadBenchmarkThread : public BenchmarkThread { + public: + ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + static bool callback(void* arg, const char* entry) { + CallbackVerifyArgs* callback_args = static_cast(arg); + assert(callback_args != nullptr); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if ((callback_args->comparator)->user_comparator()->Compare( + Slice(key_ptr, key_length - 8), callback_args->key->user_key()) == + 0) { + callback_args->found = true; + } + return false; + } + + void ReadOne() { + std::string user_key; + auto key = key_gen_->Next(); + PutFixed64(&user_key, key); + LookupKey lookup_key(user_key, *sequence_); + InternalKeyComparator internal_key_comp(BytewiseComparator()); + CallbackVerifyArgs verify_args; + verify_args.found = false; + verify_args.key = &lookup_key; + verify_args.table = table_; + verify_args.comparator = &internal_key_comp; + table_->Get(lookup_key, &verify_args, callback); + if (verify_args.found) { + *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size; + ++*read_hits_; + } + } + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOne(); + } + } +}; + +class SeqReadBenchmarkThread : public BenchmarkThread { + public: + SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + void ReadOneSeq() { + std::unique_ptr iter(table_->GetIterator()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + // pretend to read the value + *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size; + } + ++*read_hits_; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + { ReadOneSeq(); } + } + } +}; + +class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread { + public: + ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits, + std::atomic_int* threads_done) + : ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOne(); + } + ++*threads_done_; + } + + private: + std::atomic_int* threads_done_; +}; + +class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread { + public: + SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, + uint64_t* bytes_read, uint64_t* sequence, + uint64_t num_ops, uint64_t* read_hits, + std::atomic_int* threads_done) + : SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, + sequence, 
num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOneSeq(); + } + ++*threads_done_; + } + + private: + std::atomic_int* threads_done_; +}; + +class Benchmark { + public: + explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence, uint32_t num_threads) + : table_(table), + key_gen_(key_gen), + sequence_(sequence), + num_threads_(num_threads) {} + + virtual ~Benchmark() {} + virtual void Run() { + std::cout << "Number of threads: " << num_threads_ << std::endl; + std::vector threads; + uint64_t bytes_written = 0; + uint64_t bytes_read = 0; + uint64_t read_hits = 0; + StopWatchNano timer(Env::Default(), true); + RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); + auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); + std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" + << std::endl; + + if (bytes_written > 0) { + auto MiB_written = static_cast(bytes_written) / (1 << 20); + auto write_throughput = MiB_written / (elapsed_time / 1000000); + std::cout << "Total bytes written: " << MiB_written << " MiB" + << std::endl; + std::cout << "Write throughput: " << write_throughput << " MiB/s" + << std::endl; + auto us_per_op = elapsed_time / num_write_ops_per_thread_; + std::cout << "write us/op: " << us_per_op << std::endl; + } + if (bytes_read > 0) { + auto MiB_read = static_cast(bytes_read) / (1 << 20); + auto read_throughput = MiB_read / (elapsed_time / 1000000); + std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl; + std::cout << "Read throughput: " << read_throughput << " MiB/s" + << std::endl; + auto us_per_op = elapsed_time / num_read_ops_per_thread_; + std::cout << "read us/op: " << us_per_op << std::endl; + } + } + + virtual void RunThreads(std::vector* threads, + uint64_t* bytes_written, uint64_t* bytes_read, + bool write, uint64_t* read_hits) = 0; + + protected: + MemTableRep* table_; + KeyGenerator* key_gen_; + uint64_t* sequence_; + uint64_t num_write_ops_per_thread_; + uint64_t num_read_ops_per_thread_; + const uint32_t num_threads_; +}; + +class FillBenchmark : public Benchmark { + public: + explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, 1) { + num_write_ops_per_thread_ = FLAGS_num_operations; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_, + num_write_ops_per_thread_, read_hits)(); + } +}; + +class ReadBenchmark : public Benchmark { + public: + explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads->emplace_back( + ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, + sequence_, num_read_ops_per_thread_, read_hits)); + } + for (auto& thread : *threads) { + thread.join(); + } + std::cout << "read hit%: " + << (static_cast(*read_hits) / FLAGS_num_operations) * 100 + << std::endl; + } +}; + +class SeqReadBenchmark : public Benchmark { + public: + explicit SeqReadBenchmark(MemTableRep* table, 
uint64_t* sequence) + : Benchmark(table, nullptr, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = FLAGS_num_scans; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads->emplace_back(SeqReadBenchmarkThread( + table_, key_gen_, bytes_written, bytes_read, sequence_, + num_read_ops_per_thread_, read_hits)); + } + for (auto& thread : *threads) { + thread.join(); + } + } +}; + +template +class ReadWriteBenchmark : public Benchmark { + public: + explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = + FLAGS_num_threads <= 1 + ? 0 + : (FLAGS_num_operations / (FLAGS_num_threads - 1)); + num_write_ops_per_thread_ = FLAGS_num_operations; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + std::atomic_int threads_done; + threads_done.store(0); + threads->emplace_back(ConcurrentFillBenchmarkThread( + table_, key_gen_, bytes_written, bytes_read, sequence_, + num_write_ops_per_thread_, read_hits, &threads_done)); + for (int i = 1; i < FLAGS_num_threads; ++i) { + threads->emplace_back( + ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_, + num_read_ops_per_thread_, read_hits, &threads_done)); + } + for (auto& thread : *threads) { + thread.join(); + } + } +}; + +} // namespace rocksdb + +void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif +} + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + PrintWarnings(); + + rocksdb::Options options; + + std::unique_ptr factory; + if (FLAGS_memtablerep == "skiplist") { + factory.reset(new rocksdb::SkipListFactory); + } else if (FLAGS_memtablerep == "vector") { + factory.reset(new rocksdb::VectorRepFactory); + } else if (FLAGS_memtablerep == "hashskiplist") { + factory.reset(rocksdb::NewHashSkipListRepFactory( + FLAGS_bucket_count, FLAGS_hashskiplist_height, + FLAGS_hashskiplist_branching_factor)); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else if (FLAGS_memtablerep == "hashlinklist") { + factory.reset(rocksdb::NewHashLinkListRepFactory( + FLAGS_bucket_count, FLAGS_huge_page_tlb_size, + FLAGS_bucket_entries_logging_threshold, + FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist)); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else if (FLAGS_memtablerep == "cuckoo") { + factory.reset(rocksdb::NewHashCuckooRepFactory( + FLAGS_write_buffer_size, FLAGS_average_data_size, + static_cast(FLAGS_hash_function_count))); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else { + fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); + exit(1); + } + + rocksdb::InternalKeyComparator internal_key_comp( + rocksdb::BytewiseComparator()); + rocksdb::MemTable::KeyComparator key_comp(internal_key_comp); + rocksdb::Arena 
arena; + rocksdb::WriteBuffer wb(FLAGS_write_buffer_size); + rocksdb::MemTableAllocator memtable_allocator(&arena, &wb); + uint64_t sequence; + auto createMemtableRep = [&] { + sequence = 0; + return factory->CreateMemTableRep(key_comp, &memtable_allocator, + options.prefix_extractor.get(), + options.info_log.get()); + }; + std::unique_ptr memtablerep; + rocksdb::Random64 rng(FLAGS_seed); + const char* benchmarks = FLAGS_benchmarks.c_str(); + while (benchmarks != nullptr) { + std::unique_ptr key_gen; + const char* sep = strchr(benchmarks, ','); + rocksdb::Slice name; + if (sep == nullptr) { + name = benchmarks; + benchmarks = nullptr; + } else { + name = rocksdb::Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + std::unique_ptr benchmark; + if (name == rocksdb::Slice("fillseq")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("fillrandom")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::UNIQUE_RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("readrandom")) { + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("readseq")) { + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL, + FLAGS_num_operations)); + benchmark.reset( + new rocksdb::SeqReadBenchmark(memtablerep.get(), &sequence)); + } else if (name == rocksdb::Slice("readwrite")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadWriteBenchmark< + rocksdb::ConcurrentReadBenchmarkThread>(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("seqreadwrite")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadWriteBenchmark< + rocksdb::SeqConcurrentReadBenchmarkThread>(memtablerep.get(), + key_gen.get(), &sequence)); + } else { + std::cout << "WARNING: skipping unknown benchmark '" << name.ToString() + << std::endl; + continue; + } + std::cout << "Running " << name.ToString() << std::endl; + benchmark->Run(); + } + + return 0; +} + +#endif // GFLAGS From b89d58dfa3887d561bd772421fb92ef01bd26fc4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 7 Jan 2015 17:26:24 -0800 Subject: [PATCH 669/829] :%s/build_config/make_config Summary: I'm tired of double-tab when opening build_tools/. This change will make bu fully complete my path :) Test Plan: `vi bu` gives me `vi build_tools/` yay! 
Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30639 --- .gitignore | 2 +- Makefile | 6 +++--- examples/Makefile | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index ccbb46b03..70316aebc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ TARGETS -build_config.mk +make_config.mk *.a *.arc diff --git a/Makefile b/Makefile index 9dab353e3..26dde6b45 100644 --- a/Makefile +++ b/Makefile @@ -31,9 +31,9 @@ endif #----------------------------------------------- # detect what platform we're building on -$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk")) +$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) # this file is generated by the previous line to set build flags and sources -include build_config.mk +include make_config.mk ifneq ($(PLATFORM), IOS) CFLAGS += -g @@ -302,7 +302,7 @@ unity: unity.cc unity.o $(CXX) unity.o $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) clean: - -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk unity.cc + -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) make_config.mk unity.cc -rm -rf ios-x86/* ios-arm/* -find . -name "*.[oda]" -exec rm {} \; -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; diff --git a/examples/Makefile b/examples/Makefile index efc5fe30e..7bd88fbf0 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,4 +1,4 @@ -include ../build_config.mk +include ../make_config.mk .PHONY: clean From ef390164244b14d3cecdd87dde1ff2404a6ae86b Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 9 Jan 2015 09:22:49 +0100 Subject: [PATCH 670/829] Fixed memory issue in c_simple_example Valgrind report prior to this fix: ==20829== Memcheck, a memory error detector ==20829== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. 
==20829== Using Valgrind-3.10.0.SVN and LibVEX; rerun with -h for copyright info ==20829== Command: ./c_simple_example ==20829== ==20829== Invalid read of size 1 ==20829== at 0x4C2F1C8: strcmp (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==20829== by 0x422522: main (in /home/user/rocksgit/transfer/rocksdb-git/examples/c_simple_example) ==20829== Address 0x5f60df5 is 0 bytes after a block of size 5 alloc'd ==20829== at 0x4C2AB80: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) ==20829== by 0x4226D5: CopyString (c.cc:498) ==20829== by 0x423032: rocksdb_get (c.cc:730) ==20829== by 0x4224EB: main (in /home/user/rocksgit/transfer/rocksdb-git/examples/c_simple_example) ==20829== ==20829== ==20829== HEAP SUMMARY: ==20829== in use at exit: 77 bytes in 5 blocks ==20829== total heap usage: 4,491 allocs, 4,486 frees, 839,216 bytes allocated ==20829== ==20829== LEAK SUMMARY: ==20829== definitely lost: 5 bytes in 1 blocks ==20829== indirectly lost: 0 bytes in 0 blocks ==20829== possibly lost: 0 bytes in 0 blocks ==20829== still reachable: 72 bytes in 4 blocks ==20829== suppressed: 0 bytes in 0 blocks ==20829== Rerun with --leak-check=full to see details of leaked memory ==20829== ==20829== For counts of detected and suppressed errors, rerun with: -v ==20829== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0) --- examples/c_simple_example.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index 8c6f89e39..2a467fb4e 100644 --- a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -30,7 +30,7 @@ int main(int argc, char **argv) { const char key[] = "key"; const char *value = "value"; rocksdb_put(db, writeoptions, key, strlen (key), value, \ - strlen (value), &err); + strlen (value) + 1, &err); assert(!err); // Get value rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create(); From 628a67b0071f77ce45334601d12269a49db94d95 Mon Sep 17 00:00:00 2001 From: Robert Date: Mon, 5 Jan 2015 21:35:09 +0800 Subject: [PATCH 671/829] Reduce memory footprint in backupable db. * Use emplace when possible. * Make FileInfo shared among all BackupMeta, instead of storing filenames. * Make checksum_value in FileInfo constant. * Reserve space beforehand if container size is known. * Make FileInfo and BackupMeta non-copyable and non-assignable to prevent future logic errors. It is very dangerous to copy BackupMeta without careful handling refcounts of FileInfo. * Remove a copy of BackupMeta when detected corrupt backup. 
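The sharing scheme, as a minimal sketch (hypothetical names, heavily simplified from the real BackupEngineImpl/BackupMeta classes): every backup stores shared_ptr handles into one engine-wide map, so a table file referenced by many backups has exactly one FileInfo and only its refcount changes.

  #include <cstdint>
  #include <memory>
  #include <string>
  #include <unordered_map>
  #include <utility>
  #include <vector>

  // Sketch only -- simplified stand-ins for the real FileInfo/BackupMeta.
  struct FileInfo {
    FileInfo(std::string f, uint64_t sz, uint32_t crc)
        : refs(0), filename(std::move(f)), size(sz), checksum_value(crc) {}
    int refs;
    const std::string filename;
    const uint64_t size;
    const uint32_t checksum_value;
  };

  using FileInfoMap =
      std::unordered_map<std::string, std::shared_ptr<FileInfo>>;

  // A backup records shared_ptrs into the engine-wide map instead of copying
  // filenames; a file shared by N backups has one FileInfo with refs == N.
  void AddFileToBackup(FileInfoMap* all_files,
                       std::vector<std::shared_ptr<FileInfo>>* backup_files,
                       const std::shared_ptr<FileInfo>& info) {
    auto it = all_files->find(info->filename);
    if (it == all_files->end()) {
      it = all_files->emplace(info->filename, info).first;
    }
    ++it->second->refs;
    backup_files->push_back(it->second);
  }

Deleting a backup is the mirror image: each handle is dropped, refs is decremented, and the physical file is only removed once refs reaches zero.
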
--- utilities/backupable/backupable_db.cc | 130 ++++++++++++++------------ 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 2a526c940..1d05115ce 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -145,19 +145,26 @@ class BackupEngineImpl : public BackupEngine { FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum) : refs(0), filename(fname), size(sz), checksum_value(checksum) {} + FileInfo(const FileInfo&) = delete; + FileInfo& operator=(const FileInfo&) = delete; + int refs; const std::string filename; const uint64_t size; - uint32_t checksum_value; + const uint32_t checksum_value; }; class BackupMeta { public: BackupMeta(const std::string& meta_filename, - std::unordered_map* file_infos, Env* env) + std::unordered_map>* file_infos, + Env* env) : timestamp_(0), size_(0), meta_filename_(meta_filename), file_infos_(file_infos), env_(env) {} + BackupMeta(const BackupMeta&) = delete; + BackupMeta& operator=(const BackupMeta&) = delete; + ~BackupMeta() {} void RecordTimestamp() { @@ -177,7 +184,7 @@ class BackupEngineImpl : public BackupEngine { return sequence_number_; } - Status AddFile(const FileInfo& file_info); + Status AddFile(std::shared_ptr file_info); void Delete(bool delete_meta = true); @@ -185,14 +192,14 @@ class BackupEngineImpl : public BackupEngine { return files_.empty(); } - const FileInfo* GetFile(const std::string& filename) const { + std::shared_ptr GetFile(const std::string& filename) const { auto it = file_infos_->find(filename); if (it == file_infos_->end()) return nullptr; - return &it->second; + return it->second; } - const std::vector& GetFiles() { + const std::vector>& GetFiles() { return files_; } @@ -207,8 +214,8 @@ class BackupEngineImpl : public BackupEngine { uint64_t size_; std::string const meta_filename_; // files with relative paths (without "/" prefix!!) 
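  // (files_ and file_infos_ below move from value semantics to
  // std::shared_ptr<FileInfo>, matching the sharing scheme described in the
  // summary above)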
- std::vector files_; - std::unordered_map* file_infos_; + std::vector> files_; + std::unordered_map>* file_infos_; Env* env_; static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB @@ -297,9 +304,11 @@ class BackupEngineImpl : public BackupEngine { // backup state data BackupID latest_backup_id_; - std::map backups_; - std::map > corrupt_backups_; - std::unordered_map backuped_file_infos_; + std::map> backups_; + std::map>> corrupt_backups_; + std::unordered_map> backuped_file_infos_; std::atomic stop_backup_; // options data @@ -382,9 +391,10 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, continue; } assert(backups_.find(backup_id) == backups_.end()); - backups_.insert(std::make_pair( - backup_id, BackupMeta(GetBackupMetaFile(backup_id), - &backuped_file_infos_, backup_env_))); + backups_.emplace(backup_id, + unique_ptr(new BackupMeta( + GetBackupMetaFile(backup_id), + &backuped_file_infos_, backup_env_))); } if (options_.destroy_old_data) { // Destory old data @@ -396,16 +406,16 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, } else { // Load data from storage // load the backups if any for (auto& backup : backups_) { - Status s = backup.second.LoadFromFile(options_.backup_dir); + Status s = backup.second->LoadFromFile(options_.backup_dir); if (!s.ok()) { Log(options_.info_log, "Backup %u corrupted -- %s", backup.first, s.ToString().c_str()); corrupt_backups_.insert(std::make_pair( - backup.first, std::make_pair(s, backup.second))); + backup.first, std::make_pair(s, std::move(backup.second)))); } } - for (auto corrupt : corrupt_backups_) { + for (const auto& corrupt : corrupt_backups_) { backups_.erase(backups_.find(corrupt.first)); } @@ -465,13 +475,14 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { BackupID new_backup_id = latest_backup_id_ + 1; assert(backups_.find(new_backup_id) == backups_.end()); - auto ret = backups_.insert(std::make_pair( - new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), - &backuped_file_infos_, backup_env_))); + auto ret = backups_.emplace(new_backup_id, + unique_ptr(new BackupMeta( + GetBackupMetaFile(new_backup_id), + &backuped_file_infos_, backup_env_))); assert(ret.second == true); auto& new_backup = ret.first->second; - new_backup.RecordTimestamp(); - new_backup.SetSequenceNumber(sequence_number); + new_backup->RecordTimestamp(); + new_backup->SetSequenceNumber(sequence_number); auto start_backup = backup_env_-> NowMicros(); @@ -506,7 +517,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // * if it's kTableFile, then it's shared // * if it's kDescriptorFile, limit the size to manifest_file_size s = BackupFile(new_backup_id, - &new_backup, + new_backup.get(), options_.share_table_files && type == kTableFile, db->GetName(), /* src_dir */ live_files[i], /* src_fname */ @@ -521,7 +532,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // we only care about live log files // copy the file into backup_dir/files// s = BackupFile(new_backup_id, - &new_backup, + new_backup.get(), false, /* not shared */ db->GetOptions().wal_dir, live_wal_files[i]->PathName(), @@ -543,7 +554,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { if (s.ok()) { // persist the backup metadata on the disk - s = new_backup.StoreToFile(options_.sync); + s = new_backup->StoreToFile(options_.sync); } if (s.ok()) { // install the newly created backup meta! 
(atomic) @@ -591,11 +602,11 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { Log(options_.info_log, "Backup DONE. All is good"); // backup_speed is in byte/second - double backup_speed = new_backup.GetSize() / (1.048576 * backup_time); + double backup_speed = new_backup->GetSize() / (1.048576 * backup_time); Log(options_.info_log, "Backup number of files: %u", - new_backup.GetNumberFiles()); + new_backup->GetNumberFiles()); Log(options_.info_log, "Backup size: %" PRIu64 " bytes", - new_backup.GetSize()); + new_backup->GetSize()); Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time); Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); Log(options_.info_log, "Backup Statistics %s", @@ -624,20 +635,20 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { Log(options_.info_log, "Deleting backup %u", backup_id); auto backup = backups_.find(backup_id); if (backup != backups_.end()) { - backup->second.Delete(); + backup->second->Delete(); backups_.erase(backup); } else { auto corrupt = corrupt_backups_.find(backup_id); if (corrupt == corrupt_backups_.end()) { return Status::NotFound("Backup not found"); } - corrupt->second.second.Delete(); + corrupt->second.second->Delete(); corrupt_backups_.erase(corrupt); } std::vector to_delete; for (auto& itr : backuped_file_infos_) { - if (itr.second.refs == 0) { + if (itr.second->refs == 0) { Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), s.ToString().c_str()); @@ -660,10 +671,11 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { backup_info->reserve(backups_.size()); for (auto& backup : backups_) { - if (!backup.second.Empty()) { + if (!backup.second->Empty()) { backup_info->push_back(BackupInfo( - backup.first, backup.second.GetTimestamp(), backup.second.GetSize(), - backup.second.GetNumberFiles())); + backup.first, backup.second->GetTimestamp(), + backup.second->GetSize(), + backup.second->GetNumberFiles())); } } } @@ -689,7 +701,7 @@ Status BackupEngineImpl::RestoreDBFromBackup( return Status::NotFound("Backup not found"); } auto& backup = backup_itr->second; - if (backup.Empty()) { + if (backup->Empty()) { return Status::NotFound("Backup not found"); } @@ -737,7 +749,8 @@ Status BackupEngineImpl::RestoreDBFromBackup( options_.restore_rate_limit, copy_file_buffer_size_)); } Status s; - for (auto& file : backup.GetFiles()) { + for (const auto& file_info : backup->GetFiles()) { + const std::string &file = file_info->filename; std::string dst; // 1. 
extract the filename size_t slash = file.find_last_of('/'); @@ -772,9 +785,7 @@ Status BackupEngineImpl::RestoreDBFromBackup( break; } - const auto iter = backuped_file_infos_.find(file); - assert(iter != backuped_file_infos_.end()); - if (iter->second.checksum_value != checksum_value) { + if (file_info->checksum_value != checksum_value) { s = Status::Corruption("Checksum check failed"); break; } @@ -988,7 +999,8 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup, } } if (s.ok()) { - s = backup->AddFile(FileInfo(dst_relative, size, checksum_value)); + s = backup->AddFile(std::make_shared( + dst_relative, size, checksum_value)); } return s; } @@ -1107,34 +1119,34 @@ Status BackupEngineImpl::GarbageCollect() { // ------- BackupMeta class -------- -Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) { - size_ += file_info.size; - files_.push_back(file_info.filename); - - auto itr = file_infos_->find(file_info.filename); +Status BackupEngineImpl::BackupMeta::AddFile( + std::shared_ptr file_info) { + auto itr = file_infos_->find(file_info->filename); if (itr == file_infos_->end()) { - auto ret = file_infos_->insert({file_info.filename, file_info}); + auto ret = file_infos_->emplace(file_info->filename, file_info); if (ret.second) { - ret.first->second.refs = 1; + itr = ret.first; + itr->second->refs = 1; } else { // if this happens, something is seriously wrong return Status::Corruption("In memory metadata insertion error"); } } else { - if (itr->second.checksum_value != file_info.checksum_value) { + if (itr->second->checksum_value != file_info->checksum_value) { return Status::Corruption("Checksum mismatch for existing backup file"); } - ++itr->second.refs; // increase refcount if already present + ++itr->second->refs; // increase refcount if already present } + size_ += file_info->size; + files_.push_back(itr->second); + return Status::OK(); } void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) { for (const auto& file : files_) { - auto itr = file_infos_->find(file); - assert(itr != file_infos_->end()); - --(itr->second.refs); // decrease refcount + --file->refs; // decrease refcount } files_.clear(); // delete meta file @@ -1179,7 +1191,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( num_files = static_cast(strtoul(data.data(), &next, 10)); data.remove_prefix(next - data.data() + 1); // +1 for '\n' - std::vector files; + std::vector> files; Slice checksum_prefix("crc32 "); @@ -1188,8 +1200,8 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( std::string filename = GetSliceUntil(&line, ' ').ToString(); uint64_t size; - const FileInfo* file_info = GetFile(filename); - if (file_info != nullptr) { + const std::shared_ptr file_info = GetFile(filename); + if (file_info) { size = file_info->size; } else { s = env_->GetFileSize(backup_dir + "/" + filename, &size); @@ -1215,7 +1227,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( return Status::Corruption("Unknown checksum type"); } - files.emplace_back(filename, size, checksum_value); + files.emplace_back(new FileInfo(filename, size, checksum_value)); } if (s.ok() && data.size() > 0) { @@ -1224,6 +1236,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( } if (s.ok()) { + files_.reserve(files.size()); for (const auto& file_info : files) { s = AddFile(file_info); if (!s.ok()) { @@ -1253,12 +1266,9 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { sequence_number_); len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); for (const auto& file : 
files_) { - const auto& iter = file_infos_->find(file); - - assert(iter != file_infos_->end()); // use crc32 for now, switch to something else if needed len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n", - file.c_str(), iter->second.checksum_value); + file->filename.c_str(), file->checksum_value); } s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); From 9132e52ea4fd60886616cbec6c412f88117333fa Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 8 Jan 2015 17:51:08 -0800 Subject: [PATCH 672/829] DB Stats Dump to print total stall time Summary: Add printing of stall time in DB Stats: Sample outputs: ** DB Stats ** Uptime(secs): 53.2 total, 1.7 interval Cumulative writes: 625940 writes, 625939 keys, 625940 batches, 1.0 writes per batch, 0.49 GB user ingest, stall micros: 50691070 Cumulative WAL: 625940 writes, 625939 syncs, 1.00 writes per sync, 0.49 GB written Interval writes: 10859 writes, 10859 keys, 10859 batches, 1.0 writes per batch, 8.7 MB user ingest, stall micros: 1692319 Interval WAL: 10859 writes, 10859 syncs, 1.00 writes per sync, 0.01 MB written Test Plan: make all check verify printing using db_bench Reviewers: igor, yhchiang, rven, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D31239 --- db/db_impl.cc | 43 ++++++++++++++++++++++-------------- db/internal_stats.cc | 13 +++++++---- db/internal_stats.h | 5 +++++ include/rocksdb/statistics.h | 12 +++++----- 4 files changed, 48 insertions(+), 25 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 412146a3e..4720742ae 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3103,26 +3103,37 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue Status DBImpl::DelayWrite(uint64_t expiration_time) { - StopWatch sw(env_, stats_, WRITE_STALL); - bool has_timeout = (expiration_time > 0); - auto delay = write_controller_.GetDelay(); - if (write_controller_.IsStopped() == false && delay > 0) { - mutex_.Unlock(); - // hopefully we don't have to sleep more than 2 billion microseconds - env_->SleepForMicroseconds(static_cast(delay)); - mutex_.Lock(); - } + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); + bool has_timeout = (expiration_time > 0); + auto delay = write_controller_.GetDelay(); + if (write_controller_.IsStopped() == false && delay > 0) { + mutex_.Unlock(); + delayed = true; + // hopefully we don't have to sleep more than 2 billion microseconds + env_->SleepForMicroseconds(static_cast(delay)); + mutex_.Lock(); + } - while (bg_error_.ok() && write_controller_.IsStopped()) { - if (has_timeout) { - bg_cv_.TimedWait(expiration_time); - if (env_->NowMicros() > expiration_time) { - return Status::TimedOut(); + while (bg_error_.ok() && write_controller_.IsStopped()) { + delayed = true; + if (has_timeout) { + bg_cv_.TimedWait(expiration_time); + if (env_->NowMicros() > expiration_time) { + return Status::TimedOut(); + } + } else { + bg_cv_.Wait(); } - } else { - bg_cv_.Wait(); } } + if (delayed) { + default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS, + time_delayed); + RecordTick(stats_, STALL_MICROS, time_delayed); + } return bg_error_; } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index c14a03c12..e27e74de7 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -300,6 +300,7 @@ void 
InternalStats::DumpDBStats(std::string* value) { uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; + uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS]; // Data // writes: total number of write requests. // keys: total number of key updates issued by all the write requests @@ -311,10 +312,11 @@ void InternalStats::DumpDBStats(std::string* value) { // The format is the same for interval stats. snprintf(buf, sizeof(buf), "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 - " batches, %.1f writes per batch, %.2f GB user ingest\n", + " batches, %.1f writes per batch, %.2f GB user ingest, " + "stall micros: %" PRIu64 "\n", write_other + write_self, num_keys_written, write_self, (write_other + write_self) / static_cast(write_self + 1), - user_bytes_written / kGB); + user_bytes_written / kGB, write_stall_micros); value->append(buf); // WAL snprintf(buf, sizeof(buf), @@ -332,12 +334,14 @@ void InternalStats::DumpDBStats(std::string* value) { num_keys_written - db_stats_snapshot_.num_keys_written; snprintf(buf, sizeof(buf), "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 - " batches, %.1f writes per batch, %.1f MB user ingest\n", + " batches, %.1f writes per batch, %.1f MB user ingest, " + "stall micros: %" PRIu64 "\n", interval_write_other + interval_write_self, interval_num_keys_written, interval_write_self, static_cast(interval_write_other + interval_write_self) / (interval_write_self + 1), - (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB); + (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, + write_stall_micros - db_stats_snapshot_.write_stall_micros); value->append(buf); uint64_t interval_write_with_wal = @@ -363,6 +367,7 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.wal_bytes = wal_bytes; db_stats_snapshot_.wal_synced = wal_synced; db_stats_snapshot_.write_with_wal = write_with_wal; + db_stats_snapshot_.write_stall_micros = write_stall_micros; } void InternalStats::DumpCFStats(std::string* value) { diff --git a/db/internal_stats.h b/db/internal_stats.h index 702008032..c1d77b6b6 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -77,6 +77,7 @@ class InternalStats { WRITE_DONE_BY_OTHER, WRITE_DONE_BY_SELF, WRITE_WITH_WAL, + WRITE_STALL_MICROS, INTERNAL_DB_STATS_ENUM_MAX, }; @@ -287,6 +288,8 @@ class InternalStats { // to multiple keys. num_keys_written is total number of keys updated by all // those writes. uint64_t num_keys_written; + // Total time writes delayed by stalls. + uint64_t write_stall_micros; double seconds_up; DBStatsSnapshot() @@ -297,6 +300,7 @@ class InternalStats { write_other(0), write_self(0), num_keys_written(0), + write_stall_micros(0), seconds_up(0) {} } db_stats_snapshot_; @@ -334,6 +338,7 @@ class InternalStats { WRITE_DONE_BY_OTHER, WRITE_DONE_BY_SELF, WRITE_WITH_WAL, + WRITE_STALL_MICROS, INTERNAL_DB_STATS_ENUM_MAX, }; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 87ac321c9..7d0dad5d6 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -73,12 +73,14 @@ enum Tickers : uint32_t { NO_FILE_CLOSES, NO_FILE_OPENS, NO_FILE_ERRORS, - // Time system had to wait to do LO-L1 compactions + // DEPRECATED Time system had to wait to do LO-L1 compactions STALL_L0_SLOWDOWN_MICROS, - // Time system had to wait to move memtable to L1. 
+ // DEPRECATED Time system had to wait to move memtable to L1. STALL_MEMTABLE_COMPACTION_MICROS, - // write throttle because of too many files in L0 + // DEPRECATED write throttle because of too many files in L0 STALL_L0_NUM_FILES_MICROS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, RATE_LIMIT_DELAY_MILLIS, NO_ITERATORS, // number of iterators currently open @@ -160,6 +162,7 @@ const std::vector> TickersNameMap = { {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {STALL_MICROS, "rocksdb.stall.micros"}, {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, {NO_ITERATORS, "rocksdb.num.iterators"}, {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, @@ -188,8 +191,7 @@ const std::vector> TickersNameMap = { {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, -}; + {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, }; /** * Keep adding histogram's here. From abb9b95ffeb83a8d1ac7da0454a85ecea99adea6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 9 Jan 2015 12:57:11 -0800 Subject: [PATCH 673/829] Move compression functions from port/ to util/ Summary: We keep checksum functions in util/, there is no reason for compression to be in port/ Test Plan: compiles Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31281 --- db/filename.h | 4 +- port/port_posix.h | 352 -------------------------- table/block_based_table_builder.cc | 21 +- table/format.cc | 20 +- util/compression.h | 367 ++++++++++++++++++++++++++++ util/thread_status_updater.cc | 1 + util/thread_status_updater.h | 8 +- utilities/document/json_document.cc | 4 +- 8 files changed, 399 insertions(+), 378 deletions(-) create mode 100644 util/compression.h diff --git a/db/filename.h b/db/filename.h index 4136ff12e..fda873676 100644 --- a/db/filename.h +++ b/db/filename.h @@ -14,10 +14,12 @@ #include #include #include + +#include "port/port.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/transaction_log.h" -#include "port/port.h" namespace rocksdb { diff --git a/port/port_posix.h b/port/port_posix.h index 476542cfc..f730c483b 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -34,27 +34,10 @@ #include #endif #include -#ifdef SNAPPY -#include -#endif - -#ifdef ZLIB -#include -#endif - -#ifdef BZIP2 -#include -#endif - -#if defined(LZ4) -#include -#include -#endif #include #include #include -#include "rocksdb/options.h" #ifndef PLATFORM_IS_LITTLE_ENDIAN #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) @@ -149,341 +132,6 @@ typedef pthread_once_t OnceType; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); -inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef SNAPPY - output->resize(snappy::MaxCompressedLength(length)); - size_t outlen; - snappy::RawCompress(input, length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#endif - - return false; -} - -inline bool Snappy_GetUncompressedLength(const char* 
input, size_t length, - size_t* result) { -#ifdef SNAPPY - return snappy::GetUncompressedLength(input, length, result); -#else - return false; -#endif -} - -inline bool Snappy_Uncompress(const char* input, size_t length, - char* output) { -#ifdef SNAPPY - return snappy::RawUncompress(input, length, output); -#else - return false; -#endif -} - -inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef ZLIB - // The memLevel parameter specifies how much memory should be allocated for - // the internal compression state. - // memLevel=1 uses minimum memory but is slow and reduces compression ratio. - // memLevel=9 uses maximum memory for optimal speed. - // The default value is 8. See zconf.h for more details. - static const int memLevel = 8; - z_stream _stream; - memset(&_stream, 0, sizeof(z_stream)); - int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, - memLevel, opts.strategy); - if (st != Z_OK) { - return false; - } - - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - - // Compress the input, and put compressed data in output. - _stream.next_in = (Bytef *)input; - _stream.avail_in = static_cast(length); - - // Initialize the output size. - _stream.avail_out = static_cast(length); - _stream.next_out = (Bytef*)&(*output)[0]; - - size_t old_sz = 0, new_sz = 0, new_sz_delta = 0; - bool done = false; - while (!done) { - st = deflate(&_stream, Z_FINISH); - switch (st) { - case Z_STREAM_END: - done = true; - break; - case Z_OK: - // No output space. Increase the output space by 20%. - // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz_delta = static_cast(output->size() * 0.2); - new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); - output->resize(new_sz); - // Set more output. - _stream.next_out = (Bytef *)&(*output)[old_sz]; - _stream.avail_out = static_cast(new_sz - old_sz); - break; - case Z_BUF_ERROR: - default: - deflateEnd(&_stream); - return false; - } - } - - output->resize(output->size() - _stream.avail_out); - deflateEnd(&_stream); - return true; -#endif - return false; -} - -inline char* Zlib_Uncompress(const char* input_data, size_t input_length, - int* decompress_size, int windowBits = -14) { -#ifdef ZLIB - z_stream _stream; - memset(&_stream, 0, sizeof(z_stream)); - - // For raw inflate, the windowBits should be -8..-15. - // If windowBits is bigger than zero, it will use either zlib - // header or gzip header. Adding 32 to it will do automatic detection. - int st = inflateInit2(&_stream, - windowBits > 0 ? windowBits + 32 : windowBits); - if (st != Z_OK) { - return nullptr; - } - - _stream.next_in = (Bytef *)input_data; - _stream.avail_in = static_cast(input_length); - - // Assume the decompressed data size will 5x of compressed size. - size_t output_len = input_length * 5; - char* output = new char[output_len]; - size_t old_sz = output_len; - - _stream.next_out = (Bytef *)output; - _stream.avail_out = static_cast(output_len); - - char* tmp = nullptr; - size_t output_len_delta; - bool done = false; - - //while(_stream.next_in != nullptr && _stream.avail_in != 0) { - while (!done) { - st = inflate(&_stream, Z_SYNC_FLUSH); - switch (st) { - case Z_STREAM_END: - done = true; - break; - case Z_OK: - // No output space. Increase the output space by 20%. 
- old_sz = output_len; - output_len_delta = static_cast(output_len * 0.2); - output_len += output_len_delta < 10 ? 10 : output_len_delta; - tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; - - // Set more output. - _stream.next_out = (Bytef *)(output + old_sz); - _stream.avail_out = static_cast(output_len - old_sz); - break; - case Z_BUF_ERROR: - default: - delete[] output; - inflateEnd(&_stream); - return nullptr; - } - } - - *decompress_size = static_cast(output_len - _stream.avail_out); - inflateEnd(&_stream); - return output; -#endif - - return nullptr; -} - -inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef BZIP2 - bz_stream _stream; - memset(&_stream, 0, sizeof(bz_stream)); - - // Block size 1 is 100K. - // 0 is for silent. - // 30 is the default workFactor - int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); - if (st != BZ_OK) { - return false; - } - - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - - // Compress the input, and put compressed data in output. - _stream.next_in = (char *)input; - _stream.avail_in = static_cast(length); - - // Initialize the output size. - _stream.next_out = (char *)&(*output)[0]; - _stream.avail_out = static_cast(length); - - size_t old_sz = 0, new_sz = 0; - while (_stream.next_in != nullptr && _stream.avail_in != 0) { - st = BZ2_bzCompress(&_stream, BZ_FINISH); - switch (st) { - case BZ_STREAM_END: - break; - case BZ_FINISH_OK: - // No output space. Increase the output space by 20%. - // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz = static_cast(output->size() * 1.2); - output->resize(new_sz); - // Set more output. - _stream.next_out = (char *)&(*output)[old_sz]; - _stream.avail_out = static_cast(new_sz - old_sz); - break; - case BZ_SEQUENCE_ERROR: - default: - BZ2_bzCompressEnd(&_stream); - return false; - } - } - - output->resize(output->size() - _stream.avail_out); - BZ2_bzCompressEnd(&_stream); - return true; -#endif - return false; -} - -inline char* BZip2_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { -#ifdef BZIP2 - bz_stream _stream; - memset(&_stream, 0, sizeof(bz_stream)); - - int st = BZ2_bzDecompressInit(&_stream, 0, 0); - if (st != BZ_OK) { - return nullptr; - } - - _stream.next_in = (char *)input_data; - _stream.avail_in = static_cast(input_length); - - // Assume the decompressed data size will be 5x of compressed size. - size_t output_len = input_length * 5; - char* output = new char[output_len]; - size_t old_sz = output_len; - - _stream.next_out = (char *)output; - _stream.avail_out = static_cast(output_len); - - char* tmp = nullptr; - - while(_stream.next_in != nullptr && _stream.avail_in != 0) { - st = BZ2_bzDecompress(&_stream); - switch (st) { - case BZ_STREAM_END: - break; - case BZ_OK: - // No output space. Increase the output space by 20%. - old_sz = output_len; - output_len = static_cast(output_len * 1.2); - tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; - - // Set more output. 
- _stream.next_out = (char *)(output + old_sz); - _stream.avail_out = static_cast(output_len - old_sz); - break; - default: - delete[] output; - BZ2_bzDecompressEnd(&_stream); - return nullptr; - } - } - - *decompress_size = static_cast(output_len - _stream.avail_out); - BZ2_bzDecompressEnd(&_stream); - return output; -#endif - return nullptr; -} - -inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, - size_t length, ::std::string* output) { -#ifdef LZ4 - int compressBound = LZ4_compressBound(static_cast(length)); - output->resize(static_cast(8 + compressBound)); - char* p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); - int outlen = LZ4_compress_limitedOutput( - input, p + 8, static_cast(length), compressBound); - if (outlen == 0) { - return false; - } - output->resize(static_cast(8 + outlen)); - return true; -#endif - return false; -} - -inline char* LZ4_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { -#ifdef LZ4 - if (input_length < 8) { - return nullptr; - } - int output_len; - memcpy(&output_len, input_data, sizeof(output_len)); - char *output = new char[output_len]; - *decompress_size = LZ4_decompress_safe_partial( - input_data + 8, output, static_cast(input_length - 8), output_len, - output_len); - if (*decompress_size < 0) { - delete[] output; - return nullptr; - } - return output; -#endif - return nullptr; -} - -inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, - size_t length, ::std::string* output) { -#ifdef LZ4 - int compressBound = LZ4_compressBound(static_cast(length)); - output->resize(static_cast(8 + compressBound)); - char* p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); - int outlen; -#ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 - outlen = LZ4_compressHC2_limitedOutput(input, p + 8, static_cast(length), - compressBound, opts.level); -#else - outlen = LZ4_compressHC_limitedOutput(input, p + 8, static_cast(length), - compressBound); -#endif - if (outlen == 0) { - return false; - } - output->resize(static_cast(8 + outlen)); - return true; -#endif - return false; -} - #define CACHE_LINE_SIZE 64U #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 0a93e309d..cdae8508b 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -39,6 +39,7 @@ #include "table/table_builder.h" #include "util/coding.h" +#include "util/compression.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/xxhash.h" @@ -312,36 +313,36 @@ Slice CompressBlock(const Slice& raw, // supported in this platform and (2) the compression rate is "good enough". switch (*type) { case kSnappyCompression: - if (port::Snappy_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (Snappy_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kZlibCompression: - if (port::Zlib_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (Zlib_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. 
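// [Editor's sketch; not part of this patch] Each case in the CompressBlock
// switch above follows the same compress-or-store-raw pattern: try the codec
// through the wrappers that this change moves into util/compression.h, keep
// the output only if the ratio is good enough, and otherwise fall back to
// storing the block uncompressed. The remaining codecs below follow the same
// shape. A minimal illustration of the pattern; the helper name and the
// "save at least 1/8" threshold are assumptions for illustration (the real
// check is GoodCompressionRatio() in this file).
#include <string>
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "util/compression.h"

namespace rocksdb {
inline Slice CompressOrStoreRaw(const CompressionOptions& opts,
                                const Slice& raw, std::string* scratch,
                                CompressionType* type) {
  if (*type == kSnappyCompression &&
      Snappy_Compress(opts, raw.data(), raw.size(), scratch) &&
      scratch->size() < raw.size() - (raw.size() / 8u)) {
    return Slice(*scratch);   // keep the compressed copy
  }
  *type = kNoCompression;     // codec unavailable or ratio too poor
  return raw;                 // store the raw block instead
}
}  // namespace rocksdb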
case kBZip2Compression: - if (port::BZip2_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (BZip2_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kLZ4Compression: - if (port::LZ4_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kLZ4HCCompression: - if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4HC_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } diff --git a/table/format.cc b/table/format.cc index 227090bb2..c7f96f427 100644 --- a/table/format.cc +++ b/table/format.cc @@ -12,10 +12,10 @@ #include #include -#include "port/port.h" #include "rocksdb/env.h" #include "table/block.h" #include "util/coding.h" +#include "util/compression.h" #include "util/crc32c.h" #include "util/perf_context_imp.h" #include "util/xxhash.h" @@ -367,19 +367,19 @@ Status UncompressBlockContents(const char* data, size_t n, size_t ulength = 0; static char snappy_corrupt_msg[] = "Snappy not supported or corrupted Snappy compressed block contents"; - if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + if (!Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } ubuf = std::unique_ptr(new char[ulength]); - if (!port::Snappy_Uncompress(data, n, ubuf.get())) { + if (!Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); break; } case kZlibCompression: - ubuf = std::unique_ptr( - port::Zlib_Uncompress(data, n, &decompress_size)); + ubuf = + std::unique_ptr(Zlib_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; @@ -389,8 +389,8 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = std::unique_ptr( - port::BZip2_Uncompress(data, n, &decompress_size)); + ubuf = + std::unique_ptr(BZip2_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; @@ -400,8 +400,7 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = std::unique_ptr( - port::LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(LZ4_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; @@ -411,8 +410,7 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = std::unique_ptr( - port::LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(LZ4_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char 
lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; diff --git a/util/compression.h b/util/compression.h new file mode 100644 index 000000000..a0ca91f7f --- /dev/null +++ b/util/compression.h @@ -0,0 +1,367 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +#pragma once + +#include "rocksdb/options.h" + +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#if defined(LZ4) +#include +#include +#endif + +namespace rocksdb { + +inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#endif + + return false; +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + return false; +#endif +} + +inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, + memLevel, opts.strategy); + if (st != Z_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef *)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(length); + _stream.next_out = (Bytef*)&(*output)[0]; + + size_t old_sz = 0, new_sz = 0, new_sz_delta = 0; + bool done = false; + while (!done) { + st = deflate(&_stream, Z_FINISH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz_delta = static_cast(output->size() * 0.2); + new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); + output->resize(new_sz); + // Set more output. 
+ _stream.next_out = (Bytef *)&(*output)[old_sz]; + _stream.avail_out = static_cast(new_sz - old_sz); + break; + case Z_BUF_ERROR: + default: + deflateEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + deflateEnd(&_stream); + return true; +#endif + return false; +} + +inline char* Zlib_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, int windowBits = -14) { +#ifdef ZLIB + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = inflateInit2(&_stream, + windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + _stream.next_in = (Bytef *)input_data; + _stream.avail_in = static_cast(input_length); + + // Assume the decompressed data size will 5x of compressed size. + size_t output_len = input_length * 5; + char* output = new char[output_len]; + size_t old_sz = output_len; + + _stream.next_out = (Bytef *)output; + _stream.avail_out = static_cast(output_len); + + char* tmp = nullptr; + size_t output_len_delta; + bool done = false; + + //while(_stream.next_in != nullptr && _stream.avail_in != 0) { + while (!done) { + st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len_delta = static_cast(output_len * 0.2); + output_len += output_len_delta < 10 ? 10 : output_len_delta; + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. + _stream.next_out = (Bytef *)(output + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + case Z_BUF_ERROR: + default: + delete[] output; + inflateEnd(&_stream); + return nullptr; + } + } + + *decompress_size = static_cast(output_len - _stream.avail_out); + inflateEnd(&_stream); + return output; +#endif + + return nullptr; +} + +inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(length); + + // Compress the input, and put compressed data in output. + _stream.next_in = (char *)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.next_out = (char *)&(*output)[0]; + _stream.avail_out = static_cast(length); + + size_t old_sz = 0, new_sz = 0; + while (_stream.next_in != nullptr && _stream.avail_in != 0) { + st = BZ2_bzCompress(&_stream, BZ_FINISH); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_FINISH_OK: + // No output space. Increase the output space by 20%. + // (Should we fail the compression since it expands the size?) + old_sz = output->size(); + new_sz = static_cast(output->size() * 1.2); + output->resize(new_sz); + // Set more output. 
+ _stream.next_out = (char *)&(*output)[old_sz]; + _stream.avail_out = static_cast(new_sz - old_sz); + break; + case BZ_SEQUENCE_ERROR: + default: + BZ2_bzCompressEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out); + BZ2_bzCompressEnd(&_stream); + return true; +#endif + return false; +} + +inline char* BZip2_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef BZIP2 + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + _stream.next_in = (char *)input_data; + _stream.avail_in = static_cast(input_length); + + // Assume the decompressed data size will be 5x of compressed size. + size_t output_len = input_length * 5; + char* output = new char[output_len]; + size_t old_sz = output_len; + + _stream.next_out = (char *)output; + _stream.avail_out = static_cast(output_len); + + char* tmp = nullptr; + + while(_stream.next_in != nullptr && _stream.avail_in != 0) { + st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_OK: + // No output space. Increase the output space by 20%. + old_sz = output_len; + output_len = static_cast(output_len * 1.2); + tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. + _stream.next_out = (char *)(output + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + default: + delete[] output; + BZ2_bzDecompressEnd(&_stream); + return nullptr; + } + } + + *decompress_size = static_cast(output_len - _stream.avail_out); + BZ2_bzDecompressEnd(&_stream); + return output; +#endif + return nullptr; +} + +inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, + size_t length, ::std::string* output) { +#ifdef LZ4 + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(8 + compressBound)); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + int outlen = LZ4_compress_limitedOutput( + input, p + 8, static_cast(length), compressBound); + if (outlen == 0) { + return false; + } + output->resize(static_cast(8 + outlen)); + return true; +#endif + return false; +} + +inline char* LZ4_Uncompress(const char* input_data, size_t input_length, + int* decompress_size) { +#ifdef LZ4 + if (input_length < 8) { + return nullptr; + } + int output_len; + memcpy(&output_len, input_data, sizeof(output_len)); + char *output = new char[output_len]; + *decompress_size = LZ4_decompress_safe_partial( + input_data + 8, output, static_cast(input_length - 8), output_len, + output_len); + if (*decompress_size < 0) { + delete[] output; + return nullptr; + } + return output; +#endif + return nullptr; +} + +inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(8 + compressBound)); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + int outlen; +#ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 + outlen = LZ4_compressHC2_limitedOutput(input, p + 8, static_cast(length), + compressBound, opts.level); +#else + outlen = LZ4_compressHC_limitedOutput(input, p + 8, static_cast(length), + compressBound); +#endif + if (outlen == 0) { + return false; + } + output->resize(static_cast(8 + outlen)); + return true; +#endif + 
return false; +} +} // namespace rocksdb diff --git a/util/thread_status_updater.cc b/util/thread_status_updater.cc index 119174db5..feb129885 100644 --- a/util/thread_status_updater.cc +++ b/util/thread_status_updater.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include #include "port/likely.h" #include "util/mutexlock.h" #include "util/thread_status_updater.h" diff --git a/util/thread_status_updater.h b/util/thread_status_updater.h index 8cb80022f..c97102a96 100644 --- a/util/thread_status_updater.h +++ b/util/thread_status_updater.h @@ -27,13 +27,15 @@ // This means user might not always get full information, but whenever // returned by the GetThreadList() is guaranteed to be consistent. #pragma once -#include #include +#include +#include +#include #include #include -#include -#include +#include #include + #include "rocksdb/status.h" #include "rocksdb/thread_status.h" #include "port/port_posix.h" diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index e5b745573..254574113 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -11,9 +11,11 @@ #endif #include + #include -#include #include +#include +#include #include #include "third-party/rapidjson/reader.h" From 15d2abbec306bfd9ed7e7eadaff10bee74fe5367 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 9 Jan 2015 13:04:06 -0800 Subject: [PATCH 674/829] Fix build issues --- db/db_bench.cc | 79 +++++++++++++++++++++++---------------------- db/db_test.cc | 11 ++++--- table/table_test.cc | 20 ++++-------- 3 files changed, 53 insertions(+), 57 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 8e5d07a59..d34dbb34b 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -45,6 +45,7 @@ int main() { #include "port/port.h" #include "port/stack_trace.h" #include "util/crc32c.h" +#include "util/compression.h" #include "util/histogram.h" #include "util/mutexlock.h" #include "util/random.h" @@ -1213,27 +1214,27 @@ class Benchmark { text[len] = '\0'; switch (FLAGS_compression_type_e) { case kSnappyCompression: - result = port::Snappy_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = Snappy_Compress(Options().compression_opts, text, + strlen(text), &compressed); name = "Snappy"; break; case kZlibCompression: - result = port::Zlib_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = Zlib_Compress(Options().compression_opts, text, strlen(text), + &compressed); name = "Zlib"; break; case kBZip2Compression: - result = port::BZip2_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = BZip2_Compress(Options().compression_opts, text, + strlen(text), &compressed); name = "BZip2"; break; case kLZ4Compression: - result = port::LZ4_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = LZ4_Compress(Options().compression_opts, text, strlen(text), + &compressed); name = "LZ4"; break; case kLZ4HCCompression: - result = port::LZ4HC_Compress(Options().compression_opts, text, + result = LZ4HC_Compress(Options().compression_opts, text, strlen(text), &compressed); name = "LZ4HC"; break; @@ -1774,24 +1775,24 @@ class Benchmark { while (ok && bytes < int64_t(1) << 30) { switch (FLAGS_compression_type_e) { case rocksdb::kSnappyCompression: - ok = port::Snappy_Compress(Options().compression_opts, input.data(), - input.size(), 
&compressed); + ok = Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = port::Zlib_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Zlib_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kBZip2Compression: - ok = port::BZip2_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = BZip2_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = port::LZ4_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4HCCompression: - ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; default: ok = false; @@ -1820,24 +1821,24 @@ class Benchmark { bool ok; switch (FLAGS_compression_type_e) { case rocksdb::kSnappyCompression: - ok = port::Snappy_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = port::Zlib_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Zlib_Compress(Options().compression_opts, input.data(), input.size(), + &compressed); break; case rocksdb::kBZip2Compression: - ok = port::BZip2_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = BZip2_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = port::LZ4_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4_Compress(Options().compression_opts, input.data(), input.size(), + &compressed); break; case rocksdb::kLZ4HCCompression: - ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4HC_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; default: ok = false; @@ -1851,27 +1852,27 @@ class Benchmark { case rocksdb::kSnappyCompression: // allocate here to make comparison fair uncompressed = new char[input.size()]; - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - uncompressed); + ok = Snappy_Uncompress(compressed.data(), compressed.size(), + uncompressed); break; case rocksdb::kZlibCompression: - uncompressed = port::Zlib_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = Zlib_Uncompress(compressed.data(), compressed.size(), + &decompress_size); ok = uncompressed != nullptr; break; case rocksdb::kBZip2Compression: - uncompressed = port::BZip2_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(), + &decompress_size); ok = uncompressed != nullptr; break; case rocksdb::kLZ4Compression: - uncompressed = port::LZ4_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), + &decompress_size); ok = uncompressed != nullptr; break; case rocksdb::kLZ4HCCompression: - uncompressed = port::LZ4_Uncompress( - compressed.data(), 
compressed.size(), &decompress_size); + uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), + &decompress_size); ok = uncompressed != nullptr; break; default: diff --git a/db/db_test.cc b/db/db_test.cc index 455d6cb7e..9fa2a40b5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -42,6 +42,7 @@ #include "util/hash_linklist_rep.h" #include "utilities/merge_operators.h" #include "util/logging.h" +#include "util/compression.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/statistics.h" @@ -58,31 +59,31 @@ namespace rocksdb { static bool SnappyCompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(options, in.data(), in.size(), &out); + return Snappy_Compress(options, in.data(), in.size(), &out); } static bool ZlibCompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Zlib_Compress(options, in.data(), in.size(), &out); + return Zlib_Compress(options, in.data(), in.size(), &out); } static bool BZip2CompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::BZip2_Compress(options, in.data(), in.size(), &out); + return BZip2_Compress(options, in.data(), in.size(), &out); } static bool LZ4CompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4_Compress(options, in.data(), in.size(), &out); + return LZ4_Compress(options, in.data(), in.size(), &out); } static bool LZ4HCCompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4HC_Compress(options, in.data(), in.size(), &out); + return LZ4HC_Compress(options, in.data(), in.size(), &out); } static std::string RandomString(Random* rnd, int len) { diff --git a/table/table_test.cc b/table/table_test.cc index 3d603bf31..8810a2254 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -40,6 +40,7 @@ #include "table/plain_table_factory.h" #include "table/get_context.h" +#include "util/compression.h" #include "util/random.h" #include "util/statistics.h" #include "util/testharness.h" @@ -533,9 +534,8 @@ static bool SnappyCompressionSupported() { #ifdef SNAPPY std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return Snappy_Compress(Options().compression_opts, in.data(), in.size(), + &out); #else return false; #endif @@ -545,9 +545,7 @@ static bool ZlibCompressionSupported() { #ifdef ZLIB std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Zlib_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return Zlib_Compress(Options().compression_opts, in.data(), in.size(), &out); #else return false; #endif @@ -557,9 +555,7 @@ static bool BZip2CompressionSupported() { #ifdef BZIP2 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::BZip2_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return BZip2_Compress(Options().compression_opts, in.data(), in.size(), &out); #else return false; #endif @@ -569,8 +565,7 @@ static bool LZ4CompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4_Compress(Options().compression_opts, in.data(), in.size(), - &out); + 
return LZ4_Compress(Options().compression_opts, in.data(), in.size(), &out); #else return false; #endif @@ -580,8 +575,7 @@ static bool LZ4HCCompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), - &out); + return LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), &out); #else return false; #endif From 0aab1005f83b6222b3f477da271bfa49d7034adb Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 18 Dec 2014 23:43:14 +0100 Subject: [PATCH 675/829] [RocksJava] ColumnFamilyDescriptor alignment with listColumnFamilies Summary: Previous to this commit ColumnFamilyDescriptor took a String as name for the ColumnFamily name. String is however encoding dependent which is bad because listColumnFamilies returns byte arrays without any encoding information. All public API call were deprecated and flagged to be removed in 3.10.0 Test Plan: make rocksdbjava make test mvn -f rocksjni.pom package Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D30525 --- java/RocksDBColumnFamilySample.java | 5 +- java/org/rocksdb/ColumnFamilyDescriptor.java | 34 ++++++++++++- java/org/rocksdb/RocksDB.java | 2 +- .../rocksdb/test/AbstractComparatorTest.java | 2 +- java/org/rocksdb/test/ColumnFamilyTest.java | 26 +++++----- java/org/rocksdb/test/KeyMayExistTest.java | 4 +- java/org/rocksdb/test/MergeTest.java | 11 ++--- java/org/rocksdb/test/ReadOnlyTest.java | 6 +-- java/org/rocksdb/test/RocksDBTest.java | 8 ++-- java/rocksjni/portal.h | 2 +- java/rocksjni/rocksjni.cc | 48 +++++++++++-------- 11 files changed, 92 insertions(+), 56 deletions(-) diff --git a/java/RocksDBColumnFamilySample.java b/java/RocksDBColumnFamilySample.java index 5515845cb..da9f4d28b 100644 --- a/java/RocksDBColumnFamilySample.java +++ b/java/RocksDBColumnFamilySample.java @@ -33,7 +33,8 @@ public class RocksDBColumnFamilySample { // create column family columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions())); + new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions())); assert(columnFamilyHandle != null); } finally { @@ -56,7 +57,7 @@ public class RocksDBColumnFamilySample { RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions())); // open the new one, too columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf", new ColumnFamilyOptions())); + "new_cf".getBytes(), new ColumnFamilyOptions())); List columnFamilyHandles = new ArrayList<>(); try { db = RocksDB.open(new DBOptions(), db_path, diff --git a/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/org/rocksdb/ColumnFamilyDescriptor.java index b01c0e858..193865e55 100644 --- a/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -16,8 +16,21 @@ public class ColumnFamilyDescriptor { * options,
* * @param columnFamilyName name of column family. + * @deprecated will be removed in RocksDB 3.10.0. Use {} */ + @Deprecated public ColumnFamilyDescriptor(final String columnFamilyName){ + this(columnFamilyName.getBytes(), new ColumnFamilyOptions()); + } + + /** + * Creates a new Column Family using a name and default + * options, + * + * @param columnFamilyName name of column family. + * @since 3.10.0 + */ + public ColumnFamilyDescriptor(final byte[] columnFamilyName) { + this(columnFamilyName, new ColumnFamilyOptions()); + } @@ -28,9 +41,25 @@ public class ColumnFamilyDescriptor { * @param columnFamilyName name of column family. * @param columnFamilyOptions options to be used with * column family. + * @deprecated will be removed in RocksDB 3.10.0. Use {} */ + @Deprecated public ColumnFamilyDescriptor(final String columnFamilyName, final ColumnFamilyOptions columnFamilyOptions) { + this(columnFamilyName.getBytes(), columnFamilyOptions); + } + + /** + * Creates a new Column Family using a name and custom + * options.
                    + * + * @param columnFamilyName name of column family. + * @param columnFamilyOptions options to be used with + * column family. + * @since 3.10.0 + */ + public ColumnFamilyDescriptor(final byte[] columnFamilyName, + final ColumnFamilyOptions columnFamilyOptions) { columnFamilyName_ = columnFamilyName; columnFamilyOptions_ = columnFamilyOptions; } @@ -39,8 +68,9 @@ public class ColumnFamilyDescriptor { * Retrieve name of column family. * * @return column family name. + * @since 3.10.0 */ - public String columnFamilyName() { + public byte[] columnFamilyName() { return columnFamilyName_; } @@ -53,6 +83,6 @@ public class ColumnFamilyDescriptor { return columnFamilyOptions_; } - private final String columnFamilyName_; + private final byte[] columnFamilyName_; private final ColumnFamilyOptions columnFamilyOptions_; } diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 04a93eacd..22a608207 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -16,7 +16,7 @@ import org.rocksdb.util.Environment; * indicates sth wrong at the RocksDB library side and the call failed. */ public class RocksDB extends RocksObject { - public static final String DEFAULT_COLUMN_FAMILY = "default"; + public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes(); public static final int NOT_FOUND = -1; static { diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/org/rocksdb/test/AbstractComparatorTest.java index 9d1f2fc64..f0281a521 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/org/rocksdb/test/AbstractComparatorTest.java @@ -116,7 +116,7 @@ public abstract class AbstractComparatorTest { new ArrayList<>(); cfDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf", + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions().setComparator( getAscendingIntKeyComparator()))); List cfHandles = new ArrayList<>(); diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 5b51ee718..703ed296f 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -68,7 +68,7 @@ public class ColumnFamilyTest { dbOptions.setCreateIfMissing(true); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - db.createColumnFamily(new ColumnFamilyDescriptor("new_cf", + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions())); db.close(); List columnFamilyNames; @@ -102,7 +102,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -191,12 +191,12 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF", new ColumnFamilyOptions())); + new 
ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); db.dropColumnFamily(tmpColumnFamilyHandle); tmpColumnFamilyHandle.dispose(); @@ -226,7 +226,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -273,7 +273,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -322,7 +322,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); @@ -367,7 +367,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -409,7 +409,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -459,7 +459,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -487,7 +487,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -515,7 +515,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); @@ -543,7 +543,7 @@ public class ColumnFamilyTest { List columnFamilyHandleList = new ArrayList<>(); cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); - cfNames.add(new 
ColumnFamilyDescriptor("new_cf")); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList); diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/org/rocksdb/test/KeyMayExistTest.java index 4fe45e4c0..921a6593c 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/org/rocksdb/test/KeyMayExistTest.java @@ -37,8 +37,8 @@ public class KeyMayExistTest { new ArrayList<>(); List columnFamilyHandleList = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor("default")); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf")); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); diff --git a/java/org/rocksdb/test/MergeTest.java b/java/org/rocksdb/test/MergeTest.java index f90b0b0c1..9bb882e44 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/org/rocksdb/test/MergeTest.java @@ -8,7 +8,6 @@ package org.rocksdb.test; import java.util.List; import java.util.ArrayList; -import org.junit.Assert; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; @@ -73,10 +72,10 @@ public class MergeTest { List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor("default", + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions().setMergeOperatorName( "stringappend"))); - cfDescriptors.add(new ColumnFamilyDescriptor("default", + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions().setMergeOperatorName( "stringappend"))); db = RocksDB.open(opt, db_path_string, @@ -158,10 +157,10 @@ public class MergeTest { new ArrayList<>(); List columnFamilyHandleList = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor("default", + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions().setMergeOperator( stringAppendOperator))); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf", + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions().setMergeOperator( stringAppendOperator))); db = RocksDB.open(opt, db_path_string, @@ -178,7 +177,7 @@ public class MergeTest { // Test also with createColumnFamily columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2", + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions().setMergeOperator(stringAppendOperator))); // writing xx under cfkey2 db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/org/rocksdb/test/ReadOnlyTest.java index bf6bb5eb5..fce704eb5 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/org/rocksdb/test/ReadOnlyTest.java @@ -58,9 +58,9 @@ public class ReadOnlyTest { db = RocksDB.open( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf", new ColumnFamilyOptions()))); + new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions()))); columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions()))); + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions()))); 
db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); @@ -75,7 +75,7 @@ public class ReadOnlyTest { new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions())); cfDescriptors.add( - new ColumnFamilyDescriptor("new_cf2", new ColumnFamilyOptions())); + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions())); db3 = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList2); assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java index df0c04787..a6934b310 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -370,7 +370,7 @@ public class RocksDBTest { columnFamilyDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY)); columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf", + "new_cf".getBytes(), new ColumnFamilyOptions(). setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). @@ -501,7 +501,7 @@ public class RocksDBTest { columnFamilyDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY)); columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf", + "new_cf".getBytes(), new ColumnFamilyOptions(). setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). @@ -556,7 +556,7 @@ public class RocksDBTest { columnFamilyDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY)); columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf", + "new_cf".getBytes(), new ColumnFamilyOptions(). setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). @@ -670,7 +670,7 @@ public class RocksDBTest { columnFamilyDescriptors.add(new ColumnFamilyDescriptor( RocksDB.DEFAULT_COLUMN_FAMILY)); columnFamilyDescriptors.add(new ColumnFamilyDescriptor( - "new_cf", + "new_cf".getBytes(), new ColumnFamilyOptions(). setDisableAutoCompactions(true). setCompactionStyle(CompactionStyle.LEVEL). 
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 9fdab09a4..539e824e5 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -172,7 +172,7 @@ class ColumnFamilyDescriptorJni { static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { static jmethodID mid = env->GetMethodID( getColumnFamilyDescriptorClass(env), - "columnFamilyName", "()Ljava/lang/String;"); + "columnFamilyName", "()[B"); assert(mid != nullptr); return mid; } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 57a20e487..2a536b104 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -74,8 +74,8 @@ jobject rocksdb::DB* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); - std::vector cfnames_to_free; - std::vector jcfnames_for_free; + std::vector cfnames_to_free; + std::vector jcfnames_for_free; std::vector column_families; std::vector handles; @@ -90,9 +90,10 @@ jobject jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, rocksdb::ListJni::getNextMethod(env)); // get ColumnFamilyName - jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env)); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( @@ -100,12 +101,13 @@ jobject rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - const char* cfname = env->GetStringUTFChars(jstr, 0); + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); - jcfnames_for_free.push_back(jstr); - column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + reinterpret_cast(cfname), *cfOptions)); } @@ -116,7 +118,7 @@ jobject for (std::vector::size_type i = 0; i != cfnames_to_free.size(); i++) { // free cfnames - env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); } // check if open operation was successful @@ -157,8 +159,8 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( rocksdb::DB* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); - std::vector cfnames_to_free; - std::vector jcfnames_for_free; + std::vector cfnames_to_free; + std::vector jcfnames_for_free; std::vector column_families; std::vector handles; @@ -173,9 +175,10 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, rocksdb::ListJni::getNextMethod(env)); // get ColumnFamilyName - jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env)); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( @@ -183,12 +186,13 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - const char* cfname = env->GetStringUTFChars(jstr, 0); + jbyte* cfname 
= env->GetByteArrayElements(byteArray, 0); // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); - jcfnames_for_free.push_back(jstr); - column_families.push_back(rocksdb::ColumnFamilyDescriptor(cfname, + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + reinterpret_cast(cfname), *cfOptions)); } @@ -199,7 +203,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( for (std::vector::size_type i = 0; i != cfnames_to_free.size(); i++) { // free cfnames - env->ReleaseStringUTFChars(jcfnames_for_free[i], cfnames_to_free[i]); + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); } // check if open operation was successful @@ -1181,9 +1185,11 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( rocksdb::ColumnFamilyHandle* handle; auto db_handle = reinterpret_cast(jdb_handle); - jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env)); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( @@ -1191,10 +1197,10 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - const char* cfname = env->GetStringUTFChars(jstr, 0); + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); rocksdb::Status s = db_handle->CreateColumnFamily( - *cfOptions, cfname, &handle); - env->ReleaseStringUTFChars(jstr, cfname); + *cfOptions, reinterpret_cast(cfname), &handle); + env->ReleaseByteArrayElements(byteArray, cfname, 0); if (s.ok()) { return reinterpret_cast(handle); From 23ad5f401a3fac0cc88e00f84a5b2a2295f87804 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 10 Jan 2015 20:52:03 +0100 Subject: [PATCH 676/829] [RocksJava] Incorporated changes for D30525 --- java/org/rocksdb/ColumnFamilyDescriptor.java | 6 ++++-- java/rocksjni/rocksjni.cc | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/org/rocksdb/ColumnFamilyDescriptor.java index 193865e55..4c0954740 100644 --- a/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -16,7 +16,8 @@ public class ColumnFamilyDescriptor { * options,
                    * * @param columnFamilyName name of column family. - * @deprecated will be removed in RocksDB 3.10.0. Use {} + * @deprecated will be removed in RocksDB 3.10.0. Use + * {@link #ColumnFamilyDescriptor(byte[])} instead. */ @Deprecated public ColumnFamilyDescriptor(final String columnFamilyName){ @@ -41,7 +42,8 @@ public class ColumnFamilyDescriptor { * @param columnFamilyName name of column family. * @param columnFamilyOptions options to be used with * column family. - * @deprecated will be removed in RocksDB 3.10.0. Use {} + * @deprecated will be removed in RocksDB 3.10.0. Use + * {@link #ColumnFamilyDescriptor(byte[], ColumnFamilyOptions)} instead. */ @Deprecated public ColumnFamilyDescriptor(final String columnFamilyName, diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 2a536b104..be70670ae 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -93,11 +93,11 @@ jobject jbyteArray byteArray = static_cast(env->CallObjectMethod( jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); + env)); rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); @@ -178,11 +178,11 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( jbyteArray byteArray = static_cast(env->CallObjectMethod( jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); + env)); rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); @@ -1189,11 +1189,11 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( jbyteArray byteArray = static_cast(env->CallObjectMethod( jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env))); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( - env)); + env)); rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); From a9ea65d65275d419a9178f5906a3f98c8a30e267 Mon Sep 17 00:00:00 2001 From: Anders Bakken Date: Mon, 12 Jan 2015 09:59:36 -0800 Subject: [PATCH 677/829] Build with clang 3.5 on Linux. 
--- util/thread_local.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/thread_local.cc b/util/thread_local.cc index 0b6857fff..60e418dff 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -10,7 +10,7 @@ #include "util/thread_local.h" #include "util/mutexlock.h" #include "port/likely.h" - +#include namespace rocksdb { From 402c1152a7b69c2aee7d4aa8f678d65a0f222058 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 12 Jan 2015 18:17:30 -0800 Subject: [PATCH 678/829] Fix c_simple_example --- examples/c_simple_example.c | 76 +++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index 2a467fb4e..1dd380721 100644 --- a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -5,45 +5,47 @@ #include "rocksdb/c.h" -#include // sysconf() - get CPU count +#include // sysconf() - get CPU count const char DBPath[] = "/tmp/rocksdb_simple_example"; int main(int argc, char **argv) { - rocksdb_t *db; - rocksdb_options_t *options = rocksdb_options_create(); - // Optimize RocksDB. This is the easiest way to - // get RocksDB to perform well - long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores - rocksdb_options_increase_parallelism(options, (int)(cpus)); - rocksdb_options_optimize_level_style_compaction(options, 0); - // create the DB if it's not already present - rocksdb_options_set_create_if_missing(options, 1); - - // open DB - char *err = NULL; - db = rocksdb_open(options, DBPath, &err); - assert(!err); - - // Put key-value - rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); - const char key[] = "key"; - const char *value = "value"; - rocksdb_put(db, writeoptions, key, strlen (key), value, \ - strlen (value) + 1, &err); - assert(!err); - // Get value - rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create(); - size_t len; - value = rocksdb_get(db, readoptions, key, strlen (key), &len, &err); - assert(!err); - assert(strcmp(value, "value") == 0); - - // cleanup - rocksdb_writeoptions_destroy(writeoptions); - rocksdb_readoptions_destroy(readoptions); - rocksdb_options_destroy(options); - rocksdb_close(db); - - return 0; + rocksdb_t *db; + rocksdb_options_t *options = rocksdb_options_create(); + // Optimize RocksDB. 
This is the easiest way to + // get RocksDB to perform well + long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + rocksdb_options_increase_parallelism(options, (int)(cpus)); + rocksdb_options_optimize_level_style_compaction(options, 0); + // create the DB if it's not already present + rocksdb_options_set_create_if_missing(options, 1); + + // open DB + char *err = NULL; + db = rocksdb_open(options, DBPath, &err); + assert(!err); + + // Put key-value + rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); + const char key[] = "key"; + const char *value = "value"; + rocksdb_put(db, writeoptions, key, strlen(key), value, strlen(value) + 1, + &err); + assert(!err); + // Get value + rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create(); + size_t len; + char *returned_value = + rocksdb_get(db, readoptions, key, strlen(key), &len, &err); + assert(!err); + assert(strcmp(returned_value, "value") == 0); + free(returned_value); + + // cleanup + rocksdb_writeoptions_destroy(writeoptions); + rocksdb_readoptions_destroy(readoptions); + rocksdb_options_destroy(options); + rocksdb_close(db); + + return 0; } From c91cdd59c1c5499e52dda9acecf08f33bae47e04 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 13 Jan 2015 00:04:08 -0800 Subject: [PATCH 679/829] Allow GetThreadList() to indicate a thread is doing Compaction. Summary: Allow GetThreadList() to indicate a thread is doing Compaction. Test Plan: export ROCKSDB_TESTS=ThreadStatus ./db_test Reviewers: ljin, igor, sdong Reviewed By: sdong Subscribers: leveldb, dhruba, jonahcohen, rven Differential Revision: https://reviews.facebook.net/D30105 --- db/compaction_job.cc | 7 ++ db/db_impl.cc | 11 +++ db/db_impl.h | 1 + db/db_impl_debug.cc | 1 + db/db_test.cc | 146 +++++++++++++++++++++++++++- util/thread_event_info.h | 71 ++++++++++++++ util/thread_status_updater.cc | 37 +++++++ util/thread_status_updater.h | 24 ++++- util/thread_status_updater_debug.cc | 17 +++- util/thread_status_util.cc | 46 ++++++++- util/thread_status_util.h | 17 ++++ util/thread_status_util_debug.cc | 41 ++++++++ 12 files changed, 410 insertions(+), 9 deletions(-) create mode 100644 util/thread_event_info.h create mode 100644 util/thread_status_util_debug.cc diff --git a/db/compaction_job.cc b/db/compaction_job.cc index d836ccd30..7b786c116 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -50,6 +50,7 @@ #include "util/iostats_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" +#include "util/thread_status_util.h" namespace rocksdb { @@ -270,6 +271,11 @@ void CompactionJob::Prepare() { Status CompactionJob::Run() { log_buffer_->FlushBufferToLog(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); +#ifndef NDEBUG + ThreadStatusUtil::TEST_OperationDelay(ThreadStatus::OP_COMPACTION); +#endif const uint64_t start_micros = env_->NowMicros(); std::unique_ptr input( @@ -459,6 +465,7 @@ Status CompactionJob::Run() { RecordCompactionIOStats(); LogFlush(db_options_.info_log); + ThreadStatusUtil::ResetThreadStatus(); return status; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 4720742ae..f5d6d99f0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2145,6 +2145,14 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, c->ReleaseCompactionFiles(status); *madeProgress = true; } else if (!is_manual && c->IsTrivialMove()) { + // Instrument for event 
update + // TODO(yhchiang): add op details for showing trivial-move. + ThreadStatusUtil::SetColumnFamily(c->column_family_data()); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); +#ifndef NDEBUG + ThreadStatusUtil::TEST_OperationDelay(ThreadStatus::OP_COMPACTION); +#endif + // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); @@ -2171,6 +2179,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); c->ReleaseCompactionFiles(status); *madeProgress = true; + + // Clear Instrument + ThreadStatusUtil::ResetThreadStatus(); } else { auto yield_callback = [&]() { return CallFlushDuringCompaction(c->column_family_data(), diff --git a/db/db_impl.h b/db/db_impl.h index de834a0fa..4664a3d60 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -234,6 +234,7 @@ class DBImpl : public DB { uint64_t TEST_max_total_in_memory_state() { return max_total_in_memory_state_; } + #endif // ROCKSDB_LITE // Returns the list of live files in 'live' and the list diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 65eaff6b3..db4c91ae5 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -10,6 +10,7 @@ #ifndef ROCKSDB_LITE #include "db/db_impl.h" +#include "util/thread_status_updater.h" namespace rocksdb { diff --git a/db/db_test.cc b/db/db_test.cc index 9fa2a40b5..7146f9585 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -52,7 +52,7 @@ #include "util/testutil.h" #include "util/mock_env.h" #include "util/string_util.h" -#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" namespace rocksdb { @@ -9414,7 +9414,7 @@ TEST(DBTest, DynamicMemtableOptions) { } #if ROCKSDB_USING_THREAD_STATUS -TEST(DBTest, GetThreadList) { +TEST(DBTest, GetThreadStatus) { Options options; options.env = env_; options.enable_thread_tracking = true; @@ -9472,7 +9472,7 @@ TEST(DBTest, GetThreadList) { handles_, true); } -TEST(DBTest, DisableThreadList) { +TEST(DBTest, DisableThreadStatus) { Options options; options.env = env_; options.enable_thread_tracking = false; @@ -9482,6 +9482,146 @@ TEST(DBTest, DisableThreadList) { env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( handles_, false); } + +TEST(DBTest, ThreadStatusSingleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + const int kNumL0Files = 4; + options.level0_file_num_compaction_trigger = kNumL0Files; + for (int tests = 0; tests < 2; ++tests) { + TryReopen(options); + // Each compaction will run at least 2 seconds, which allows + // the test to capture the status of compaction with fewer + // false alarm. 
+ const int kCompactionDelayMicro = 2000000; + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, kCompactionDelayMicro); + + Random rnd(301); + for (int key = kEntriesPerBuffer * kNumL0Files; key >= 0; --key) { + ASSERT_OK(Put(ToString(key), RandomString(&rnd, kTestValueSize))); + } + + // wait for compaction to be scheduled + env_->SleepForMicroseconds(500000); + + // check how many threads are doing compaction using GetThreadList + std::vector thread_list; + Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(s); + int compaction_count = 0; + for (auto thread : thread_list) { + if (thread.operation_type == ThreadStatus::OP_COMPACTION) { + compaction_count++; + } + } + + if (options.enable_thread_tracking) { + // expecting one single L0 to L1 compaction + ASSERT_EQ(compaction_count, 1); + } else { + // If thread tracking is not enabled, compaction count should be 0. + ASSERT_EQ(compaction_count, 0); + } + + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, 0); + + // repeat the test with disabling thread tracking. + options.enable_thread_tracking = false; + } +} + +TEST(DBTest, ThreadStatusMultipleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 10; + const int kNumL0Files = 4; + + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + + for (int tests = 0; tests < 2; ++tests) { + TryReopen(options); + Random rnd(301); + + int max_compaction_count = 0; + std::vector thread_list; + const int kCompactionDelayMicro = 10000; + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, kCompactionDelayMicro); + + // Make rocksdb busy + int key = 0; + for (int file = 0; file < 64 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } + + // check how many threads are doing compaction using GetThreadList + int compaction_count = 0; + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + if (thread.operation_type == ThreadStatus::OP_COMPACTION) { + compaction_count++; + } + } + + // Record the max number of compactions at a time. + if (max_compaction_count < compaction_count) { + max_compaction_count = compaction_count; + } + } + + if (options.enable_thread_tracking) { + // Expect rocksdb max-out the concurrent compaction jobs. + ASSERT_EQ(max_compaction_count, options.max_background_compactions); + } else { + // If thread tracking is not enabled, compaction count should be 0. + ASSERT_EQ(max_compaction_count, 0); + } + + // repeat the test with disabling thread tracking. 
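
Outside of this test harness, the same information can be polled from an application. A minimal sketch, assuming only the Env::GetThreadList() API and the ThreadStatus::OP_COMPACTION operation type introduced in this patch, plus a DB opened with Options::enable_thread_tracking = true (CountCompactionThreads is a hypothetical helper name):

#include <vector>
#include "rocksdb/env.h"
#include "rocksdb/thread_status.h"

// Returns how many background threads are currently running a compaction,
// or -1 if the thread list could not be obtained.
int CountCompactionThreads() {
  std::vector<rocksdb::ThreadStatus> thread_list;
  rocksdb::Status s = rocksdb::Env::Default()->GetThreadList(&thread_list);
  if (!s.ok()) {
    return -1;
  }
  int compaction_count = 0;
  for (const auto& thread : thread_list) {
    if (thread.operation_type == rocksdb::ThreadStatus::OP_COMPACTION) {
      ++compaction_count;
    }
  }
  return compaction_count;
}

A monitoring loop could call this periodically, much as the test above samples the thread list after each batch of writes.
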
+ options.enable_thread_tracking = false; + } + + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, 0); +} + + + #endif // ROCKSDB_USING_THREAD_STATUS TEST(DBTest, DynamicCompactionOptions) { diff --git a/util/thread_event_info.h b/util/thread_event_info.h new file mode 100644 index 000000000..28916deb4 --- /dev/null +++ b/util/thread_event_info.h @@ -0,0 +1,71 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines the structures for thread event and operation. +// Thread events are used to describe high level action of a +// thread such as doing compaction or flush, while thread operation +// are used to describe lower-level action such as reading / +// writing a file or waiting for a mutex. Events and operations +// are designed to be independent. Typically, a thread usually involves +// in one event and one operation at any specific point in time. + +#pragma once + +#include "include/rocksdb/thread_status.h" + +#include + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS + +// The structure that describes a major thread event. +struct EventInfo { + const ThreadStatus::EventType code; + const std::string name; +}; + +// The global event table. +// +// When updating a status of a thread, the pointer of the EventInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +// +// Note that it's not designed to be constant as in the future we +// might consider adding global count to the EventInfo. +static EventInfo global_event_table[] = { + {ThreadStatus::EVENT_UNKNOWN, ""}, + {ThreadStatus::EVENT_COMPACTION, "Compaction"}, + {ThreadStatus::EVENT_FLUSH, "Flush"} +}; + +// The structure that describes a operation. +struct OperationInfo { + const ThreadStatus::OperationType code; + const std::string name; +}; + +// The global operation table. +// +// When updating a status of a thread, the pointer of the OperationInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +static OperationInfo global_operation_table[] = { + {ThreadStatus::OPERATION_UNKNOWN, ""}, + {ThreadStatus::OPERATION_WRITE_FILE, "Writing SST file"}, + {ThreadStatus::OPERATION_READ_FILE, "Reaing SST file"}, + {ThreadStatus::OPERATION_WAIT_DB_MUTEX, "Waiting DB Mutex"} +}; + +#else + +struct EventInfo { +}; + +struct OperationInfo { +}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_updater.cc b/util/thread_status_updater.cc index feb129885..25f7b1c5c 100644 --- a/util/thread_status_updater.cc +++ b/util/thread_status_updater.cc @@ -29,20 +29,46 @@ void ThreadStatusUpdater::SetThreadType( data->thread_type.store(ttype, std::memory_order_relaxed); } +void ThreadStatusUpdater::ResetThreadStatus() { + ClearThreadState(); + ClearThreadOperation(); + SetColumnFamilyInfoKey(nullptr); +} + void ThreadStatusUpdater::SetColumnFamilyInfoKey( const void* cf_key) { auto* data = InitAndGet(); + // set the tracking flag based on whether cf_key is non-null or not. + // If enable_thread_tracking is set to false, the input cf_key + // would be nullptr. 
+ data->enable_tracking = (cf_key != nullptr); data->cf_key.store(cf_key, std::memory_order_relaxed); } +const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() { + auto* data = InitAndGet(); + if (data->enable_tracking == false) { + return nullptr; + } + return data->cf_key.load(std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetThreadOperation( const ThreadStatus::OperationType type) { auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } data->operation_type.store(type, std::memory_order_relaxed); } void ThreadStatusUpdater::ClearThreadOperation() { auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } data->operation_type.store( ThreadStatus::OP_UNKNOWN, std::memory_order_relaxed); } @@ -50,11 +76,19 @@ void ThreadStatusUpdater::ClearThreadOperation() { void ThreadStatusUpdater::SetThreadState( const ThreadStatus::StateType type) { auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } data->state_type.store(type, std::memory_order_relaxed); } void ThreadStatusUpdater::ClearThreadState() { auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } data->state_type.store( ThreadStatus::STATE_UNKNOWN, std::memory_order_relaxed); } @@ -176,6 +210,9 @@ void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { void ThreadStatusUpdater::UnregisterThread() { } +void ThreadStatusUpdater::ResetThreadStatus() { +} + void ThreadStatusUpdater::SetThreadType( ThreadStatus::ThreadType ttype) { } diff --git a/util/thread_status_updater.h b/util/thread_status_updater.h index c97102a96..5d4e55bb2 100644 --- a/util/thread_status_updater.h +++ b/util/thread_status_updater.h @@ -64,13 +64,24 @@ struct ConstantColumnFamilyInfo { // status of a thread using a set of atomic pointers. struct ThreadStatusData { #if ROCKSDB_USING_THREAD_STATUS - explicit ThreadStatusData() : thread_id(0) { + explicit ThreadStatusData() : thread_id(0), enable_tracking(false) { thread_type.store(ThreadStatus::USER); - cf_key.store(0); + cf_key.store(nullptr); operation_type.store(ThreadStatus::OP_UNKNOWN); state_type.store(ThreadStatus::STATE_UNKNOWN); } + uint64_t thread_id; + + // A flag to indicate whether the thread tracking is enabled + // in the current thread. This value will be updated based on whether + // the associated Options::enable_thread_tracking is set to true + // in ThreadStatusUtil::SetColumnFamily(). + // + // If set to false, then SetThreadOperation and SetThreadState + // will be no-op. + bool enable_tracking; + std::atomic thread_type; std::atomic cf_key; std::atomic operation_type; @@ -96,6 +107,10 @@ class ThreadStatusUpdater { // Unregister the current thread. void UnregisterThread(); + // Reset the status of the current thread. This includes resetting + // ColumnFamilyInfoKey, ThreadOperation, and ThreadState. + void ResetThreadStatus(); + // Set the thread type of the current thread. void SetThreadType(ThreadStatus::ThreadType ttype); @@ -103,6 +118,9 @@ class ThreadStatusUpdater { // its thread-local pointer of ThreadStateInfo to the correct entry. void SetColumnFamilyInfoKey(const void* cf_key); + // returns the column family info key. + const void* GetColumnFamilyInfoKey(); + // Update the thread operation of the current thread. 
void SetThreadOperation(const ThreadStatus::OperationType type); @@ -143,7 +161,6 @@ class ThreadStatusUpdater { bool check_exist); protected: - #if ROCKSDB_USING_THREAD_STATUS // The thread-local variable for storing thread status. static __thread ThreadStatusData* thread_status_data_; @@ -169,6 +186,7 @@ class ThreadStatusUpdater { // associated to the same db_key faster. std::unordered_map< const void*, std::unordered_set> db_key_map_; + #else static ThreadStatusData* thread_status_data_; #endif // ROCKSDB_USING_THREAD_STATUS diff --git a/util/thread_status_updater_debug.cc b/util/thread_status_updater_debug.cc index 1f53e5fc1..274f427d3 100644 --- a/util/thread_status_updater_debug.cc +++ b/util/thread_status_updater_debug.cc @@ -7,9 +7,11 @@ #include "util/thread_status_updater.h" #include "db/column_family.h" -#if ROCKSDB_USING_THREAD_STATUS namespace rocksdb { + +#ifndef NDEBUG +#if ROCKSDB_USING_THREAD_STATUS void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( const std::vector& handles, bool check_exist) { @@ -29,5 +31,16 @@ void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( } } } -} // namespace rocksdb + +#else + +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles, + bool check_exist) { +} + #endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NDEBUG + + +} // namespace rocksdb diff --git a/util/thread_status_util.cc b/util/thread_status_util.cc index c8767d9a8..970f79ae8 100644 --- a/util/thread_status_util.cc +++ b/util/thread_status_util.cc @@ -9,6 +9,7 @@ namespace rocksdb { + #if ROCKSDB_USING_THREAD_STATUS __thread ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; @@ -36,7 +37,41 @@ void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { return; } assert(thread_updater_local_cache_); - thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); + if (cfd != nullptr && cfd->options()->enable_thread_tracking) { + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); + } else { + // When cfd == nullptr or enable_thread_tracking == false, we set + // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation + // and SetThreadState become no-op. + thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr); + } +} + +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. + return; + } + + thread_updater_local_cache_->SetThreadOperation(op); +} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. 
+ return; + } + + thread_updater_local_cache_->SetThreadState(state); +} + +void ThreadStatusUtil::ResetThreadStatus() { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->ResetThreadStatus(); } void ThreadStatusUtil::NewColumnFamilyInfo( @@ -86,6 +121,12 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { } +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { +} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { +} + void ThreadStatusUtil::NewColumnFamilyInfo( const DB* db, const ColumnFamilyData* cfd) { } @@ -97,6 +138,9 @@ void ThreadStatusUtil::EraseColumnFamilyInfo( void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { } +void ThreadStatusUtil::ResetThreadStatus() { +} + #endif // ROCKSDB_USING_THREAD_STATUS } // namespace rocksdb diff --git a/util/thread_status_util.h b/util/thread_status_util.h index c583d5a5d..a8549e8ae 100644 --- a/util/thread_status_util.h +++ b/util/thread_status_util.h @@ -12,6 +12,7 @@ namespace rocksdb { + // The static utility class for updating thread-local status. // // The thread-local status is updated via the thread-local cached @@ -52,6 +53,22 @@ class ThreadStatusUtil { // something related to the specified column family. static void SetColumnFamily(const ColumnFamilyData* cfd); + static void SetThreadOperation(ThreadStatus::OperationType type); + + static void SetThreadState(ThreadStatus::StateType type); + + static void ResetThreadStatus(); + +#ifndef NDEBUG + static void TEST_SetOperationDelay( + const ThreadStatus::OperationType operation, int micro); + static void TEST_OperationDelay( + const ThreadStatus::OperationType operation); + static void TEST_SetStateDelay( + const ThreadStatus::StateType state, int micro); + static void TEST_StateDelay(const ThreadStatus::StateType state); +#endif + protected: // Initialize the thread-local ThreadStatusUpdater when it finds // the cached value is nullptr. Returns true if it has cached diff --git a/util/thread_status_util_debug.cc b/util/thread_status_util_debug.cc new file mode 100644 index 000000000..5378acaf8 --- /dev/null +++ b/util/thread_status_util_debug.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "rocksdb/env.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +namespace rocksdb { + +#ifndef NDEBUG +// the delay for debugging purpose. 
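
Stepping back, the updater and util pieces in this patch are meant to be used in the same pattern CompactionJob::Run() follows above: tag the thread with a column family, record the operation, do the work, then clear the status. A hedged sketch of that call sequence using the internal headers added here (the ColumnFamilyData pointer and the RunInstrumentedCompaction name are placeholders):

#include "db/column_family.h"
#include "util/thread_status_util.h"

// Associate the current thread with a column family, mark it as performing
// a compaction, run the work, and reset the per-thread status afterwards.
void RunInstrumentedCompaction(const rocksdb::ColumnFamilyData* cfd) {
  rocksdb::ThreadStatusUtil::SetColumnFamily(cfd);
  rocksdb::ThreadStatusUtil::SetThreadOperation(
      rocksdb::ThreadStatus::OP_COMPACTION);
  // ... the actual compaction or flush work would go here ...
  rocksdb::ThreadStatusUtil::ResetThreadStatus();
}
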
+static int operations_delay[ThreadStatus::NUM_OP_TYPES] ={0}; +static int states_delay[ThreadStatus::NUM_STATE_TYPES] = {0}; + +void ThreadStatusUtil::TEST_SetStateDelay( + const ThreadStatus::StateType state, int micro) { + states_delay[state] = micro; +} + +void ThreadStatusUtil::TEST_StateDelay( + const ThreadStatus::StateType state) { + Env::Default()->SleepForMicroseconds( + states_delay[state]); +} + +void ThreadStatusUtil::TEST_SetOperationDelay( + const ThreadStatus::OperationType operation, int micro) { + operations_delay[operation] = micro; +} + + +void ThreadStatusUtil::TEST_OperationDelay( + const ThreadStatus::OperationType operation) { + Env::Default()->SleepForMicroseconds( + operations_delay[operation]); +} +#endif // !NDEBUG + +} // namespace rocksdb From bf9aa4dfcd27e7dc4f55d63d0adf5abaf86b06bd Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 13 Jan 2015 00:38:09 -0800 Subject: [PATCH 680/829] Improve GetThreadStatus to avoid false alarm in some case. --- db/db_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 7146f9585..1bddde8b4 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9444,7 +9444,8 @@ TEST(DBTest, GetThreadStatus) { } // Verify the total number of threades ASSERT_EQ( - thread_list.size(), + thread_type_counts[ThreadStatus::HIGH_PRIORITY] + + thread_type_counts[ThreadStatus::LOW_PRIORITY], kHighPriCounts[test] + kLowPriCounts[test]); // Verify the number of high-priority threads ASSERT_EQ( From d2c018fd5bc45152e3d69ab29edf00dc10448d23 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 13 Jan 2015 01:00:32 -0800 Subject: [PATCH 681/829] Make ThreadStatusMultipleCompaction more robust. --- db/db_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 1bddde8b4..2b2bb251d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9579,7 +9579,7 @@ TEST(DBTest, ThreadStatusMultipleCompaction) { int max_compaction_count = 0; std::vector thread_list; - const int kCompactionDelayMicro = 10000; + const int kCompactionDelayMicro = 20000; ThreadStatusUtil::TEST_SetOperationDelay( ThreadStatus::OP_COMPACTION, kCompactionDelayMicro); @@ -9606,8 +9606,9 @@ TEST(DBTest, ThreadStatusMultipleCompaction) { } if (options.enable_thread_tracking) { - // Expect rocksdb max-out the concurrent compaction jobs. - ASSERT_EQ(max_compaction_count, options.max_background_compactions); + // Expect rocksdb to at least utilize 80% of the compaction threads. + ASSERT_GE(1.0 * max_compaction_count, + 0.8 * options.max_background_compactions); } else { // If thread tracking is not enabled, compaction count should be 0. 
ASSERT_EQ(max_compaction_count, 0); From 2159484dd6a3af5de3fa05e358aacc4f8f378e6a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 13 Jan 2015 01:38:06 -0800 Subject: [PATCH 682/829] Remove two unnecessary blank lines in db/db_test.cc --- db/db_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 2b2bb251d..566fdd077 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9622,8 +9622,6 @@ TEST(DBTest, ThreadStatusMultipleCompaction) { ThreadStatus::OP_COMPACTION, 0); } - - #endif // ROCKSDB_USING_THREAD_STATUS TEST(DBTest, DynamicCompactionOptions) { From 53f615df6acfd716945a040cd4a9e50d9c6c8783 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 13 Jan 2015 12:26:57 -0800 Subject: [PATCH 683/829] Fix clang build --- db/memtablerep_bench.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/memtablerep_bench.cc b/db/memtablerep_bench.cc index a24eca010..5bdfa836d 100644 --- a/db/memtablerep_bench.cc +++ b/db/memtablerep_bench.cc @@ -184,8 +184,9 @@ class KeyGenerator { for (uint64_t i = 0; i < num_; ++i) { values_[i] = i; } - std::shuffle(values_.begin(), values_.end(), - std::default_random_engine(FLAGS_seed)); + std::shuffle( + values_.begin(), values_.end(), + std::default_random_engine(static_cast(FLAGS_seed))); } } From 96b8240bc5794eecb94092435b80003013e06396 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 13 Jan 2015 14:33:04 -0800 Subject: [PATCH 684/829] Support footer versions bigger than 1 Summary: In this diff I add another parameter to BlockBasedTableOptions that will let users specify block based table's format. This will greatly simplify block based table's format changes in the future. First format change that this will support is encoding decompressed size in Zlib and BZip2 blocks. This diff is blocking https://reviews.facebook.net/D31311. Test Plan: Added a unit tests. More tests to come as part of https://reviews.facebook.net/D31311. Reviewers: dhruba, MarkCallaghan, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31383 --- include/rocksdb/table.h | 13 +++++ table/block_based_table_builder.cc | 25 ++++++-- table/block_based_table_factory.cc | 6 ++ table/block_based_table_reader.cc | 10 +++- table/cuckoo_table_builder.cc | 2 +- table/format.cc | 93 +++++++++++++++--------------- table/format.h | 47 +++++++-------- table/meta_blocks.cc | 12 ++-- table/plain_table_builder.cc | 2 +- table/table_test.cc | 31 ++++++++-- 10 files changed, 148 insertions(+), 93 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index da525d4a2..d4e0e156f 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -125,6 +125,19 @@ struct BlockBasedTableOptions { // If true, place whole keys in the filter (not just prefixes). // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + + // For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md + // We currently have two versions: + // 0 -- This version is currently written out by all RocksDB's versions by + // default. Can be read by really old RocksDB's. Doesn't support changing + // checksum (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // This only affects newly written tables. 
When reading exising tables, the + // information about version is read from the footer. + uint32_t format_version = 0; }; // Table Properties that are specific to block-based table properties. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index cdae8508b..f04906ff8 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -472,9 +472,20 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) - : rep_(new Rep(ioptions, table_options, internal_comparator, - file, compression_type, compression_opts)) { + const CompressionOptions& compression_opts) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + Log(InfoLogLevel::WARN_LEVEL, ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator, file, + compression_type, compression_opts); if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -771,9 +782,13 @@ Status BlockBasedTableBuilder::Finish() { // TODO(icanadi) at some point in the future, when we're absolutely sure // nobody will roll back to RocksDB 2.x versions, retire the legacy magic // number and always write new table files with new magic number - bool legacy = (r->table_options.checksum == kCRC32c); + bool legacy = (r->table_options.format_version == 0); + // this is guaranteed by BlockBasedTableBuilder's constructor + assert(r->table_options.checksum == kCRC32c || + r->table_options.format_version != 0); Footer footer(legacy ? 
kLegacyBlockBasedTableMagicNumber - : kBlockBasedTableMagicNumber); + : kBlockBasedTableMagicNumber, + r->table_options.format_version); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); footer.set_checksum(r->table_options.checksum); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 9708e1954..063bc2587 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -76,6 +76,10 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } + if (table_options_.format_version > 1) { + return Status::InvalidArgument( + "We currently only support versions 0 and 1"); + } return Status::OK(); } @@ -135,6 +139,8 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); ret.append(buffer); return ret; } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1e4da1e1f..727f9c43a 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -436,11 +436,17 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, unique_ptr* table_reader) { table_reader->reset(); - Footer footer(kBlockBasedTableMagicNumber); - auto s = ReadFooterFromFile(file.get(), file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file.get(), file_size, &footer, + kBlockBasedTableMagicNumber); if (!s.ok()) { return s; } + if (footer.version() > 1) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with too new " + "version of RocksDB?"); + } // We've successfully read the footer and the index block: we're // ready to serve requests. 
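
From the user's perspective, the new knob is set through BlockBasedTableOptions. A minimal sketch, assuming only the format_version and checksum fields discussed in this patch and the existing NewBlockBasedTableFactory() helper (MakeOptionsWithNewFooterFormat is a hypothetical name):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Opt newly written SST files into footer format version 1. With a
// non-default checksum such as xxHash, version 0 is silently upgraded to 1
// anyway, as the builder change above shows.
rocksdb::Options MakeOptionsWithNewFooterFormat() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.format_version = 1;
  table_options.checksum = rocksdb::kxxHash;
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}

Files written with these options carry the version-1 footer; files written with the defaults keep the legacy footer and stay readable by older releases, as the upconvert path in table/format.cc below shows.
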
diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 8a57f1c6b..1aa1e0707 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -377,7 +377,7 @@ Status CuckooTableBuilder::Finish() { return s; } - Footer footer(kCuckooTableMagicNumber); + Footer footer(kCuckooTableMagicNumber, 1); footer.set_metaindex_handle(meta_index_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; diff --git a/table/format.cc b/table/format.cc index c7f96f427..2ea4b9171 100644 --- a/table/format.cc +++ b/table/format.cc @@ -72,6 +72,23 @@ std::string BlockHandle::ToString(bool hex) const { const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return 0; +} +} // namespace + // legacy footer format: // metaindex handle (varint64 offset, varint64 size) // index handle (varint64 offset, varint64 size) @@ -85,7 +102,8 @@ const BlockHandle BlockHandle::kNullBlockHandle(0, 0); // footer version (4 bytes) // table_magic_number (8 bytes) void Footer::EncodeTo(std::string* dst) const { - if (version() == kLegacyFooter) { + assert(HasInitializedTableMagicNumber()); + if (IsLegacyFooterFormat(table_magic_number())) { // has to be default checksum with legacy footer assert(checksum_ == kCRC32c); const size_t original_size = dst->size(); @@ -100,39 +118,24 @@ void Footer::EncodeTo(std::string* dst) const { dst->push_back(static_cast(checksum_)); metaindex_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst); - dst->resize(original_size + kVersion1EncodedLength - 12); // Padding - PutFixed32(dst, kFooterVersion); + dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding + PutFixed32(dst, version()); PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kVersion1EncodedLength); + assert(dst->size() == original_size + kNewVersionsEncodedLength); } } -namespace { -inline bool IsLegacyFooterFormat(uint64_t magic_number) { - return magic_number == kLegacyBlockBasedTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber; -} - -inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { - if (magic_number == kLegacyBlockBasedTableMagicNumber) { - return kBlockBasedTableMagicNumber; - } - if (magic_number == kLegacyPlainTableMagicNumber) { - return kPlainTableMagicNumber; - } - assert(false); - return 0; -} -} // namespace - -Footer::Footer(uint64_t _table_magic_number) - : version_(IsLegacyFooterFormat(_table_magic_number) ? 
kLegacyFooter - : kFooterVersion), +Footer::Footer(uint64_t _table_magic_number, uint32_t _version) + : version_(_version), checksum_(kCRC32c), - table_magic_number_(_table_magic_number) {} + table_magic_number_(_table_magic_number) { + // This should be guaranteed by constructor callers + assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); +} Status Footer::DecodeFrom(Slice* input) { + assert(!HasInitializedTableMagicNumber()); assert(input != nullptr); assert(input->size() >= kMinEncodedLength); @@ -148,36 +151,23 @@ Status Footer::DecodeFrom(Slice* input) { if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } - if (HasInitializedTableMagicNumber()) { - if (magic != table_magic_number()) { - char buffer[80]; - snprintf(buffer, sizeof(buffer) - 1, - "not an sstable (bad magic number --- %lx)", - (long)magic); - return Status::Corruption(buffer); - } - } else { - set_table_magic_number(magic); - } + set_table_magic_number(magic); if (legacy) { // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function input->remove_prefix(input->size() - kVersion0EncodedLength); - version_ = kLegacyFooter; + version_ = 0 /* legacy */; checksum_ = kCRC32c; } else { version_ = DecodeFixed32(magic_ptr - 4); - if (version_ != kFooterVersion) { - return Status::Corruption("bad footer version"); - } - // Footer version 1 will always occupy exactly this many bytes. + // Footer version 1 and higher will always occupy exactly this many bytes. // It consists of the checksum type, two block handles, padding, // a version number, and a magic number - if (input->size() < kVersion1EncodedLength) { + if (input->size() < kNewVersionsEncodedLength) { return Status::Corruption("input is too short to be an sstable"); } else { - input->remove_prefix(input->size() - kVersion1EncodedLength); + input->remove_prefix(input->size() - kNewVersionsEncodedLength); } uint32_t chksum; if (!GetVarint32(input, &chksum)) { @@ -219,9 +209,8 @@ std::string Footer::ToString() const { return result; } -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer) { +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { return Status::Corruption("file is too short to be an sstable"); } @@ -242,7 +231,15 @@ Status ReadFooterFromFile(RandomAccessFile* file, return Status::Corruption("file is too short to be an sstable"); } - return footer->DecodeFrom(&footer_input); + s = footer->DecodeFrom(&footer_input); + if (!s.ok()) { + return s; + } + if (enforce_table_magic_number != 0 && + enforce_table_magic_number != footer->table_magic_number()) { + return Status::Corruption("Bad table magic number"); + } + return Status::OK(); } // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/format.h b/table/format.h index e8586c986..d8bc43735 100644 --- a/table/format.h +++ b/table/format.h @@ -72,12 +72,13 @@ class Footer { // Constructs a footer without specifying its table magic number. // In such case, the table magic number of such footer should be // initialized via @ReadFooterFromFile(). - Footer() : Footer(kInvalidTableMagicNumber) {} + // Use this when you plan to load Footer with DecodeFrom(). Never use this + // when you plan to EncodeTo. + Footer() : Footer(kInvalidTableMagicNumber, 0) {} - // @table_magic_number serves two purposes: - // 1. Identify different types of the tables. - // 2. 
Help us to identify if a given file is a valid sst. - explicit Footer(uint64_t table_magic_number); + // Use this constructor when you plan to write out the footer using + // EncodeTo(). Never use this constructor with DecodeFrom(). + Footer(uint64_t table_magic_number, uint32_t version); // The version of the footer in this file uint32_t version() const { return version_; } @@ -97,20 +98,13 @@ class Footer { uint64_t table_magic_number() const { return table_magic_number_; } - // The version of Footer we encode - enum { - kLegacyFooter = 0, - kFooterVersion = 1, - }; - void EncodeTo(std::string* dst) const; - // Set the current footer based on the input slice. If table_magic_number_ - // is not set (i.e., HasInitializedTableMagicNumber() is true), then this - // function will also initialize table_magic_number_. Otherwise, this - // function will verify whether the magic number specified in the input - // slice matches table_magic_number_ and update the current footer only - // when the test passes. + // Set the current footer based on the input slice. + // + // REQUIRES: table_magic_number_ is not set (i.e., + // HasInitializedTableMagicNumber() is true). The function will initialize the + // magic number Status DecodeFrom(Slice* input); // Encoded length of a Footer. Note that the serialization of a Footer will @@ -121,13 +115,12 @@ class Footer { // Footer version 0 (legacy) will always occupy exactly this many bytes. // It consists of two block handles, padding, and a magic number. kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, - // Footer version 1 will always occupy exactly this many bytes. - // It consists of the checksum type, two block handles, padding, - // a version number, and a magic number - kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, - + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. 
It consists of the checksum type, two block handles, padding, + // a version number (bigger than 1), and a magic number + kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, kMinEncodedLength = kVersion0EncodedLength, - kMaxEncodedLength = kVersion1EncodedLength + kMaxEncodedLength = kNewVersionsEncodedLength, }; static const uint64_t kInvalidTableMagicNumber = 0; @@ -156,9 +149,11 @@ class Footer { }; // Read the footer from file -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer); +// If enforce_table_magic_number != 0, ReadFooterFromFile() will return +// corruption if table_magic number is not equal to enforce_table_magic_number +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, + uint64_t enforce_table_magic_number = 0); // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 25a785787..6f83f42d4 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -220,8 +220,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties) { // -- Read metaindex block - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -274,8 +274,8 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle) { - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -302,8 +302,8 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, const std::string& meta_block_name, BlockContents* contents) { Status status; - Footer footer(table_magic_number); - status = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + status = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!status.ok()) { return status; } diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 74a71cb35..0f89dd1f5 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -258,7 +258,7 @@ Status PlainTableBuilder::Finish() { // Write Footer // no need to write out new footer if we're using default checksum - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; diff --git a/table/table_test.cc b/table/table_test.cc index 8810a2254..4289059f9 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1943,7 +1943,7 @@ TEST(Harness, FooterTests) { { // upconvert legacy block based std::string encoded; - Footer footer(kLegacyBlockBasedTableMagicNumber); + Footer footer(kLegacyBlockBasedTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1957,11 +1957,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + 
ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kBlockBasedTableMagicNumber); + Footer footer(kBlockBasedTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1976,11 +1977,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); } { // upconvert legacy plain table std::string encoded; - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1994,11 +1996,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kPlainTableMagicNumber); + Footer footer(kPlainTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -2013,6 +2016,26 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } + { + // version == 2 + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 2); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 2U); } } From bb128bfec3e620597a4827bfb727de6d083c42ff Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 13 Jan 2015 16:30:31 -0800 Subject: [PATCH 685/829] More accurate message for compaction applied to a different version Test Plan: Compile. Run it. 
Reviewers: yhchiang, dhruba, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D31479 --- db/version_set.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index b206fe5b3..c5956a534 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2705,8 +2705,9 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { Version* version = c->column_family_data()->current(); const VersionStorageInfo* vstorage = version->storage_info(); if (c->input_version() != version) { - Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, - "[%s] VerifyCompactionFileConsistency version mismatch", + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "[%s] compaction output being applied to a different base version from" + " input version", c->column_family_data()->GetName().c_str()); } From 2a7bd0ea45f18cb82ae784ce1a1a618235787b3f Mon Sep 17 00:00:00 2001 From: Xiangyong Ouyang Date: Wed, 14 Jan 2015 01:20:30 +0000 Subject: [PATCH 686/829] Remove duplicated method declarations in C header. --- include/rocksdb/c.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 235b58530..c686c90c7 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -544,9 +544,6 @@ extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, i extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t); -extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); -extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); - extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level); extern void rocksdb_options_set_memtable_prefix_bloom_bits( From 516a04267ed77edc9b902b6aaf3407c940a81f2d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 14 Jan 2015 09:45:29 -0800 Subject: [PATCH 687/829] Add LZ4 compression to sanity test Summary: This will be used to test format changes in https://reviews.facebook.net/D31461 Test Plan: run it Reviewers: MarkCallaghan, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31515 --- tools/db_sanity_test.cc | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 8219feb37..df3cae11d 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -133,6 +133,32 @@ class SanityTestZlibCompression : public SanityTest { Options options_; }; +class SanityTestLZ4Compression : public SanityTest { + public: + explicit SanityTestLZ4Compression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4Compression; + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "LZ4Compression"; } + + private: + Options options_; +}; + +class SanityTestLZ4HCCompression : public SanityTest { + public: + explicit SanityTestLZ4HCCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4HCCompression; + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "LZ4HCCompression"; } + + private: + Options options_; +}; + #ifndef ROCKSDB_LITE class SanityTestPlainTableFactory : public SanityTest { public: @@ -171,6 +197,8 @@ 
bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), + new SanityTestLZ4Compression(path), + new SanityTestLZ4HCCompression(path), #ifndef ROCKSDB_LITE new SanityTestPlainTableFactory(path), #endif // ROCKSDB_LITE From 45e43b81df353284a48e1eb6db3611480dd133aa Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sun, 28 Dec 2014 16:37:57 +0000 Subject: [PATCH 688/829] Adds support for db->DefaultColumnFamily() to the Java API --- java/org/rocksdb/RocksDB.java | 10 ++++++ java/org/rocksdb/test/ColumnFamilyTest.java | 34 +++++++++++++++++++++ java/rocksjni/rocksjni.cc | 12 ++++++++ 3 files changed, 56 insertions(+) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 22a608207..6c6e72260 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1180,6 +1180,15 @@ public class RocksDB extends RocksObject { return iterators; } + /** + * Gets the handle for the default column family + * + * @return The handle of the default column family + */ + public ColumnFamilyHandle getDefaultColumnFamily() { + return new ColumnFamilyHandle(this, getDefaultColumnFamily(nativeHandle_)); + } + /** * Creates a new column family with the name columnFamilyName and * allocates a ColumnFamilyHandle within an internal structure. @@ -1620,6 +1629,7 @@ public class RocksDB extends RocksObject { protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); private native void disposeInternal(long handle); + private native long getDefaultColumnFamily(long handle); private native long createColumnFamily(long handle, ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException; private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index 703ed296f..fb95e8010 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -56,6 +56,40 @@ public class ColumnFamilyTest { } } + @Test + public void defaultColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + ColumnFamilyHandle cfh = db.getDefaultColumnFamily(); + assertThat(cfh).isNotNull(); + + final byte[] key = "key".getBytes(); + final byte[] value = "value".getBytes(); + + db.put(cfh, key, value); + + final byte[] actualValue = db.get(cfh, key); + + assertThat(cfh).isNotNull(); + assertThat(actualValue).isEqualTo(value); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + @Test public void createColumnFamily() throws RocksDBException { RocksDB db = null; diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index be70670ae..fdd1b009f 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1174,6 +1174,18 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( return env->NewLongArray(0); } +/* + * Class: org_rocksdb_RocksDB + * Method: getDefaultColumnFamily + * Signature: (J)J + */ +jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily( + JNIEnv* env, jobject jobj, jlong jdb_handle) { + auto* db_handle = 
reinterpret_cast(jdb_handle); + auto* cf_handle = db_handle->DefaultColumnFamily(); + return reinterpret_cast(cf_handle); +} + /* * Class: org_rocksdb_RocksDB * Method: createColumnFamily From a8cfa7ace8d80dd156d11c64592faacbe3c01a34 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 16 Dec 2014 15:15:40 +0000 Subject: [PATCH 689/829] Extract the interface for a WriteBatch --- java/org/rocksdb/WriteBatch.java | 76 +++--------------- java/org/rocksdb/WriteBatchInterface.java | 98 +++++++++++++++++++++++ 2 files changed, 108 insertions(+), 66 deletions(-) create mode 100644 java/org/rocksdb/WriteBatchInterface.java diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 3407033ab..64f472c89 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -22,7 +22,7 @@ package org.rocksdb; * non-const method, all threads accessing the same WriteBatch must use * external synchronization. */ -public class WriteBatch extends RocksObject { +public class WriteBatch extends RocksObject implements WriteBatchInterface { /** * Constructs a WriteBatch instance. */ @@ -41,98 +41,44 @@ public class WriteBatch extends RocksObject { newWriteBatch(reserved_bytes); } - /** - * Returns the number of updates in the batch. - * - * @return number of items in WriteBatch - */ + @Override public native int count(); - /** - *
Store the mapping "key->value" in the database.
                    - * - * @param key the specified key to be inserted. - * @param value the value associated with the specified key. - */ + @Override public void put(byte[] key, byte[] value) { put(key, key.length, value, value.length); } - /** - *
Store the mapping "key->value" within given column - * family.
                    - * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param key the specified key to be inserted. - * @param value the value associated with the specified key. - */ + @Override public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { put(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } - /** - *

                    Merge "value" with the existing value of "key" in the database. - * "key->merge(existing, value)"

                    - * - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. - */ + @Override public void merge(byte[] key, byte[] value) { merge(key, key.length, value, value.length); } - /** - *

                    Merge "value" with the existing value of "key" in given column family. - * "key->merge(existing, value)"

                    - * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. - */ + @Override public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { merge(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } - /** - *

                    If the database contains a mapping for "key", erase it. Else do nothing.

                    - * - * @param key Key to delete within database - */ + @Override public void remove(byte[] key) { remove(key, key.length); } - /** - *

                    If column family contains a mapping for "key", erase it. Else do nothing.

                    - * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key Key to delete within database - */ + @Override public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { remove(key, key.length, columnFamilyHandle.nativeHandle_); } - /** - * Append a blob of arbitrary size to the records in this batch. The blob will - * be stored in the transaction log but not in any other file. In particular, - * it will not be persisted to the SST files. When iterating over this - * WriteBatch, WriteBatch::Handler::LogData will be called with the contents - * of the blob as it is encountered. Blobs, puts, deletes, and merges will be - * encountered in the same order in thich they were inserted. The blob will - * NOT consume sequence number(s) and will NOT increase the count of the batch - * - * Example application: add timestamps to the transaction log for use in - * replication. - * - * @param blob binary object to be inserted - */ + @Override public void putLogData(byte[] blob) { putLogData(blob, blob.length); } @@ -149,9 +95,7 @@ public class WriteBatch extends RocksObject { iterate(handler.nativeHandle_); } - /** - * Clear all updates buffered in this batch - */ + @Override public native void clear(); /** diff --git a/java/org/rocksdb/WriteBatchInterface.java b/java/org/rocksdb/WriteBatchInterface.java new file mode 100644 index 000000000..4eaf1ad9d --- /dev/null +++ b/java/org/rocksdb/WriteBatchInterface.java @@ -0,0 +1,98 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

                    Defines the interface for a Write Batch which + * holds a collection of updates to apply atomically to a DB.

                    + */ +public interface WriteBatchInterface { + + /** + * Returns the number of updates in the batch. + * + * @return number of items in WriteBatch + */ + public int count(); + + /** + *

                    Store the mapping "key->value" in the database.

                    + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + */ + public void put(byte[] key, byte[] value); + + /** + *

                    Store the mapping "key->value" within given column + * family.

                    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + */ + public void put(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value); + + /** + *

                    Merge "value" with the existing value of "key" in the database. + * "key->merge(existing, value)"

                    + * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + */ + public void merge(byte[] key, byte[] value); + + /** + *

                    Merge "value" with the existing value of "key" in given column family. + * "key->merge(existing, value)"

                    + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + */ + public void merge(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value); + + /** + *

                    If the database contains a mapping for "key", erase it. Else do nothing.

                    + * + * @param key Key to delete within database + */ + public void remove(byte[] key); + + /** + *

                    If column family contains a mapping for "key", erase it. Else do nothing.

                    + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key); + + /** + * Append a blob of arbitrary size to the records in this batch. The blob will + * be stored in the transaction log but not in any other file. In particular, + * it will not be persisted to the SST files. When iterating over this + * WriteBatch, WriteBatch::Handler::LogData will be called with the contents + * of the blob as it is encountered. Blobs, puts, deletes, and merges will be + * encountered in the same order in thich they were inserted. The blob will + * NOT consume sequence number(s) and will NOT increase the count of the batch + * + * Example application: add timestamps to the transaction log for use in + * replication. + * + * @param blob binary object to be inserted + */ + public void putLogData(byte[] blob); + + /** + * Clear all updates buffered in this batch + */ + public void clear(); +} From 2241e3f4d525e411e26bd1ebcd5bdce3add9c24b Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Tue, 16 Dec 2014 15:34:46 +0000 Subject: [PATCH 690/829] Extract the interface for a RocksIterator --- java/org/rocksdb/RocksIterator.java | 77 +++++-------------- java/org/rocksdb/RocksIteratorInterface.java | 80 ++++++++++++++++++++ 2 files changed, 101 insertions(+), 56 deletions(-) create mode 100644 java/org/rocksdb/RocksIteratorInterface.java diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index 1abe7e704..cecf9c309 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -6,9 +6,9 @@ package org.rocksdb; /** - *

                    An iterator yields a sequence of key/value pairs from a source. - * The following class defines the interface. Multiple implementations - * are provided by this library. In particular, iterators are provided + *

                    An iterator that yields a sequence of key/value pairs from a source. + * Multiple implementations are provided by this library. + * In particular, iterators are provided * to access the contents of a Table or a DB.

                    * *

                    Multiple threads can invoke const methods on an RocksIterator without @@ -18,7 +18,7 @@ package org.rocksdb; * * @see org.rocksdb.RocksObject */ -public class RocksIterator extends RocksObject { +public class RocksIterator extends RocksObject implements RocksIteratorInterface { public RocksIterator(RocksDB rocksDB, long nativeHandle) { super(); nativeHandle_ = nativeHandle; @@ -30,57 +30,48 @@ public class RocksIterator extends RocksObject { rocksDB_ = rocksDB; } - /** - * An iterator is either positioned at a key/value pair, or - * not valid. This method returns true iff the iterator is valid. - * - * @return true if iterator is valid. - */ + @Override public boolean isValid() { assert(isInitialized()); return isValid0(nativeHandle_); } - /** - * Position at the first key in the source. The iterator is Valid() - * after this call iff the source is not empty. - */ + @Override public void seekToFirst() { assert(isInitialized()); seekToFirst0(nativeHandle_); } - /** - * Position at the last key in the source. The iterator is - * valid after this call iff the source is not empty. - */ + @Override public void seekToLast() { assert(isInitialized()); seekToLast0(nativeHandle_); } - /** - *

                    Moves to the next entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the last entry in the source.

                    - * - *

                    REQUIRES: {@link #isValid()}

                    - */ + @Override + public void seek(byte[] target) { + assert(isInitialized()); + seek0(nativeHandle_, target, target.length); + } + + @Override public void next() { assert(isInitialized()); next0(nativeHandle_); } - /** - *

                    Moves to the previous entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the first entry in source.

                    - * - *

                    REQUIRES: {@link #isValid()}

                    - */ + @Override public void prev() { assert(isInitialized()); prev0(nativeHandle_); } + @Override + public void status() throws RocksDBException { + assert(isInitialized()); + status0(nativeHandle_); + } + /** *

                    Return the key for the current entry. The underlying storage for * the returned slice is valid only until the next modification of @@ -108,32 +99,6 @@ public class RocksIterator extends RocksObject { return value0(nativeHandle_); } - /** - *

                    Position at the first key in the source that is at or past target. - * The iterator is valid after this call iff the source contains - * an entry that comes at or past target.

                    - * - * @param target byte array describing a key or a - * key prefix to seek for. - */ - public void seek(byte[] target) { - assert(isInitialized()); - seek0(nativeHandle_, target, target.length); - } - - /** - * If an error has occurred, return it. Else return an ok status. - * If non-blocking IO is requested and this operation cannot be - * satisfied without doing some IO, then this returns Status::Incomplete(). - * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - public void status() throws RocksDBException { - assert(isInitialized()); - status0(nativeHandle_); - } - /** *

                    Deletes underlying C++ iterator pointer.

                    * diff --git a/java/org/rocksdb/RocksIteratorInterface.java b/java/org/rocksdb/RocksIteratorInterface.java new file mode 100644 index 000000000..15f3a9aa9 --- /dev/null +++ b/java/org/rocksdb/RocksIteratorInterface.java @@ -0,0 +1,80 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

                    Defines the interface for an Iterator which provides + * access to data one entry at a time. Multiple implementations + * are provided by this library. In particular, iterators are provided + * to access the contents of a DB and Write Batch.

                    + *

                    + *

                    Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.

                    + * + * @see org.rocksdb.RocksObject + */ +public interface RocksIteratorInterface { + + /** + *

                    An iterator is either positioned at an entry, or + * not valid. This method returns true if the iterator is valid.

                    + * + * @return true if iterator is valid. + */ + public boolean isValid(); + + /** + *

                    Position at the first entry in the source. The iterator is Valid() + * after this call if the source is not empty.

                    + */ + public void seekToFirst(); + + /** + *

                    Position at the last entry in the source. The iterator is + * valid after this call if the source is not empty.

                    + */ + public void seekToLast(); + + /** + *

                    Position at the first entry in the source whose key is at or + * past target.

                    + *

                    + *

                    The iterator is valid after this call if the source contains + * a key that comes at or past target.

                    + * + * @param target byte array describing a key or a + * key prefix to seek for. + */ + public void seek(byte[] target); + + /** + *

                    Moves to the next entry in the source. After this call, Valid() is + * true if the iterator was not positioned at the last entry in the source.

                    + *

                    + *

                    REQUIRES: {@link #isValid()}
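A minimal usage sketch of the contract above, assuming an open RocksDB instance named db; the loop guards every next() call with isValid(), which is the usual way this REQUIRES clause is satisfied (variable names and error handling here are illustrative, not taken from the patch):

RocksIterator it = db.newIterator();
try {
  for (it.seekToFirst(); it.isValid(); it.next()) {
    byte[] key = it.key();     // key()/value() are only legal while isValid() is true
    byte[] value = it.value();
  }
  it.status();                 // throws RocksDBException if the iteration failed silently
} finally {
  it.dispose();                // release the native iterator
}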

                    + */ + public void next(); + + /** + *

                    Moves to the previous entry in the source. After this call, Valid() is + * true if the iterator was not positioned at the first entry in source.

                    + *

                    + *

                    REQUIRES: {@link #isValid()}

                    + */ + public void prev(); + + /** + * + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void status() throws RocksDBException; +} From be905491bfd5968bcd2cb4013e25648bc21cb30d Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Fri, 19 Dec 2014 15:29:16 +0000 Subject: [PATCH 691/829] Test for WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator) --- java/Makefile | 1 + .../rocksdb/test/WriteBatchWithIndexTest.java | 113 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 java/org/rocksdb/test/WriteBatchWithIndexTest.java diff --git a/java/Makefile b/java/Makefile index 26fa38d05..33a1008d4 100644 --- a/java/Makefile +++ b/java/Makefile @@ -81,6 +81,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.WriteBatchHandlerTest\ org.rocksdb.test.WriteBatchTest\ org.rocksdb.test.WriteOptionsTest\ + org.rocksdb.test.WriteBatchWithIndexTest JAVA_TEST_LIBDIR = ./test-libs/ JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar diff --git a/java/org/rocksdb/test/WriteBatchWithIndexTest.java b/java/org/rocksdb/test/WriteBatchWithIndexTest.java new file mode 100644 index 000000000..3638d0637 --- /dev/null +++ b/java/org/rocksdb/test/WriteBatchWithIndexTest.java @@ -0,0 +1,113 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import static org.assertj.core.api.Assertions.assertThat; + + +public class WriteBatchWithIndexTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void readYourOwnWrites() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + db.put(k1, v1); + db.put(k2, v2); + + final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + + RocksIterator base = null; + RocksIterator it = null; + try { + base = db.newIterator(); + it = wbwi.newIteratorWithBase(base); + + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1); + + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2); + + //put data to the write batch and make sure we can read it. 
+ final byte[] k3 = "key3".getBytes(); + final byte[] v3 = "value3".getBytes(); + wbwi.put(k3, v3); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3); + + //update k2 in the write batch and check the value + final byte[] v2Other = "otherValue2".getBytes(); + wbwi.put(k2, v2Other); + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2Other); + + //remove k1 and make sure we can read back the write + wbwi.remove(k1); + it.seek(k1); + assertThat(it.key()).isNotEqualTo(k1); + + //reinsert k1 and make sure we see the new value + final byte[] v1Other = "otherValue1".getBytes(); + wbwi.put(k1, v1Other); + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1Other); + } finally { + if (it != null) { + it.dispose(); + } + if (base != null) { + base.dispose(); + } + } + + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} From c6e55456128a26481c6d7a374fb2228c6e3ee165 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 18:36:59 +0000 Subject: [PATCH 692/829] Abstractions for common write batch behaviour --- java/org/rocksdb/AbstractWriteBatch.java | 92 ++++++++++++++++++++++++ java/org/rocksdb/WriteBatch.java | 82 ++++----------------- java/rocksjni/write_batch.cc | 8 +-- 3 files changed, 110 insertions(+), 72 deletions(-) create mode 100644 java/org/rocksdb/AbstractWriteBatch.java diff --git a/java/org/rocksdb/AbstractWriteBatch.java b/java/org/rocksdb/AbstractWriteBatch.java new file mode 100644 index 000000000..b380c5d8a --- /dev/null +++ b/java/org/rocksdb/AbstractWriteBatch.java @@ -0,0 +1,92 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public abstract class AbstractWriteBatch extends RocksObject implements WriteBatchInterface { + + @Override + public int count() { + assert (isInitialized()); + return count0(); + } + + @Override + public void put(byte[] key, byte[] value) { + assert (isInitialized()); + put(key, key.length, value, value.length); + } + + @Override + public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { + assert (isInitialized()); + put(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void merge(byte[] key, byte[] value) { + assert (isInitialized()); + merge(key, key.length, value, value.length); + } + + @Override + public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { + assert (isInitialized()); + merge(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void remove(byte[] key) { + assert (isInitialized()); + remove(key, key.length); + } + + @Override + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { + assert (isInitialized()); + remove(key, key.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void putLogData(byte[] blob) { + assert (isInitialized()); + putLogData(blob, blob.length); + } + + @Override + public void clear() { + assert (isInitialized()); + clear0(); + } + + /** + * Delete the c++ side pointer. 
+ */ + @Override + protected void disposeInternal() { + assert (isInitialized()); + disposeInternal(nativeHandle_); + } + + abstract void disposeInternal(long handle); + + abstract int count0(); + + abstract void put(byte[] key, int keyLen, byte[] value, int valueLen); + + abstract void put(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + + abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen); + + abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + + abstract void remove(byte[] key, int keyLen); + + abstract void remove(byte[] key, int keyLen, long cfHandle); + + abstract void putLogData(byte[] blob, int blobLen); + + abstract void clear0(); +} diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 64f472c89..24133ec39 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -22,7 +22,7 @@ package org.rocksdb; * non-const method, all threads accessing the same WriteBatch must use * external synchronization. */ -public class WriteBatch extends RocksObject implements WriteBatchInterface { +public class WriteBatch extends AbstractWriteBatch { /** * Constructs a WriteBatch instance. */ @@ -41,48 +41,6 @@ public class WriteBatch extends RocksObject implements WriteBatchInterface { newWriteBatch(reserved_bytes); } - @Override - public native int count(); - - @Override - public void put(byte[] key, byte[] value) { - put(key, key.length, value, value.length); - } - - @Override - public void put(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) { - put(key, key.length, value, value.length, - columnFamilyHandle.nativeHandle_); - } - - @Override - public void merge(byte[] key, byte[] value) { - merge(key, key.length, value, value.length); - } - - @Override - public void merge(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value) { - merge(key, key.length, value, value.length, - columnFamilyHandle.nativeHandle_); - } - - @Override - public void remove(byte[] key) { - remove(key, key.length); - } - - @Override - public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { - remove(key, key.length, columnFamilyHandle.nativeHandle_); - } - - @Override - public void putLogData(byte[] blob) { - putLogData(blob, blob.length); - } - /** * Support for iterating over the contents of a batch. * @@ -95,34 +53,22 @@ public class WriteBatch extends RocksObject implements WriteBatchInterface { iterate(handler.nativeHandle_); } - @Override - public native void clear(); - - /** - * Delete the c++ side pointer. 
- */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } + @Override final native void disposeInternal(long handle); + @Override final native int count0(); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void remove(byte[] key, int keyLen); + @Override final native void remove(byte[] key, int keyLen, long cfHandle); + @Override final native void putLogData(byte[] blob, int blobLen); + @Override final native void clear0(); private native void newWriteBatch(int reserved_bytes); - private native void put(byte[] key, int keyLen, - byte[] value, int valueLen); - private native void put(byte[] key, int keyLen, - byte[] value, int valueLen, - long cfHandle); - private native void merge(byte[] key, int keyLen, - byte[] value, int valueLen); - private native void merge(byte[] key, int keyLen, - byte[] value, int valueLen, - long cfHandle); - private native void remove(byte[] key, int keyLen); - private native void remove(byte[] key, int keyLen, - long cfHandle); - private native void putLogData(byte[] blob, int blobLen); private native void iterate(long handlerHandle) throws RocksDBException; - private native void disposeInternal(long handle); + /** * Handler callback for iterating over the contents of a batch. diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index dbf2e25e2..02f3989a9 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -41,10 +41,10 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( /* * Class: org_rocksdb_WriteBatch - * Method: count + * Method: count0 * Signature: ()I */ -jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) { +jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -53,10 +53,10 @@ jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) { /* * Class: org_rocksdb_WriteBatch - * Method: clear + * Method: clear0 * Signature: ()V */ -void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { +void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); From ef5b34dee0fe7276b893a536a68d15e089f79657 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 14:52:07 +0000 Subject: [PATCH 693/829] Implement WriteBatchWithIndex in the Java API --- java/Makefile | 2 + java/org/rocksdb/WBWIRocksIterator.java | 125 +++++++++ java/org/rocksdb/WriteBatchWithIndex.java | 153 +++++++++++ java/rocksjni/portal.h | 32 +++ java/rocksjni/write_batch_with_index.cc | 299 ++++++++++++++++++++++ 5 files changed, 611 insertions(+) create mode 100644 java/org/rocksdb/WBWIRocksIterator.java create mode 100644 java/org/rocksdb/WriteBatchWithIndex.java create mode 100644 java/rocksjni/write_batch_with_index.cc diff --git a/java/Makefile b/java/Makefile index 33a1008d4..c8f443f7b 100644 --- a/java/Makefile +++ b/java/Makefile @@ -36,6 +36,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.test.WriteBatchInternal\ org.rocksdb.test.WriteBatchTest\ 
org.rocksdb.WriteOptions\ + org.rocksdb.WriteBatchWithIndex\ + org.rocksdb.WBWIRocksIterator ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) diff --git a/java/org/rocksdb/WBWIRocksIterator.java b/java/org/rocksdb/WBWIRocksIterator.java new file mode 100644 index 000000000..aafe3aca6 --- /dev/null +++ b/java/org/rocksdb/WBWIRocksIterator.java @@ -0,0 +1,125 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public class WBWIRocksIterator extends RocksObject implements RocksIteratorInterface { + + //TODO(AR) abstract common code from WBWIRocksIterator and RocksIterator into AbstractRocksIterator + + final WriteBatchWithIndex wbwi_; + + protected WBWIRocksIterator(WriteBatchWithIndex wbwi, long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + // rocksDB must point to a valid RocksDB instance. + assert (wbwi != null); + // WBWIRocksIterator must hold a reference to the related WriteBatchWithIndex instance + // to guarantee that while a GC cycle starts WBWIRocksIterator instances + // are freed prior to WriteBatchWithIndex instances. + wbwi_ = wbwi; + } + + @Override + public boolean isValid() { + return false; + } + + @Override + public void seekToFirst() { + + } + + @Override + public void seekToLast() { + + } + + @Override + public void seek(byte[] target) { + + } + + @Override + public void next() { + + } + + @Override + public void prev() { + + } + + /** + * Get the current entry + */ + public WriteEntry entry() { + throw new UnsupportedOperationException("NOT YET IMPLEMENTED"); //TODO(AR) implement + } + + @Override + public void status() throws RocksDBException { + + } + + /** + *

                    Deletes underlying C++ iterator pointer.

                    + *

                    + *

                    Note: the underlying handle can only be safely deleted if the WriteBatchWithIndex + * instance related to a certain WBWIRocksIterator is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the WriteBatchWithIndex is initialized + * before freeing the native handle.
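A minimal sketch of the dispose ordering implied by this note, with assumed variable names (illustrative only, not part of the patch): the child iterator is disposed before the WriteBatchWithIndex that created it, so the check described above always finds the parent still initialized.

WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
WBWIRocksIterator it = wbwi.newIterator();
try {
  it.seekToFirst();            // iterate over the pending batch entries
} finally {
  it.dispose();                // free the child iterator first...
  wbwi.dispose();              // ...then its parent WriteBatchWithIndex
}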

                    + */ + @Override + protected void disposeInternal() { + synchronized (wbwi_) { + assert (isInitialized()); + if (wbwi_.isInitialized()) { + disposeInternal(nativeHandle_); + } + } + } + + private native void disposeInternal(long handle); + + /** + * Enumeration of the Write operation + * that created the record in the Write Batch + */ + public enum WriteType { + PutRecord, + MergeRecord, + DeleteRecord, + LogDataRecord + } + + /** + * Represents the entry returned by a + * WBWIRocksIterator + */ + public static class WriteEntry { + final WriteType type; + final Slice key; + final Slice value; + + public WriteEntry(final WriteType type, final Slice key, final Slice value) { + this.type = type; + this.key = key; + this.value = value; + } + + public WriteType getType() { + return type; + } + + public Slice getKey() { + return key; + } + + public Slice getValue() { + return value; + } + } +} diff --git a/java/org/rocksdb/WriteBatchWithIndex.java b/java/org/rocksdb/WriteBatchWithIndex.java new file mode 100644 index 000000000..bb42dc3d7 --- /dev/null +++ b/java/org/rocksdb/WriteBatchWithIndex.java @@ -0,0 +1,153 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Similar to {@link org.rocksdb.WriteBatch} but with a binary searchable + * index built for all the keys inserted. + * + * Calling put, merge, remove or putLogData calls the same function + * as with {@link org.rocksdb.WriteBatch} whilst also building an index. + * + * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator() }to create an iterator + * over the write batch or + * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} to + * get an iterator for the database with Read-Your-Own-Writes like capability + */ +public class WriteBatchWithIndex extends AbstractWriteBatch { + + //TODO(AR) need to cover directly passing WriteBatchWithIndex to {@see org.rocksdb.RocksDB#write(WriteBatch) + //this simplifies the Java API beyond the C++ API as you don't need to call + //GetWriteBatch on the WriteBatchWithIndex + + /** + * Creates a WriteBatchWithIndex where no bytes + * are reserved up-front, bytewise comparison is + * used for fallback key comparisons, + * and duplicate keys operations are retained + */ + public WriteBatchWithIndex() { + super(); + newWriteBatchWithIndex(); + } + + + /** + * Creates a WriteBatchWithIndex where no bytes + * are reserved up-front, bytewise comparison is + * used for fallback key comparisons, and duplicate key + * assignment is determined by the constructor argument + * + * @param overwriteKey if true, overwrite the key in the index when + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. + */ + public WriteBatchWithIndex(boolean overwriteKey) { + super(); + newWriteBatchWithIndex(overwriteKey); + } + + /** + * Creates a WriteBatchWithIndex + * + * @param fallbackIndexComparator We fallback to this comparator + * to compare keys within a column family if we cannot determine + * the column family and so look up it's comparator. 
+ * @param reservedBytes reserved bytes in underlying WriteBatch + * @param overwriteKey if true, overwrite the key in the index when + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. + */ + public WriteBatchWithIndex(AbstractComparator fallbackIndexComparator, int reservedBytes, + boolean overwriteKey) { + super(); + newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, reservedBytes, overwriteKey); + } + + /** + * Create an iterator of a column family. User can call + * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to + * search to the next entry of or after a key. Keys will be iterated in the + * order given by index_comparator. For multiple updates on the same key, + * each update will be returned as a separate entry, in the order of update + * time. + * + * @param columnFamilyHandle The column family to iterate over + * @return An iterator for the Write Batch contents, restricted to the column family + */ + public WBWIRocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) { + return new WBWIRocksIterator(this, iterator1(columnFamilyHandle.nativeHandle_)); + } + + /** + * Create an iterator of the default column family. User can call + * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to + * search to the next entry of or after a key. Keys will be iterated in the + * order given by index_comparator. For multiple updates on the same key, + * each update will be returned as a separate entry, in the order of update + * time. + * + * @return An iterator for the Write Batch contents + */ + public WBWIRocksIterator newIterator() { + return new WBWIRocksIterator(this, iterator0()); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base + * + * @param columnFamilyHandle The column family to iterate over + * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database point-in-time + * from baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(ColumnFamilyHandle columnFamilyHandle, + RocksIterator baseIterator) { + RocksIterator iterator = new RocksIterator( + baseIterator.rocksDB_, + iteratorWithBase(columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + + //when the iterator is deleted it will also delete the baseIterator + baseIterator.disOwnNativeHandle(); + return iterator; + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base. Operates on the default column family. + * + * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database point-in-time + * from baseIterator and modifications made in this write batch. 
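A short read-your-own-writes sketch of the behaviour described above; db is assumed to be an open RocksDB instance, as in the tests earlier in this series, and the true passed to the constructor is the overwriteKey flag described in this file, so the merged view shows at most one entry per key:

WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
wbwi.put("k1".getBytes(), "fromBatch".getBytes());   // pending update, not yet written to db

RocksIterator it = wbwi.newIteratorWithBase(db.newIterator());
try {
  it.seek("k1".getBytes());
  // it.value() already reflects the batch update, before db.write(...) is called
} finally {
  it.dispose();                // also deletes the wrapped base iterator
  wbwi.dispose();
}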
+ */ + public RocksIterator newIteratorWithBase(RocksIterator baseIterator) { + return newIteratorWithBase(baseIterator.rocksDB_.getDefaultColumnFamily(), baseIterator); + } + + @Override final native void disposeInternal(long handle); + @Override final native int count0(); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void remove(byte[] key, int keyLen); + @Override final native void remove(byte[] key, int keyLen, long cfHandle); + @Override final native void putLogData(byte[] blob, int blobLen); + @Override final native void clear0(); + + private native void newWriteBatchWithIndex(); + private native void newWriteBatchWithIndex(boolean overwriteKey); + private native void newWriteBatchWithIndex(long fallbackIndexComparatorHandle, int reservedBytes, + boolean overwriteKey); + private native long iterator0(); + private native long iterator1(long cfHandle); + private native long iteratorWithBase(long baseIteratorHandle, long cfHandle); +} diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 539e824e5..746dde539 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -19,6 +19,7 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/status.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/writebatchhandlerjnicallback.h" @@ -390,6 +391,37 @@ class WriteBatchHandlerJni { } }; +class WriteBatchWithIndexJni { + public: + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/WriteBatchWithIndex"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getHandleFieldID(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "nativeHandle_", "J"); + assert(fid != nullptr); + return fid; + } + + // Get the pointer to rocksdb::WriteBatchWithIndex of the specified + // org.rocksdb.WriteBatchWithIndex. + static rocksdb::WriteBatchWithIndex* getHandle(JNIEnv* env, jobject jwbwi) { + return reinterpret_cast( + env->GetLongField(jwbwi, getHandleFieldID(env))); + } + + // Pass the rocksdb::WriteBatchWithIndex pointer to the java side. + static void setHandle(JNIEnv* env, jobject jwbwi, + rocksdb::WriteBatchWithIndex* wbwi) { + env->SetLongField( + jwbwi, getHandleFieldID(env), + reinterpret_cast(wbwi)); + } +}; + class HistogramDataJni { public: static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) { diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc new file mode 100644 index 000000000..3d04b4ddd --- /dev/null +++ b/java/rocksjni/write_batch_with_index.cc @@ -0,0 +1,299 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::WriteBatchWithIndex methods from Java side. 
+ +#include "include/org_rocksdb_WriteBatchWithIndex.h" +#include "rocksjni/portal.h" +#include "rocksdb/comparator.h" +#include "rocksdb/utilities/write_batch_with_index.h" + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex(); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: (Z)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( + JNIEnv* env, jobject jobj, jboolean joverwrite_key) { + rocksdb::WriteBatchWithIndex* wbwi = + new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0, + static_cast(joverwrite_key)); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: (JIZ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( + JNIEnv* env, jobject jobj, jlong jfallback_index_comparator_handle, + jint jreserved_bytes, jboolean joverwrite_key) { + rocksdb::WriteBatchWithIndex* wbwi = + new rocksdb::WriteBatchWithIndex( + reinterpret_cast(jfallback_index_comparator_handle), + static_cast(jreserved_bytes), static_cast(joverwrite_key)); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: count + * Signature: ()I + */ +jint Java_org_rocksdb_WriteBatchWithIndex_count0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + return static_cast(wbwi->GetWriteBatch()->Count()); +} + +//TODO(AR) make generic with WriteBatch equivalent +/* + * Helper for WriteBatchWithIndex put operations + */ +void write_batch_with_index_put_helper( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); + if (cf_handle != nullptr) { + wbwi->Put(cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + wbwi->Put(key_slice, value_slice); + } + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: put + * Signature: ([BI[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + write_batch_with_index_put_helper(env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: put + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto* cf_handle = 
reinterpret_cast(jcf_handle); + write_batch_with_index_put_helper(env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len, cf_handle); +} + +//TODO(AR) make generic with WriteBatch equivalent +/* + * Helper for WriteBatchWithIndex merge operations + */ +void write_batch_with_index_merge_helper( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); + if (cf_handle != nullptr) { + wbwi->Merge(cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + wbwi->Merge(key_slice, value_slice); + } + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: merge + * Signature: ([BI[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + write_batch_with_index_merge_helper(env, jobj, jkey, jkey_len, + jentry_value, jentry_value_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: merge + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto* cf_handle = reinterpret_cast(jcf_handle); + write_batch_with_index_merge_helper(env, jobj, jkey, jkey_len, + jentry_value, jentry_value_len, cf_handle); +} + +//TODO(AR) make generic with WriteBatch equivalent +/* + * Helper for write batch remove operations + */ +void write_batch_with_index_remove_helper( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + if (cf_handle != nullptr) { + wbwi->Delete(cf_handle, key_slice); + } else { + wbwi->Delete(key_slice); + } + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: remove + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { + write_batch_with_index_remove_helper(env, jobj, jkey, jkey_len, nullptr); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: remove + * Signature: ([BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* cf_handle = reinterpret_cast(jcf_handle); + write_batch_with_index_remove_helper(env, jobj, jkey, jkey_len, cf_handle); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: putLogData + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_putLogData( + JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { + rocksdb::WriteBatchWithIndex* wbwi = 
+ rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + jbyte* blob = env->GetByteArrayElements(jblob, nullptr); + rocksdb::Slice blob_slice(reinterpret_cast(blob), jblob_len); + wbwi->PutLogData(blob_slice); + env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: clear + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatchWithIndex_clear0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + wbwi->GetWriteBatch()->Clear(); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iterator0 + * Signature: ()J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(); + return reinterpret_cast(wbwi_iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iterator1 + * Signature: (J)J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( + JNIEnv* env, jobject jobj, jlong jcf_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + auto* cf_handle = reinterpret_cast(jcf_handle); + rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(cf_handle); + return reinterpret_cast(wbwi_iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iteratorWithBase + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( + JNIEnv* env, jobject jobj, jlong jcf_handle, jlong jbi_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + auto* cf_handle = reinterpret_cast(jcf_handle); + auto* base_iterator = reinterpret_cast(jbi_handle); + auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator); + return reinterpret_cast(iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto* wbwi = reinterpret_cast(handle); + delete wbwi; +} From 95d5f984873ae887426ae7c3eae1d26d70cc0470 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sun, 4 Jan 2015 00:42:05 +0000 Subject: [PATCH 694/829] Test for RocksDB#write(WriteBatchWithIndex) --- .../rocksdb/test/WriteBatchWithIndexTest.java | 58 ++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/test/WriteBatchWithIndexTest.java b/java/org/rocksdb/test/WriteBatchWithIndexTest.java index 3638d0637..a7db59dbc 100644 --- a/java/org/rocksdb/test/WriteBatchWithIndexTest.java +++ b/java/org/rocksdb/test/WriteBatchWithIndexTest.java @@ -13,7 +13,18 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; +import org.rocksdb.WriteBatchWithIndex; +import org.rocksdb.DirectSlice; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.RocksIterator; +import org.rocksdb.WriteOptions; +import org.rocksdb.WBWIRocksIterator; + +import java.nio.ByteBuffer; +import java.util.ArrayDeque; +import java.util.Deque; import static org.assertj.core.api.Assertions.assertThat; @@ -110,4 +121,49 @@ public class WriteBatchWithIndexTest { } } } + + @Test + public void 
write_writeBatchWithIndex() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + WriteBatchWithIndex wbwi = null; + + try { + wbwi = new WriteBatchWithIndex(); + + + wbwi.put(k1, v1); + wbwi.put(k2, v2); + + db.write(new WriteOptions(), wbwi); + } finally { + if(wbwi != null) { + wbwi.dispose(); + } + } + + assertThat(db.get(k1)).isEqualTo(v1); + assertThat(db.get(k2)).isEqualTo(v2); + + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } } + From 56f24941ab7368b278935b75f25465163bb98f52 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 20:19:03 +0000 Subject: [PATCH 695/829] Simplify the Java API by permitting WriteBatchWithIndex to be provided straight to RocksDB#write --- java/org/rocksdb/RocksDB.java | 22 ++++++++++++--- java/org/rocksdb/WriteBatchWithIndex.java | 5 ---- java/rocksjni/rocksjni.cc | 33 ++++++++++++++++++----- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 6c6e72260..089882532 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -539,7 +539,21 @@ public class RocksDB extends RocksObject { */ public void write(WriteOptions writeOpts, WriteBatch updates) throws RocksDBException { - write(writeOpts.nativeHandle_, updates.nativeHandle_); + write0(writeOpts.nativeHandle_, updates.nativeHandle_); + } + + /** + * Apply the specified updates to the database. + * + * @param writeOpts WriteOptions instance + * @param updates WriteBatchWithIndex instance + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public void write(WriteOptions writeOpts, WriteBatchWithIndex updates) + throws RocksDBException { + write1(writeOpts.nativeHandle_, updates.nativeHandle_); } /** @@ -1547,8 +1561,10 @@ public class RocksDB extends RocksObject { long handle, long writeOptHandle, byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle) throws RocksDBException; - protected native void write( - long writeOptHandle, long batchHandle) throws RocksDBException; + protected native void write0( + long writeOptHandle, long wbHandle) throws RocksDBException; + protected native void write1( + long writeOptHandle, long wbwiHandle) throws RocksDBException; protected native boolean keyMayExist(byte[] key, int keyLen, StringBuffer stringBuffer); protected native boolean keyMayExist(byte[] key, int keyLen, diff --git a/java/org/rocksdb/WriteBatchWithIndex.java b/java/org/rocksdb/WriteBatchWithIndex.java index bb42dc3d7..f71ba338c 100644 --- a/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/org/rocksdb/WriteBatchWithIndex.java @@ -18,11 +18,6 @@ package org.rocksdb; * get an iterator for the database with Read-Your-Own-Writes like capability */ public class WriteBatchWithIndex extends AbstractWriteBatch { - - //TODO(AR) need to cover directly passing WriteBatchWithIndex to {@see org.rocksdb.RocksDB#write(WriteBatch) - //this simplifies the Java API beyond the C++ API as you don't need to call - //GetWriteBatch on the WriteBatchWithIndex - /** * Creates a WriteBatchWithIndex where no bytes * are reserved up-front, bytewise comparison is diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index fdd1b009f..54eef7f53 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -390,18 +390,39 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ( // rocksdb::DB::Write /* * Class: org_rocksdb_RocksDB - * Method: write + * Method: write0 * Signature: (JJ)V */ -void Java_org_rocksdb_RocksDB_write( +void Java_org_rocksdb_RocksDB_write0( JNIEnv* env, jobject jdb, - jlong jwrite_options_handle, jlong jbatch_handle) { + jlong jwrite_options_handle, jlong jwb_handle) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); - auto write_options = reinterpret_cast( + auto* write_options = reinterpret_cast( + jwrite_options_handle); + auto* wb = reinterpret_cast(jwb_handle); + + rocksdb::Status s = db->Write(*write_options, wb); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: write1 + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_write1( + JNIEnv* env, jobject jdb, + jlong jwrite_options_handle, jlong jwbwi_handle) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* write_options = reinterpret_cast( jwrite_options_handle); - auto batch = reinterpret_cast(jbatch_handle); + auto* wbwi = reinterpret_cast(jwbwi_handle); + auto* wb = wbwi->GetWriteBatch(); - rocksdb::Status s = db->Write(*write_options, batch); + rocksdb::Status s = db->Write(*write_options, wb); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); From e01acb3a04db18ba6886c70d8fc797df567a54e1 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 15:04:43 +0000 Subject: [PATCH 696/829] Test for WriteBatchWithIndex#newIterator() --- java/org/rocksdb/DirectSlice.java | 3 + .../rocksdb/test/WriteBatchWithIndexTest.java | 80 ++++++++++++++++++- 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java index c69b61460..3012a1fc9 
100644 --- a/java/org/rocksdb/DirectSlice.java +++ b/java/org/rocksdb/DirectSlice.java @@ -16,6 +16,9 @@ import java.nio.ByteBuffer; * values consider using @see org.rocksdb.Slice */ public class DirectSlice extends AbstractSlice { + //TODO(AR) only needed by WriteBatchWithIndexTest until JDK8 + public final static DirectSlice NONE = new DirectSlice(); + /** * Called from JNI to construct a new Java DirectSlice * without an underlying C++ object set diff --git a/java/org/rocksdb/test/WriteBatchWithIndexTest.java b/java/org/rocksdb/test/WriteBatchWithIndexTest.java index a7db59dbc..de2b637ff 100644 --- a/java/org/rocksdb/test/WriteBatchWithIndexTest.java +++ b/java/org/rocksdb/test/WriteBatchWithIndexTest.java @@ -165,5 +165,83 @@ public class WriteBatchWithIndexTest { } } } -} + @Test + public void iterator() throws RocksDBException { + final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + + final String k1 = "key1"; + final String v1 = "value1"; + final String k2 = "key2"; + final String v2 = "value2"; + final String k3 = "key3"; + final String v3 = "value3"; + final byte[] k1b = k1.getBytes(); + final byte[] v1b = v1.getBytes(); + final byte[] k2b = k2.getBytes(); + final byte[] v2b = v2.getBytes(); + final byte[] k3b = k3.getBytes(); + final byte[] v3b = v3.getBytes(); + + //add put records + wbwi.put(k1b, v1b); + wbwi.put(k2b, v2b); + wbwi.put(k3b, v3b); + + //add a deletion record + final String k4 = "key4"; + final byte[] k4b = k4.getBytes(); + wbwi.remove(k4b); + + WBWIRocksIterator.WriteEntry[] expected = { + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k1), new DirectSlice(v1)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k2), new DirectSlice(v2)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k3), new DirectSlice(v3)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, + new DirectSlice(k4), DirectSlice.NONE) + }; + + WBWIRocksIterator it = null; + try { + it = wbwi.newIterator(); + + //direct access - seek to key offsets + final int[] testOffsets = {2, 0, 1, 3}; + + for(int i = 0; i < testOffsets.length; i++) { + final int testOffset = testOffsets[i]; + final byte[] key = toArray(expected[testOffset].getKey().data()); + + it.seek(key); + assertThat(it.isValid()).isTrue(); + assertThat(it.entry()).isEqualTo(expected[testOffset]); + } + + //forward iterative access + int i = 0; + for(it.seekToFirst(); it.isValid(); it.next()) { + assertThat(it.entry()).isEqualTo(expected[i++]); + } + + //reverse iterative access + i = expected.length - 1; + for(it.seekToLast(); it.isValid(); it.prev()) { + assertThat(it.entry()).isEqualTo(expected[i--]); + } + + } finally { + if(it != null) { + it.dispose(); + } + } + } + + private byte[] toArray(final ByteBuffer buf) { + final byte[] ary = new byte[buf.remaining()]; + buf.get(ary); + return ary; + } +} From de678b288e36d0403d76d278fac9f6c97d08c44c Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 15:08:10 +0000 Subject: [PATCH 697/829] Abstractions for common iterator behaviour --- java/org/rocksdb/AbstractRocksIterator.java | 105 ++++++++++++++++++++ java/org/rocksdb/RocksIterator.java | 89 ++--------------- java/rocksjni/iterator.cc | 88 ++++++++-------- 3 files changed, 160 insertions(+), 122 deletions(-) create mode 100644 java/org/rocksdb/AbstractRocksIterator.java diff --git a/java/org/rocksdb/AbstractRocksIterator.java b/java/org/rocksdb/AbstractRocksIterator.java new file 
mode 100644 index 000000000..cc7cf064f --- /dev/null +++ b/java/org/rocksdb/AbstractRocksIterator.java @@ -0,0 +1,105 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class implementation for Rocks Iterators + * in the Java API + *
+ *
+ * Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.
+ *
+ * @param P The type of the Parent Object from which the Rocks Iterator was + * created. This is used by disposeInternal to avoid double-free + * issues with the underlying C++ object. + * @see org.rocksdb.RocksObject + */ +public abstract class AbstractRocksIterator<P extends RocksObject>
                    + extends RocksObject implements RocksIteratorInterface { + final P parent_; + + protected AbstractRocksIterator(P parent, long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + // parent must point to a valid RocksDB instance. + assert (parent != null); + // RocksIterator must hold a reference to the related parent instance + // to guarantee that while a GC cycle starts RocksIterator instances + // are freed prior to parent instances. + parent_ = parent; + } + + @Override + public boolean isValid() { + assert (isInitialized()); + return isValid0(nativeHandle_); + } + + @Override + public void seekToFirst() { + assert (isInitialized()); + seekToFirst0(nativeHandle_); + } + + @Override + public void seekToLast() { + assert (isInitialized()); + seekToLast0(nativeHandle_); + } + + @Override + public void seek(byte[] target) { + assert (isInitialized()); + seek0(nativeHandle_, target, target.length); + } + + @Override + public void next() { + assert (isInitialized()); + next0(nativeHandle_); + } + + @Override + public void prev() { + assert (isInitialized()); + prev0(nativeHandle_); + } + + @Override + public void status() throws RocksDBException { + assert (isInitialized()); + status0(nativeHandle_); + } + + /** + *
Deletes underlying C++ iterator pointer.
+ *
+ *
+ * Note: the underlying handle can only be safely deleted if the parent + * instance related to a certain RocksIterator is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the parent is initialized + * before freeing the native handle.
                    + */ + @Override + protected void disposeInternal() { + synchronized (parent_) { + assert (isInitialized()); + if (parent_.isInitialized()) { + disposeInternal(nativeHandle_); + } + } + } + + abstract void disposeInternal(long handle); + abstract boolean isValid0(long handle); + abstract void seekToFirst0(long handle); + abstract void seekToLast0(long handle); + abstract void next0(long handle); + abstract void prev0(long handle); + abstract void seek0(long handle, byte[] target, int targetLen); + abstract void status0(long handle) throws RocksDBException; +} diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java index cecf9c309..bb9a6e697 100644 --- a/java/org/rocksdb/RocksIterator.java +++ b/java/org/rocksdb/RocksIterator.java @@ -18,58 +18,9 @@ package org.rocksdb; * * @see org.rocksdb.RocksObject */ -public class RocksIterator extends RocksObject implements RocksIteratorInterface { - public RocksIterator(RocksDB rocksDB, long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; - // rocksDB must point to a valid RocksDB instance. - assert(rocksDB != null); - // RocksIterator must hold a reference to the related RocksDB instance - // to guarantee that while a GC cycle starts RocksDBIterator instances - // are freed prior to RocksDB instances. - rocksDB_ = rocksDB; - } - - @Override - public boolean isValid() { - assert(isInitialized()); - return isValid0(nativeHandle_); - } - - @Override - public void seekToFirst() { - assert(isInitialized()); - seekToFirst0(nativeHandle_); - } - - @Override - public void seekToLast() { - assert(isInitialized()); - seekToLast0(nativeHandle_); - } - - @Override - public void seek(byte[] target) { - assert(isInitialized()); - seek0(nativeHandle_, target, target.length); - } - - @Override - public void next() { - assert(isInitialized()); - next0(nativeHandle_); - } - - @Override - public void prev() { - assert(isInitialized()); - prev0(nativeHandle_); - } - - @Override - public void status() throws RocksDBException { - assert(isInitialized()); - status0(nativeHandle_); +public class RocksIterator extends AbstractRocksIterator { + protected RocksIterator(RocksDB rocksDB, long nativeHandle) { + super(rocksDB, nativeHandle); } /** @@ -99,33 +50,15 @@ public class RocksIterator extends RocksObject implements RocksIteratorInterface return value0(nativeHandle_); } - /** - *
Deletes underlying C++ iterator pointer.
- * - *
- * Note: the underlying handle can only be safely deleted if the RocksDB - * instance related to a certain RocksIterator is still valid and initialized. - * Therefore {@code disposeInternal()} checks if the RocksDB is initialized - * before freeing the native handle.
                    - */ - @Override protected void disposeInternal() { - synchronized (rocksDB_) { - assert (isInitialized()); - if (rocksDB_.isInitialized()) { - disposeInternal(nativeHandle_); - } - } - } + @Override final native void disposeInternal(long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; - private native boolean isValid0(long handle); - private native void disposeInternal(long handle); - private native void seekToFirst0(long handle); - private native void seekToLast0(long handle); - private native void next0(long handle); - private native void prev0(long handle); private native byte[] key0(long handle); private native byte[] value0(long handle); - private native void seek0(long handle, byte[] target, int targetLen); - private native void status0(long handle); - - final RocksDB rocksDB_; } diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index c7667a018..e9eb0bb37 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -14,6 +14,17 @@ #include "rocksjni/portal.h" #include "rocksdb/iterator.h" +/* + * Class: org_rocksdb_RocksIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_RocksIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + delete it; +} + /* * Class: org_rocksdb_RocksIterator * Method: isValid0 @@ -36,7 +47,7 @@ void Java_org_rocksdb_RocksIterator_seekToFirst0( /* * Class: org_rocksdb_RocksIterator - * Method: seekToFirst0 + * Method: seekToLast0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_seekToLast0( @@ -46,7 +57,7 @@ void Java_org_rocksdb_RocksIterator_seekToLast0( /* * Class: org_rocksdb_RocksIterator - * Method: seekToLast0 + * Method: next0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_next0( @@ -56,7 +67,7 @@ void Java_org_rocksdb_RocksIterator_next0( /* * Class: org_rocksdb_RocksIterator - * Method: next0 + * Method: prev0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_prev0( @@ -66,41 +77,8 @@ void Java_org_rocksdb_RocksIterator_prev0( /* * Class: org_rocksdb_RocksIterator - * Method: prev0 - * Signature: (J)V - */ -jbyteArray Java_org_rocksdb_RocksIterator_key0( - JNIEnv* env, jobject jobj, jlong handle) { - auto it = reinterpret_cast(handle); - rocksdb::Slice key_slice = it->key(); - - jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); - env->SetByteArrayRegion(jkey, 0, static_cast(key_slice.size()), - reinterpret_cast(key_slice.data())); - return jkey; -} - -/* - * Class: org_rocksdb_RocksIterator - * Method: key0 - * Signature: (J)[B - */ -jbyteArray Java_org_rocksdb_RocksIterator_value0( - JNIEnv* env, jobject jobj, jlong handle) { - auto it = reinterpret_cast(handle); - rocksdb::Slice value_slice = it->value(); - - jbyteArray jkeyValue = - env->NewByteArray(static_cast(value_slice.size())); - env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), - reinterpret_cast(value_slice.data())); - return jkeyValue; -} - -/* - * Class: org_rocksdb_RocksIterator - * Method: value0 - * Signature: (J)[B + * Method: seek0 + * Signature: (J[BI)V */ void Java_org_rocksdb_RocksIterator_seek0( 
JNIEnv* env, jobject jobj, jlong handle, @@ -117,8 +95,8 @@ void Java_org_rocksdb_RocksIterator_seek0( /* * Class: org_rocksdb_RocksIterator - * Method: seek0 - * Signature: (J[BI)V + * Method: status0 + * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_status0( JNIEnv* env, jobject jobj, jlong handle) { @@ -134,11 +112,33 @@ void Java_org_rocksdb_RocksIterator_status0( /* * Class: org_rocksdb_RocksIterator - * Method: disposeInternal - * Signature: (J)V + * Method: key0 + * Signature: (J)[B */ -void Java_org_rocksdb_RocksIterator_disposeInternal( +jbyteArray Java_org_rocksdb_RocksIterator_key0( JNIEnv* env, jobject jobj, jlong handle) { auto it = reinterpret_cast(handle); - delete it; + rocksdb::Slice key_slice = it->key(); + + jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); + env->SetByteArrayRegion(jkey, 0, static_cast(key_slice.size()), + reinterpret_cast(key_slice.data())); + return jkey; +} + +/* + * Class: org_rocksdb_RocksIterator + * Method: value0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_RocksIterator_value0( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + rocksdb::Slice value_slice = it->value(); + + jbyteArray jkeyValue = + env->NewByteArray(static_cast(value_slice.size())); + env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), + reinterpret_cast(value_slice.data())); + return jkeyValue; } From 2d0dd8db3b6b721eeb6ba42f23bf6ff722c1d31e Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 15:23:24 +0000 Subject: [PATCH 698/829] Implement WBWIRocksIterator for WriteBatchWithIndex in the Java API --- java/org/rocksdb/DirectSlice.java | 4 +- java/org/rocksdb/WBWIRocksIterator.java | 172 ++++++++++++---------- java/org/rocksdb/WriteBatchWithIndex.java | 25 ++-- java/rocksjni/portal.h | 144 ++++++++++++++++++ java/rocksjni/write_batch_with_index.cc | 121 ++++++++++++++- 5 files changed, 371 insertions(+), 95 deletions(-) diff --git a/java/org/rocksdb/DirectSlice.java b/java/org/rocksdb/DirectSlice.java index 3012a1fc9..765b01586 100644 --- a/java/org/rocksdb/DirectSlice.java +++ b/java/org/rocksdb/DirectSlice.java @@ -27,12 +27,12 @@ public class DirectSlice extends AbstractSlice { * Note: You should be aware that * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally * called from the default DirectSlice constructor, and that it is marked as - * private. This is so that developers cannot construct their own default + * package-private. This is so that developers cannot construct their own default * DirectSlice objects (at present). As developers cannot construct their own * DirectSlice objects through this, they are not creating underlying C++ * DirectSlice objects, and so there is nothing to free (dispose) from Java. 
*/ - private DirectSlice() { + DirectSlice() { super(); disOwnNativeHandle(); } diff --git a/java/org/rocksdb/WBWIRocksIterator.java b/java/org/rocksdb/WBWIRocksIterator.java index aafe3aca6..3171cc4ee 100644 --- a/java/org/rocksdb/WBWIRocksIterator.java +++ b/java/org/rocksdb/WBWIRocksIterator.java @@ -5,121 +5,133 @@ package org.rocksdb; -public class WBWIRocksIterator extends RocksObject implements RocksIteratorInterface { - - //TODO(AR) abstract common code from WBWIRocksIterator and RocksIterator into AbstractRocksIterator - - final WriteBatchWithIndex wbwi_; +public class WBWIRocksIterator extends AbstractRocksIterator { + private final WriteEntry entry = new WriteEntry(); protected WBWIRocksIterator(WriteBatchWithIndex wbwi, long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; - // rocksDB must point to a valid RocksDB instance. - assert (wbwi != null); - // WBWIRocksIterator must hold a reference to the related WriteBatchWithIndex instance - // to guarantee that while a GC cycle starts WBWIRocksIterator instances - // are freed prior to WriteBatchWithIndex instances. - wbwi_ = wbwi; - } - - @Override - public boolean isValid() { - return false; - } - - @Override - public void seekToFirst() { - - } - - @Override - public void seekToLast() { - - } - - @Override - public void seek(byte[] target) { - - } - - @Override - public void next() { - - } - - @Override - public void prev() { - + super(wbwi, nativeHandle); } /** * Get the current entry + * + * The WriteEntry is only valid + * until the iterator is repositioned. + * If you want to keep the WriteEntry across iterator + * movements, you must make a copy of its data! + * + * @return The WriteEntry of the current entry */ public WriteEntry entry() { - throw new UnsupportedOperationException("NOT YET IMPLEMENTED"); //TODO(AR) implement - } - - @Override - public void status() throws RocksDBException { - + assert(isInitialized()); + assert(entry != null); + entry1(nativeHandle_, entry); + return entry; } - /** - *
Deletes underlying C++ iterator pointer.
- *
- *
- * Note: the underlying handle can only be safely deleted if the WriteBatchWithIndex - * instance related to a certain WBWIRocksIterator is still valid and initialized. - * Therefore {@code disposeInternal()} checks if the WriteBatchWithIndex is initialized - * before freeing the native handle.
                    - */ - @Override - protected void disposeInternal() { - synchronized (wbwi_) { - assert (isInitialized()); - if (wbwi_.isInitialized()) { - disposeInternal(nativeHandle_); - } - } - } + @Override final native void disposeInternal(long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; - private native void disposeInternal(long handle); + private native void entry1(long handle, WriteEntry entry); /** * Enumeration of the Write operation * that created the record in the Write Batch */ public enum WriteType { - PutRecord, - MergeRecord, - DeleteRecord, - LogDataRecord + PUT, + MERGE, + DELETE, + LOG } /** - * Represents the entry returned by a - * WBWIRocksIterator + * Represents an entry returned by + * {@link org.rocksdb.WBWIRocksIterator#entry()} + * + * It is worth noting that a WriteEntry with + * the type {@link org.rocksdb.WBWIRocksIterator.WriteType#DELETE} + * or {@link org.rocksdb.WBWIRocksIterator.WriteType#LOG} + * will not have a value. */ public static class WriteEntry { - final WriteType type; - final Slice key; - final Slice value; + WriteType type = null; + final DirectSlice key; + final DirectSlice value; + + /** + * Intentionally private as this + * should only be instantiated in + * this manner by the outer WBWIRocksIterator + * class; The class members are then modified + * by calling {@link org.rocksdb.WBWIRocksIterator#entry()} + */ + private WriteEntry() { + key = new DirectSlice(); + value = new DirectSlice(); + } - public WriteEntry(final WriteType type, final Slice key, final Slice value) { + public WriteEntry(WriteType type, DirectSlice key, DirectSlice value) { this.type = type; this.key = key; this.value = value; } + /** + * Returns the type of the Write Entry + * + * @return the WriteType of the WriteEntry + */ public WriteType getType() { return type; } - public Slice getKey() { + /** + * Returns the key of the Write Entry + * + * @return The slice containing the key + * of the WriteEntry + */ + public DirectSlice getKey() { return key; } - public Slice getValue() { - return value; + /** + * Returns the value of the Write Entry + * + * @return The slice containing the value of + * the WriteEntry or null if the WriteEntry has + * no value + */ + public DirectSlice getValue() { + if(!value.isInitialized()) { + return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty() + } else { + return value; + } + } + + @Override + public boolean equals(Object other) { + if(other == null) { + return false; + } else if (this == other) { + return true; + } else if(other instanceof WriteEntry) { + final WriteEntry otherWriteEntry = (WriteEntry)other; + return type.equals(otherWriteEntry.type) + && key.equals(otherWriteEntry.key) + && (value.isInitialized() ? 
value.equals(otherWriteEntry.value) + : !otherWriteEntry.value.isInitialized()); + } else { + return false; + } } } } diff --git a/java/org/rocksdb/WriteBatchWithIndex.java b/java/org/rocksdb/WriteBatchWithIndex.java index f71ba338c..5204146c4 100644 --- a/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/org/rocksdb/WriteBatchWithIndex.java @@ -37,8 +37,8 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { * assignment is determined by the constructor argument * * @param overwriteKey if true, overwrite the key in the index when - * inserting a duplicate key, in this way an iterator will never - * show two entries with the same key. + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. */ public WriteBatchWithIndex(boolean overwriteKey) { super(); @@ -49,12 +49,14 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { * Creates a WriteBatchWithIndex * * @param fallbackIndexComparator We fallback to this comparator - * to compare keys within a column family if we cannot determine - * the column family and so look up it's comparator. - * @param reservedBytes reserved bytes in underlying WriteBatch - * @param overwriteKey if true, overwrite the key in the index when - * inserting a duplicate key, in this way an iterator will never - * show two entries with the same key. + * to compare keys within a column family if we cannot determine + * the column family and so look up it's comparator. + * + * @param reservedBytes reserved bytes in underlying WriteBatch + * + * @param overwriteKey if true, overwrite the key in the index when + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. */ public WriteBatchWithIndex(AbstractComparator fallbackIndexComparator, int reservedBytes, boolean overwriteKey) { @@ -97,16 +99,15 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { * as a delta and baseIterator as a base * * @param columnFamilyHandle The column family to iterate over - * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} * @return An iterator which shows a view comprised of both the database point-in-time * from baseIterator and modifications made in this write batch. */ public RocksIterator newIteratorWithBase(ColumnFamilyHandle columnFamilyHandle, RocksIterator baseIterator) { RocksIterator iterator = new RocksIterator( - baseIterator.rocksDB_, + baseIterator.parent_, iteratorWithBase(columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); - //when the iterator is deleted it will also delete the baseIterator baseIterator.disOwnNativeHandle(); return iterator; @@ -122,7 +123,7 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { * from baseIterator and modifications made in this write batch. */ public RocksIterator newIteratorWithBase(RocksIterator baseIterator) { - return newIteratorWithBase(baseIterator.rocksDB_.getDefaultColumnFamily(), baseIterator); + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); } @Override final native void disposeInternal(long handle); diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 746dde539..74dc7ad46 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -863,6 +863,150 @@ class BackupInfoListJni { } }; +class WBWIRocksIteratorJni { + public: + // Get the java class id of org.rocksdb.WBWIRocksIterator. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getWriteEntryField(JNIEnv* env) { + static jfieldID fid = + env->GetFieldID(getJClass(env), "entry", + "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;"); + assert(fid != nullptr); + return fid; + } + + static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) { + jobject jwe = + env->GetObjectField(jwbwi_rocks_iterator, getWriteEntryField(env)); + assert(jwe != nullptr); + return jwe; + } +}; + +class WriteTypeJni { + public: + // Get the PUT enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject PUT(JNIEnv* env) { + return getEnum(env, "PUT"); + } + + // Get the MERGE enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject MERGE(JNIEnv* env) { + return getEnum(env, "MERGE"); + } + + // Get the DELETE enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject DELETE(JNIEnv* env) { + return getEnum(env, "DELETE"); + } + + // Get the LOG enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject LOG(JNIEnv* env) { + return getEnum(env, "LOG"); + } + + private: + // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteType. + static jclass getJClass(JNIEnv* env) { + // TODO(AR) setting the jclazz var to static causes getEnum to fail + // occasionally (e.g. in WriteBatchWithIndex#iterator() test) with + // SIGSEGV but I have no idea why... + jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator$WriteType"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject getEnum(JNIEnv* env, const char name[]) { + // TODO(AR) setting the jclazz var to static causes getEnum to fail + // occasionally (e.g. in WriteBatchWithIndex#iterator() test) with + // SIGSEGV but I have no idea why... + jclass jclazz = getJClass(env); + jfieldID jfid = + env->GetStaticFieldID(jclazz, name, + "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + assert(jfid != nullptr); + return env->GetStaticObjectField(jclazz, jfid); + } +}; + +class WriteEntryJni { + public: + // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteEntry. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = + env->FindClass("org/rocksdb/WBWIRocksIterator$WriteEntry"); + assert(jclazz != nullptr); + return jclazz; + } + + static void setWriteType(JNIEnv* env, jobject jwrite_entry, + WriteType write_type) { + jobject jwrite_type; + switch (write_type) { + case kPutRecord: + jwrite_type = WriteTypeJni::PUT(env); + break; + + case kMergeRecord: + jwrite_type = WriteTypeJni::MERGE(env); + break; + + case kDeleteRecord: + jwrite_type = WriteTypeJni::DELETE(env); + break; + + case kLogDataRecord: + jwrite_type = WriteTypeJni::LOG(env); + break; + + default: + jwrite_type = nullptr; + } + assert(jwrite_type != nullptr); + env->SetObjectField(jwrite_entry, getWriteTypeField(env), jwrite_type); + } + + static void setKey(JNIEnv* env, jobject jwrite_entry, + const rocksdb::Slice* slice) { + jobject jkey = env->GetObjectField(jwrite_entry, getKeyField(env)); + AbstractSliceJni::setHandle(env, jkey, slice); + } + + static void setValue(JNIEnv* env, jobject jwrite_entry, + const rocksdb::Slice* slice) { + jobject jvalue = env->GetObjectField(jwrite_entry, getValueField(env)); + AbstractSliceJni::setHandle(env, jvalue, slice); + } + + private: + static jfieldID getWriteTypeField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "type", "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + assert(fid != nullptr); + return fid; + } + + static jfieldID getKeyField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "key", "Lorg/rocksdb/DirectSlice;"); + assert(fid != nullptr); + return fid; + } + + static jfieldID getValueField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "value", "Lorg/rocksdb/DirectSlice;"); + assert(fid != nullptr); + return fid; + } +}; + class JniUtil { public: /** diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index 3d04b4ddd..bf11d29c0 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -6,10 +6,11 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::WriteBatchWithIndex methods from Java side. 
+#include "include/org_rocksdb_WBWIRocksIterator.h" #include "include/org_rocksdb_WriteBatchWithIndex.h" -#include "rocksjni/portal.h" #include "rocksdb/comparator.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_WriteBatchWithIndex @@ -297,3 +298,121 @@ void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal( auto* wbwi = reinterpret_cast(handle); delete wbwi; } + +/* WBWIRocksIterator below */ + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto* it = reinterpret_cast(handle); + delete it; +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: isValid0 + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0( + JNIEnv* env, jobject jobj, jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seekToFirst0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->SeekToFirst(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seekToLast0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seekToLast0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->SeekToLast(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: next0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_next0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: prev0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_prev0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Prev(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seek0 + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seek0( + JNIEnv* env, jobject jobj, jlong handle, jbyteArray jtarget, + jint jtarget_len) { + auto* it = reinterpret_cast(handle); + jbyte* target = env->GetByteArrayElements(jtarget, 0); + rocksdb::Slice target_slice( + reinterpret_cast(target), jtarget_len); + + it->Seek(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: status0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_status0( + JNIEnv* env, jobject jobj, jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Status s = it->status(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: entry1 + * Signature: (JLorg/rocksdb/WBWIRocksIterator/WriteEntry;)V + */ +void Java_org_rocksdb_WBWIRocksIterator_entry1( + JNIEnv* env, jobject jobj, jlong handle, jobject jwrite_entry) { + auto* it = reinterpret_cast(handle); + const rocksdb::WriteEntry& we = it->Entry(); + jobject jwe = rocksdb::WBWIRocksIteratorJni::getWriteEntry(env, jobj); + rocksdb::WriteEntryJni::setWriteType(env, jwe, we.type); + rocksdb::WriteEntryJni::setKey(env, jwe, &we.key); + if (we.type == rocksdb::kDeleteRecord || we.type == rocksdb::kLogDataRecord) { + // set native handle of value slice to null if no value available + rocksdb::WriteEntryJni::setValue(env, jwe, NULL); + } else { + rocksdb::WriteEntryJni::setValue(env, jwe, &we.value); + } +} From 
3d246c89cc406783b40005e163968622ac2ea310 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 3 Jan 2015 17:52:17 +0000 Subject: [PATCH 699/829] Abstract duplicate code on key and value slice objects into generic methods --- java/rocksjni/portal.h | 45 +++++++- java/rocksjni/write_batch.cc | 136 ++++++++--------------- java/rocksjni/write_batch_with_index.cc | 142 +++++++++--------------- 3 files changed, 143 insertions(+), 180 deletions(-) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 74dc7ad46..771223dba 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -1009,7 +1009,7 @@ class WriteEntryJni { class JniUtil { public: - /** + /* * Copies a jstring to a std::string * and releases the original jstring */ @@ -1019,6 +1019,49 @@ class JniUtil { env->ReleaseStringUTFChars(js, utf); return name; } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be extended to cover returning rocksdb::Status + * from `op` and used for RocksDB->Put etc. + */ + static void kv_op( + std::function op, + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); + + op(key_slice, value_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); + } + + /* + * Helper for operations on a key + * for example WriteBatch->Delete + * + * TODO(AR) could be extended to cover returning rocksdb::Status + * from `op` and used for RocksDB->Delete etc. 
+ */ + static void k_op( + std::function op, + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + op(key_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } }; } // namespace rocksdb diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 02f3989a9..20eb55407 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -63,32 +63,6 @@ void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj) { wb->Clear(); } -/* - * Helper for WriteBatch put operations - */ -void write_batch_put_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(wb != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), - jentry_value_len); - if (cf_handle != nullptr) { - wb->Put(cf_handle, key_slice, value_slice); - } else { - // backwards compatibility - wb->Put(key_slice, value_slice); - } - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); -} - /* * Class: org_rocksdb_WriteBatch * Method: put @@ -98,8 +72,13 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - write_batch_put_helper(env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len, nullptr); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Put(key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -111,35 +90,15 @@ void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto cf_handle = reinterpret_cast(jcf_handle); - write_batch_put_helper(env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len, cf_handle); -} - -/* - * Helper for write batch merge operations - */ -void write_batch_merge_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto put = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Put(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); - if (cf_handle != nullptr) { - wb->Merge(cf_handle, key_slice, value_slice); - } else { - // backwards compatibility - wb->Merge(key_slice, value_slice); - } - 
env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); } /* @@ -151,8 +110,13 @@ void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - write_batch_merge_helper(env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len, nullptr); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Merge(key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -164,29 +128,15 @@ void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto cf_handle = reinterpret_cast(jcf_handle); - write_batch_merge_helper(env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len, cf_handle); -} - -/* - * Helper for write batch remove operations - */ -void write_batch_remove_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - if (cf_handle != nullptr) { - wb->Delete(cf_handle, key_slice); - } else { - wb->Delete(key_slice); - } - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto merge = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Merge(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -197,7 +147,12 @@ void write_batch_remove_helper( void Java_org_rocksdb_WriteBatch_remove___3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { - write_batch_remove_helper(env, jobj, jkey, jkey_len, nullptr); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto remove = [&wb] (rocksdb::Slice key) { + wb->Delete(key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); } /* @@ -208,8 +163,14 @@ void Java_org_rocksdb_WriteBatch_remove___3BI( void Java_org_rocksdb_WriteBatch_remove___3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { - auto cf_handle = reinterpret_cast(jcf_handle); - write_batch_remove_helper(env, jobj, jkey, jkey_len, cf_handle); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto remove = [&wb, &cf_handle] (rocksdb::Slice key) { + wb->Delete(cf_handle, key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); } /* @@ -219,13 +180,12 @@ void Java_org_rocksdb_WriteBatch_remove___3BIJ( */ void Java_org_rocksdb_WriteBatch_putLogData( JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); - - jbyte* blob = env->GetByteArrayElements(jblob, nullptr); - rocksdb::Slice blob_slice(reinterpret_cast(blob), jblob_len); - wb->PutLogData(blob_slice); - 
env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); + auto putLogData = [&wb] (rocksdb::Slice blob) { + wb->PutLogData(blob); + }; + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); } /* diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index bf11d29c0..92f2ec068 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -65,34 +65,6 @@ jint Java_org_rocksdb_WriteBatchWithIndex_count0( return static_cast(wbwi->GetWriteBatch()->Count()); } -//TODO(AR) make generic with WriteBatch equivalent -/* - * Helper for WriteBatchWithIndex put operations - */ -void write_batch_with_index_put_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatchWithIndex* wbwi = - rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); - assert(wbwi != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), - jentry_value_len); - if (cf_handle != nullptr) { - wbwi->Put(cf_handle, key_slice, value_slice); - } else { - // backwards compatibility - wbwi->Put(key_slice, value_slice); - } - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); -} - /* * Class: org_rocksdb_WriteBatchWithIndex * Method: put @@ -101,8 +73,14 @@ void write_batch_with_index_put_helper( void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - write_batch_with_index_put_helper(env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len, nullptr); + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Put(key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -113,37 +91,16 @@ void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* cf_handle = reinterpret_cast(jcf_handle); - write_batch_with_index_put_helper(env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len, cf_handle); -} - -//TODO(AR) make generic with WriteBatch equivalent -/* - * Helper for WriteBatchWithIndex merge operations - */ -void write_batch_with_index_merge_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatchWithIndex* wbwi = + auto* wbwi = rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); assert(wbwi != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto put = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Put(cf_handle, key, value); + }; + 
rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); - if (cf_handle != nullptr) { - wbwi->Merge(cf_handle, key_slice, value_slice); - } else { - // backwards compatibility - wbwi->Merge(key_slice, value_slice); - } - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); } /* @@ -154,8 +111,14 @@ void write_batch_with_index_merge_helper( void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { - write_batch_with_index_merge_helper(env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len, nullptr); + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Merge(key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -166,31 +129,16 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { - auto* cf_handle = reinterpret_cast(jcf_handle); - write_batch_with_index_merge_helper(env, jobj, jkey, jkey_len, - jentry_value, jentry_value_len, cf_handle); -} - -//TODO(AR) make generic with WriteBatch equivalent -/* - * Helper for write batch remove operations - */ -void write_batch_with_index_remove_helper( - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - rocksdb::ColumnFamilyHandle* cf_handle) { - rocksdb::WriteBatchWithIndex* wbwi = + auto* wbwi = rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); assert(wbwi != nullptr); - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - if (cf_handle != nullptr) { - wbwi->Delete(cf_handle, key_slice); - } else { - wbwi->Delete(key_slice); - } - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto merge = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Merge(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -200,7 +148,13 @@ void write_batch_with_index_remove_helper( */ void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { - write_batch_with_index_remove_helper(env, jobj, jkey, jkey_len, nullptr); + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto remove = [&wbwi] (rocksdb::Slice key) { + wbwi->Delete(key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); } /* @@ -211,8 +165,15 @@ void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); - write_batch_with_index_remove_helper(env, jobj, jkey, jkey_len, cf_handle); + assert(cf_handle != nullptr); + auto remove = [&wbwi, &cf_handle] (rocksdb::Slice key) { + wbwi->Delete(cf_handle, key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, 
jkey_len); } /* @@ -222,14 +183,13 @@ void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( */ void Java_org_rocksdb_WriteBatchWithIndex_putLogData( JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { - rocksdb::WriteBatchWithIndex* wbwi = + auto* wbwi = rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); assert(wbwi != nullptr); - - jbyte* blob = env->GetByteArrayElements(jblob, nullptr); - rocksdb::Slice blob_slice(reinterpret_cast(blob), jblob_len); - wbwi->PutLogData(blob_slice); - env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); + auto putLogData = [&wbwi] (rocksdb::Slice blob) { + wbwi->PutLogData(blob); + }; + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); } /* From 9ab5adfc59a621d12357580c94451d9f7320c2dd Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 14 Jan 2015 16:24:24 -0800 Subject: [PATCH 700/829] New BlockBasedTable version -- better compressed block format Summary: This diff adds BlockBasedTable format_version = 2. New format version brings better compressed block format for these compressions: 1) Zlib -- encode decompressed size in compressed block header 2) BZip2 -- encode decompressed size in compressed block header 3) LZ4 and LZ4HC -- instead of doing memcpy of size_t encode size as varint32. memcpy is very bad because the DB is not portable accross big/little endian machines or even platforms where size_t might be 8 or 4 bytes. It does not affect format for snappy. If you write a new database with format_version = 2, it will not be readable by RocksDB versions before 3.10. DB::Open() will return corruption in that case. Test Plan: Added a new test in db_test. I will also run db_bench and verify VSIZE when block_cache == 1GB Reviewers: yhchiang, rven, MarkCallaghan, dhruba, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31461 --- HISTORY.md | 2 + db/db_bench.cc | 43 ++-- db/db_test.cc | 50 ++++- include/rocksdb/version.h | 2 +- table/block_based_table_builder.cc | 30 ++- table/block_based_table_factory.cc | 8 +- table/block_based_table_reader.cc | 24 ++- table/block_based_table_reader.h | 4 +- table/format.cc | 24 ++- table/format.h | 20 +- table/table_test.cc | 37 ++-- tools/db_sanity_test.cc | 22 +- util/compression.h | 328 +++++++++++++++++++++-------- 13 files changed, 434 insertions(+), 160 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 245f4ec61..1040794fe 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -8,6 +8,8 @@ Lower numbered levels will be placed earlier in the db_paths and higher numbered levels will be placed later in the db_paths vector. * Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) +* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB. +* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. 
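The new block format is opt-in. A minimal sketch of how an application might enable it through BlockBasedTableOptions, assuming the same format_version field exercised by the db_bench change below; the database path, key/value and compression choice are illustrative only:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.format_version = 2;  // write the version-2 compressed block format

  rocksdb::Options options;
  options.create_if_missing = true;
  options.compression = rocksdb::kZlibCompression;  // zlib/bzip2/lz4/lz4hc benefit from the new encoding
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));

  // Note: SST files written this way cannot be read by RocksDB releases before 3.10.
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/format_version2_example", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return s.ok() ? 0 : 1;
}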
### Public API changes * Deprecated skip_log_error_on_recovery option diff --git a/db/db_bench.cc b/db/db_bench.cc index d34dbb34b..d289b8f4c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1219,23 +1219,23 @@ class Benchmark { name = "Snappy"; break; case kZlibCompression: - result = Zlib_Compress(Options().compression_opts, text, strlen(text), - &compressed); + result = Zlib_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "Zlib"; break; case kBZip2Compression: - result = BZip2_Compress(Options().compression_opts, text, + result = BZip2_Compress(Options().compression_opts, 2, text, strlen(text), &compressed); name = "BZip2"; break; case kLZ4Compression: - result = LZ4_Compress(Options().compression_opts, text, strlen(text), - &compressed); + result = LZ4_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "LZ4"; break; case kLZ4HCCompression: - result = LZ4HC_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = LZ4HC_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "LZ4HC"; break; case kNoCompression: @@ -1779,19 +1779,19 @@ class Benchmark { input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = Zlib_Compress(Options().compression_opts, input.data(), + ok = Zlib_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; case rocksdb::kBZip2Compression: - ok = BZip2_Compress(Options().compression_opts, input.data(), + ok = BZip2_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = LZ4_Compress(Options().compression_opts, input.data(), + ok = LZ4_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; case rocksdb::kLZ4HCCompression: - ok = LZ4HC_Compress(Options().compression_opts, input.data(), + ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; default: @@ -1825,19 +1825,19 @@ class Benchmark { input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = Zlib_Compress(Options().compression_opts, input.data(), input.size(), - &compressed); + ok = Zlib_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kBZip2Compression: - ok = BZip2_Compress(Options().compression_opts, input.data(), + ok = BZip2_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = LZ4_Compress(Options().compression_opts, input.data(), input.size(), - &compressed); + ok = LZ4_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4HCCompression: - ok = LZ4HC_Compress(Options().compression_opts, input.data(), + ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(), input.size(), &compressed); break; default: @@ -1857,22 +1857,22 @@ class Benchmark { break; case rocksdb::kZlibCompression: uncompressed = Zlib_Uncompress(compressed.data(), compressed.size(), - &decompress_size); + &decompress_size, 2); ok = uncompressed != nullptr; break; case rocksdb::kBZip2Compression: uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(), - &decompress_size); + &decompress_size, 2); ok = uncompressed != nullptr; break; case rocksdb::kLZ4Compression: uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), - &decompress_size); + &decompress_size, 2); ok = uncompressed != nullptr; 
break; case rocksdb::kLZ4HCCompression: uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), - &decompress_size); + &decompress_size, 2); ok = uncompressed != nullptr; break; default: @@ -2031,6 +2031,7 @@ class Benchmark { block_based_options.block_size = FLAGS_block_size; block_based_options.block_restart_interval = FLAGS_block_restart_interval; block_based_options.filter_policy = filter_policy_; + block_based_options.format_version = 2; options.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); } diff --git a/db/db_test.cc b/db/db_test.cc index 566fdd077..4d3ebb51f 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -65,25 +65,25 @@ static bool SnappyCompressionSupported(const CompressionOptions& options) { static bool ZlibCompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return Zlib_Compress(options, in.data(), in.size(), &out); + return Zlib_Compress(options, 2, in.data(), in.size(), &out); } static bool BZip2CompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return BZip2_Compress(options, in.data(), in.size(), &out); + return BZip2_Compress(options, 2, in.data(), in.size(), &out); } static bool LZ4CompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return LZ4_Compress(options, in.data(), in.size(), &out); + return LZ4_Compress(options, 2, in.data(), in.size(), &out); } static bool LZ4HCCompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return LZ4HC_Compress(options, in.data(), in.size(), &out); + return LZ4HC_Compress(options, 2, in.data(), in.size(), &out); } static std::string RandomString(Random* rnd, int len) { @@ -10170,6 +10170,48 @@ TEST(DBTest, DontDeleteMovedFile) { Reopen(options); } +TEST(DBTest, EncodeDecompressedBlockSizeTest) { + // iter 0 -- zlib + // iter 1 -- bzip2 + // iter 2 -- lz4 + // iter 3 -- lz4HC + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression}; + for (int iter = 0; iter < 4; ++iter) { + // first_table_version 1 -- generate with table_version == 1, read with + // table_version == 2 + // first_table_version 2 -- generate with table_version == 2, read with + // table_version == 1 + for (int first_table_version = 1; first_table_version <= 2; + ++first_table_version) { + BlockBasedTableOptions table_options; + table_options.format_version = first_table_version; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.compression = compressions[iter]; + DestroyAndReopen(options); + + int kNumKeysWritten = 100000; + + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + } + + table_options.format_version = first_table_version == 1 ? 
2 : 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + ASSERT_EQ(r.substr(128), std::string(128, 'a')); + } + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 339933853..2e76fe5be 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 9 +#define ROCKSDB_MINOR 10 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index f04906ff8..813f8a125 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -302,9 +302,11 @@ bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { return compressed_size < raw_size - (raw_size / 8u); } +// format_version is the block format as defined in include/rocksdb/table.h Slice CompressBlock(const Slice& raw, const CompressionOptions& compression_options, - CompressionType* type, std::string* compressed_output) { + CompressionType* type, uint32_t format_version, + std::string* compressed_output) { if (*type == kNoCompression) { return raw; } @@ -320,29 +322,37 @@ Slice CompressBlock(const Slice& raw, } break; // fall back to no compression. case kZlibCompression: - if (Zlib_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (Zlib_Compress( + compression_options, + GetCompressFormatForVersion(kZlibCompression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kBZip2Compression: - if (BZip2_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (BZip2_Compress( + compression_options, + GetCompressFormatForVersion(kBZip2Compression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kLZ4Compression: - if (LZ4_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4_Compress( + compression_options, + GetCompressFormatForVersion(kLZ4Compression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. 
case kLZ4HCCompression: - if (LZ4HC_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4HC_Compress( + compression_options, + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } @@ -579,7 +589,7 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = CompressBlock(raw_block_contents, r->compression_opts, &type, - &r->compressed_output); + r->table_options.format_version, &r->compressed_output); } else { RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 063bc2587..17ee0b8cb 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -14,11 +14,12 @@ #include #include +#include "port/port.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" -#include "port/port.h" +#include "table/format.h" namespace rocksdb { @@ -76,9 +77,10 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } - if (table_options_.format_version > 1) { + if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { return Status::InvalidArgument( - "We currently only support versions 0 and 1"); + "Unsupported BlockBasedTable format_version. Please check " + "include/rocksdb/table.h for more info"); } return Status::OK(); } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 727f9c43a..8747d83d7 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -442,9 +442,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, if (!s.ok()) { return s; } - if (footer.version() > 1) { + if (!BlockBasedTableSupportedVersion(footer.version())) { return Status::Corruption( - "Unknown Footer version. Maybe this file was created with too new " + "Unknown Footer version. 
Maybe this file was created with newer " "version of RocksDB?"); } @@ -605,7 +605,7 @@ Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block) { + BlockBasedTable::CachableEntry* block, uint32_t format_version) { Status s; Block* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; @@ -648,7 +648,8 @@ Status BlockBasedTable::GetDataBlockFromCache( // Retrieve the uncompressed contents into a new buffer BlockContents contents; s = UncompressBlockContents(compressed_block->data(), - compressed_block->size(), &contents); + compressed_block->size(), &contents, + format_version); // Insert uncompressed block into block cache if (s.ok()) { @@ -673,7 +674,7 @@ Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block) { + CachableEntry* block, Block* raw_block, uint32_t format_version) { assert(raw_block->compression_type() == kNoCompression || block_cache_compressed != nullptr); @@ -681,8 +682,8 @@ Status BlockBasedTable::PutDataBlockToCache( // Retrieve the uncompressed contents into a new buffer BlockContents contents; if (raw_block->compression_type() != kNoCompression) { - s = UncompressBlockContents(raw_block->data(), raw_block->size(), - &contents); + s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents, + format_version); } if (!s.ok()) { delete raw_block; @@ -929,7 +930,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - statistics, ro, &block); + statistics, ro, &block, + rep->table_options.format_version); if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; @@ -942,7 +944,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, if (s.ok()) { s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - ro, statistics, &block, raw_block); + ro, statistics, &block, raw_block, + rep->table_options.format_version); } } } @@ -1194,7 +1197,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, Slice ckey; s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, - options, &block); + options, &block, + rep_->table_options.format_version); assert(s.ok()); bool in_cache = block.value != nullptr; if (in_cache) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index ae849ad6c..2902aa441 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -150,7 +150,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block); + BlockBasedTable::CachableEntry* block, uint32_t format_version); // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then // populate the block caches. 
@@ -163,7 +163,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block); + CachableEntry* block, Block* raw_block, uint32_t format_version); // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. diff --git a/table/format.cc b/table/format.cc index 2ea4b9171..6b3d4eead 100644 --- a/table/format.cc +++ b/table/format.cc @@ -331,7 +331,7 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, compression_type = static_cast(slice.data()[n]); if (decompression_requested && compression_type != kNoCompression) { - return UncompressBlockContents(slice.data(), n, contents); + return UncompressBlockContents(slice.data(), n, contents, footer.version()); } if (slice.data() != used_buf) { @@ -354,8 +354,10 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // contents are uncompresed into this buffer. This // buffer is returned via 'result' and it is upto the caller to // free this buffer. +// format_version is the block format as defined in include/rocksdb/table.h Status UncompressBlockContents(const char* data, size_t n, - BlockContents* contents) { + BlockContents* contents, + uint32_t format_version) { std::unique_ptr ubuf; int decompress_size = 0; assert(data[n] != kNoCompression); @@ -375,8 +377,9 @@ Status UncompressBlockContents(const char* data, size_t n, break; } case kZlibCompression: - ubuf = - std::unique_ptr(Zlib_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(Zlib_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kZlibCompression, format_version))); if (!ubuf) { static char zlib_corrupt_msg[] = "Zlib not supported or corrupted Zlib compressed block contents"; @@ -386,8 +389,9 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = - std::unique_ptr(BZip2_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(BZip2_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kBZip2Compression, format_version))); if (!ubuf) { static char bzip2_corrupt_msg[] = "Bzip2 not supported or corrupted Bzip2 compressed block contents"; @@ -397,7 +401,9 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = std::unique_ptr(LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(LZ4_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4Compression, format_version))); if (!ubuf) { static char lz4_corrupt_msg[] = "LZ4 not supported or corrupted LZ4 compressed block contents"; @@ -407,7 +413,9 @@ Status UncompressBlockContents(const char* data, size_t n, BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = std::unique_ptr(LZ4_Uncompress(data, n, &decompress_size)); + ubuf = std::unique_ptr(LZ4_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4HCCompression, format_version))); if (!ubuf) { static char lz4hc_corrupt_msg[] = "LZ4HC not supported or corrupted LZ4HC compressed block contents"; diff --git a/table/format.h b/table/format.h index d8bc43735..900a07148 100644 
--- a/table/format.h +++ b/table/format.h @@ -65,6 +65,21 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, + uint32_t version) { + // snappy is not versioned + assert(compression_type != kSnappyCompression && + compression_type != kNoCompression); + // As of version 2, we encode compressed block with + // compress_format_version == 2. Before that, the version is 1. + // DO NOT CHANGE THIS FUNCTION, it affects disk format + return version >= 2 ? 2 : 1; +} + +inline bool BlockBasedTableSupportedVersion(uint32_t version) { + return version <= 2; +} + // Footer encapsulates the fixed information stored at the tail // end of every table file. class Footer { @@ -191,8 +206,11 @@ extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // contents are uncompresed into this buffer. This buffer is // returned via 'result' and it is upto the caller to // free this buffer. +// For description of compress_format_version and possible values, see +// util/compression.h extern Status UncompressBlockContents(const char* data, size_t n, - BlockContents* contents); + BlockContents* contents, + uint32_t compress_format_version); // Implementation details follow. Clients should ignore, diff --git a/table/table_test.cc b/table/table_test.cc index 4289059f9..31883c3c7 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -545,7 +545,8 @@ static bool ZlibCompressionSupported() { #ifdef ZLIB std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return Zlib_Compress(Options().compression_opts, in.data(), in.size(), &out); + return Zlib_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -555,7 +556,8 @@ static bool BZip2CompressionSupported() { #ifdef BZIP2 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return BZip2_Compress(Options().compression_opts, in.data(), in.size(), &out); + return BZip2_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -565,7 +567,8 @@ static bool LZ4CompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return LZ4_Compress(Options().compression_opts, in.data(), in.size(), &out); + return LZ4_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -575,7 +578,8 @@ static bool LZ4HCCompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), &out); + return LZ4HC_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -596,6 +600,7 @@ struct TestArgs { bool reverse_compare; int restart_interval; CompressionType compression; + uint32_t format_version; }; static std::vector GenerateArgList() { @@ -609,22 +614,26 @@ static std::vector GenerateArgList() { std::vector restart_intervals = {16, 1, 1024}; // Only add compression if it is supported - std::vector compression_types; - compression_types.push_back(kNoCompression); + std::vector> compression_types; + compression_types.emplace_back(kNoCompression, false); if (SnappyCompressionSupported()) { - compression_types.push_back(kSnappyCompression); + compression_types.emplace_back(kSnappyCompression, false); } if (ZlibCompressionSupported()) { - compression_types.push_back(kZlibCompression); + compression_types.emplace_back(kZlibCompression, false); 
+ compression_types.emplace_back(kZlibCompression, true); } if (BZip2CompressionSupported()) { - compression_types.push_back(kBZip2Compression); + compression_types.emplace_back(kBZip2Compression, false); + compression_types.emplace_back(kBZip2Compression, true); } if (LZ4CompressionSupported()) { - compression_types.push_back(kLZ4Compression); + compression_types.emplace_back(kLZ4Compression, false); + compression_types.emplace_back(kLZ4Compression, true); } if (LZ4HCCompressionSupported()) { - compression_types.push_back(kLZ4HCCompression); + compression_types.emplace_back(kLZ4HCCompression, false); + compression_types.emplace_back(kLZ4HCCompression, true); } for (auto test_type : test_types) { @@ -636,7 +645,7 @@ static std::vector GenerateArgList() { one_arg.type = test_type; one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_intervals[0]; - one_arg.compression = compression_types[0]; + one_arg.compression = compression_types[0].first; test_args.push_back(one_arg); continue; } @@ -647,7 +656,8 @@ static std::vector GenerateArgList() { one_arg.type = test_type; one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_interval; - one_arg.compression = compression_type; + one_arg.compression = compression_type.first; + one_arg.format_version = compression_type.second ? 2 : 1; test_args.push_back(one_arg); } } @@ -718,6 +728,7 @@ class Harness { new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; table_options_.block_restart_interval = args.restart_interval; + table_options_.format_version = args.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); constructor_ = new TableConstructor(options_.comparator); diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index df3cae11d..dee180c87 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -133,6 +133,22 @@ class SanityTestZlibCompression : public SanityTest { Options options_; }; +class SanityTestZlibCompressionVersion2 : public SanityTest { + public: + explicit SanityTestZlibCompressionVersion2(const std::string& path) + : SanityTest(path) { + options_.compression = kZlibCompression; + BlockBasedTableOptions table_options; + table_options.format_version = 2; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "ZlibCompressionVersion2"; } + + private: + Options options_; +}; + class SanityTestLZ4Compression : public SanityTest { public: explicit SanityTestLZ4Compression(const std::string& path) @@ -197,6 +213,7 @@ bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), + new SanityTestZlibCompressionVersion2(path), new SanityTestLZ4Compression(path), new SanityTestLZ4HCCompression(path), #ifndef ROCKSDB_LITE @@ -209,6 +226,7 @@ bool RunSanityTests(const std::string& command, const std::string& path) { } else { fprintf(stderr, "Verifying...\n"); } + bool result = true; for (auto sanity_test : sanity_tests) { Status s; fprintf(stderr, "%s -- ", sanity_test->Name().c_str()); @@ -221,12 +239,12 @@ bool RunSanityTests(const std::string& command, const std::string& path) { fprintf(stderr, "%s\n", s.ToString().c_str()); if (!s.ok()) { fprintf(stderr, "FAIL\n"); - return false; + result = false; } delete sanity_test; } - return true; + return 
result; } } // namespace diff --git a/util/compression.h b/util/compression.h index a0ca91f7f..664036353 100644 --- a/util/compression.h +++ b/util/compression.h @@ -9,7 +9,11 @@ // #pragma once +#include +#include + #include "rocksdb/options.h" +#include "util/coding.h" #ifdef SNAPPY #include @@ -30,6 +34,13 @@ namespace rocksdb { +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. + inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, size_t length, ::std::string* output) { #ifdef SNAPPY @@ -61,9 +72,50 @@ inline bool Snappy_Uncompress(const char* input, size_t length, #endif } -inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool Zlib_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, + const char* input, size_t length, + ::std::string* output) { #ifdef ZLIB + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + // The memLevel parameter specifies how much memory should be allocated for // the internal compression state. // memLevel=1 uses minimum memory but is slow and reduces compression ratio. @@ -78,19 +130,14 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, return false; } - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - // Compress the input, and put compressed data in output. _stream.next_in = (Bytef *)input; _stream.avail_in = static_cast(length); // Initialize the output size. _stream.avail_out = static_cast(length); - _stream.next_out = (Bytef*)&(*output)[0]; + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); - size_t old_sz = 0, new_sz = 0, new_sz_delta = 0; bool done = false; while (!done) { st = deflate(&_stream, Z_FINISH); @@ -99,16 +146,9 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, done = true; break; case Z_OK: - // No output space. Increase the output space by 20%. 
- // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz_delta = static_cast(output->size() * 0.2); - new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); - output->resize(new_sz); - // Set more output. - _stream.next_out = (Bytef *)&(*output)[old_sz]; - _stream.avail_out = static_cast(new_sz - old_sz); - break; + // No output space. This means the compression is bigger than + // decompressed size. Just fail the compression in that case. + // Intentional fallback (to failure case) case Z_BUF_ERROR: default: deflateEnd(&_stream); @@ -116,16 +156,37 @@ inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, } } - output->resize(output->size() - _stream.avail_out); + output->resize(output->size() - _stream.avail_out + output_header_len); deflateEnd(&_stream); return true; #endif return false; } +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format inline char* Zlib_Uncompress(const char* input_data, size_t input_length, - int* decompress_size, int windowBits = -14) { + int* decompress_size, + uint32_t compress_format_version, + int windowBits = -14) { #ifdef ZLIB + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + z_stream _stream; memset(&_stream, 0, sizeof(z_stream)); @@ -141,31 +202,27 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, _stream.next_in = (Bytef *)input_data; _stream.avail_in = static_cast(input_length); - // Assume the decompressed data size will 5x of compressed size. - size_t output_len = input_length * 5; char* output = new char[output_len]; - size_t old_sz = output_len; _stream.next_out = (Bytef *)output; _stream.avail_out = static_cast(output_len); - char* tmp = nullptr; - size_t output_len_delta; bool done = false; - - //while(_stream.next_in != nullptr && _stream.avail_in != 0) { while (!done) { st = inflate(&_stream, Z_SYNC_FLUSH); switch (st) { case Z_STREAM_END: done = true; break; - case Z_OK: + case Z_OK: { // No output space. Increase the output space by 20%. - old_sz = output_len; - output_len_delta = static_cast(output_len * 0.2); + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + size_t old_sz = output_len; + size_t output_len_delta = static_cast(output_len * 0.2); output_len += output_len_delta < 10 ? 
10 : output_len_delta; - tmp = new char[output_len]; + char* tmp = new char[output_len]; memcpy(tmp, output, old_sz); delete[] output; output = tmp; @@ -174,6 +231,7 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, _stream.next_out = (Bytef *)(output + old_sz); _stream.avail_out = static_cast(output_len - old_sz); break; + } case Z_BUF_ERROR: default: delete[] output; @@ -182,6 +240,8 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, } } + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); *decompress_size = static_cast(output_len - _stream.avail_out); inflateEnd(&_stream); return output; @@ -190,9 +250,29 @@ inline char* Zlib_Uncompress(const char* input_data, size_t input_length, return nullptr; } -inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, + const char* input, size_t length, + ::std::string* output) { #ifdef BZIP2 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + bz_stream _stream; memset(&_stream, 0, sizeof(bz_stream)); @@ -204,34 +284,23 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, return false; } - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - // Compress the input, and put compressed data in output. _stream.next_in = (char *)input; _stream.avail_in = static_cast(length); // Initialize the output size. - _stream.next_out = (char *)&(*output)[0]; _stream.avail_out = static_cast(length); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); - size_t old_sz = 0, new_sz = 0; while (_stream.next_in != nullptr && _stream.avail_in != 0) { st = BZ2_bzCompress(&_stream, BZ_FINISH); switch (st) { case BZ_STREAM_END: break; case BZ_FINISH_OK: - // No output space. Increase the output space by 20%. - // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz = static_cast(output->size() * 1.2); - output->resize(new_sz); - // Set more output. - _stream.next_out = (char *)&(*output)[old_sz]; - _stream.avail_out = static_cast(new_sz - old_sz); - break; + // No output space. This means the compression is bigger than + // decompressed size. 
Just fail the compression in that case + // Intentional fallback (to failure case) case BZ_SEQUENCE_ERROR: default: BZ2_bzCompressEnd(&_stream); @@ -239,16 +308,36 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, } } - output->resize(output->size() - _stream.avail_out); + output->resize(output->size() - _stream.avail_out + output_header_len); BZ2_bzCompressEnd(&_stream); return true; #endif return false; } +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format inline char* BZip2_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { + int* decompress_size, + uint32_t compress_format_version) { #ifdef BZIP2 + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the next page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + bz_stream _stream; memset(&_stream, 0, sizeof(bz_stream)); @@ -260,26 +349,26 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_in = (char *)input_data; _stream.avail_in = static_cast(input_length); - // Assume the decompressed data size will be 5x of compressed size. - size_t output_len = input_length * 5; char* output = new char[output_len]; - size_t old_sz = output_len; _stream.next_out = (char *)output; _stream.avail_out = static_cast(output_len); - char* tmp = nullptr; - - while(_stream.next_in != nullptr && _stream.avail_in != 0) { + bool done = false; + while (!done) { st = BZ2_bzDecompress(&_stream); switch (st) { case BZ_STREAM_END: + done = true; break; - case BZ_OK: + case BZ_OK: { // No output space. Increase the output space by 20%. 
- old_sz = output_len; - output_len = static_cast(output_len * 1.2); - tmp = new char[output_len]; + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + uint32_t old_sz = output_len; + output_len = output_len * 1.2; + char* tmp = new char[output_len]; memcpy(tmp, output, old_sz); delete[] output; output = tmp; @@ -288,6 +377,7 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, _stream.next_out = (char *)(output + old_sz); _stream.avail_out = static_cast(output_len - old_sz); break; + } default: delete[] output; BZ2_bzDecompressEnd(&_stream); @@ -295,6 +385,8 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, } } + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); *decompress_size = static_cast(output_len - _stream.avail_out); BZ2_bzDecompressEnd(&_stream); return output; @@ -302,66 +394,132 @@ inline char* BZip2_Uncompress(const char* input_data, size_t input_length, return nullptr; } -inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool LZ4_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, const char* input, size_t length, ::std::string* output) { #ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compressBound = LZ4_compressBound(static_cast(length)); - output->resize(static_cast(8 + compressBound)); - char* p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); - int outlen = LZ4_compress_limitedOutput( - input, p + 8, static_cast(length), compressBound); + output->resize(static_cast(output_header_len + compressBound)); + int outlen = + LZ4_compress_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compressBound); if (outlen == 0) { return false; } - output->resize(static_cast(8 + outlen)); + output->resize(static_cast(output_header_len + outlen)); return true; #endif return false; } +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format inline char* LZ4_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { + int* decompress_size, + uint32_t compress_format_version) { #ifdef LZ4 - if (input_length < 8) { - return nullptr; + uint32_t output_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // legacy encoding, which 
is not really portable (depends on big/little + // endianness) + if (input_length < 8) { + return nullptr; + } + memcpy(&output_len, input_data, sizeof(output_len)); + input_length -= 8; + input_data += 8; } - int output_len; - memcpy(&output_len, input_data, sizeof(output_len)); - char *output = new char[output_len]; - *decompress_size = LZ4_decompress_safe_partial( - input_data + 8, output, static_cast(input_length - 8), output_len, - output_len); + char* output = new char[output_len]; + *decompress_size = + LZ4_decompress_safe(input_data, output, static_cast(input_length), + static_cast(output_len)); if (*decompress_size < 0) { delete[] output; return nullptr; } + assert(*decompress_size == static_cast(output_len)); return output; #endif return nullptr; } -inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool LZ4HC_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, const char* input, size_t length, ::std::string* output) { #ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compressBound = LZ4_compressBound(static_cast(length)); - output->resize(static_cast(8 + compressBound)); - char* p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); + output->resize(static_cast(output_header_len + compressBound)); int outlen; #ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 - outlen = LZ4_compressHC2_limitedOutput(input, p + 8, static_cast(length), + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compressBound, opts.level); #else - outlen = LZ4_compressHC_limitedOutput(input, p + 8, static_cast(length), - compressBound); + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compressBound); #endif if (outlen == 0) { return false; } - output->resize(static_cast(8 + outlen)); + output->resize(static_cast(output_header_len + outlen)); return true; #endif return false; } -} // namespace rocksdb + +} // namespace rocksdb From 2bb059007b5bd8783148f7dcf4e1cc2a1068816f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 14 Jan 2015 16:25:36 -0800 Subject: [PATCH 701/829] Change db_stress to work with format_version == 2 --- tools/db_stress.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 8b5b934a2..e33eeed73 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1757,6 +1757,7 @@ class StressTest { block_based_options.block_cache = cache_; block_based_options.block_cache_compressed = compressed_cache_; block_based_options.block_size = FLAGS_block_size; + block_based_options.format_version = 2; block_based_options.filter_policy = filter_policy_; options_.table_factory.reset( 
NewBlockBasedTableFactory(block_based_options)); From d10f1de2b42aba5d2fab87e3ca1e098498f45eec Mon Sep 17 00:00:00 2001 From: Thomas Dudziak Date: Thu, 15 Jan 2015 10:28:10 -0800 Subject: [PATCH 702/829] Ported LevelDB's fault_injection_test Summary: This is a port of [[ https://github.com/google/leveldb/blob/master/db/fault_injection_test.cc | LevelDB's fault_injection_test ]] to RocksDB. Unfortunately it fails with: ``` ==== Test FaultInjectionTest.FaultTest db/fault_injection_test.cc:491: Corruption: no meta-nextfile entry in descriptor #0 ./fault_injection_test() [0x41477a] rocksdb::FaultInjectionTest::PartialCompactTestReopenWithFault(rocksdb::FaultInjectionTest::ResetMethod, int, int) /data/users/tomdzk/rocksdb/db/fault_injection_test.cc:491 #1 ./fault_injection_test() [0x40a38a] rocksdb::_Test_FaultTest::_Run() /data/users/tomdzk/rocksdb/db/fault_injection_test.cc:517 #2 ./fault_injection_test() [0x415bea] rocksdb::_Test_FaultTest::_RunIt() /data/users/tomdzk/rocksdb/db/fault_injection_test.cc:507 #3 ./fault_injection_test() [0x584367] rocksdb::test::RunAllTests() /data/users/tomdzk/rocksdb/util/testharness.cc:70 #4 /usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/libc.so.6(__libc_start_main+0x10e) [0x7f7a40857efe] ?? ??:0 #5 ./fault_injection_test() [0x408bb8] _start ??:0 ``` so I commented out the test invocation in the source code for now (lines 514-520) so it can be merged. Test Plan: This is a new test. Reviewers: igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31587 --- Makefile | 6 +- db/fault_injection_test.cc | 541 +++++++++++++++++++++++++++++++++++++ 2 files changed, 546 insertions(+), 1 deletion(-) create mode 100644 db/fault_injection_test.cc diff --git a/Makefile b/Makefile index 26dde6b45..4c7a562b5 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ else OPT += -DNDEBUG endif -ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) +ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) # found CFLAGS += -fno-exceptions CXXFLAGS += -fno-exceptions @@ -121,6 +121,7 @@ TESTS = \ crc32c_test \ dbformat_test \ env_test \ + fault_injection_test \ blob_store_test \ filelock_test \ filename_test \ @@ -462,6 +463,9 @@ dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc new file mode 100644 index 000000000..0ef0be318 --- /dev/null +++ b/db/fault_injection_test.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". + +#include +#include +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kValueSize = 1000; +static const int kMaxNumValues = 2000; +static const size_t kNumIterations = 3; + +class TestWritableFile; +class FaultInjectionTestEnv; + +namespace { + +// Assume a filename, and not a directory name like "/foo/bar/" +static std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +Status SyncDir(const std::string& dir) { + // As this is a test it isn't required to *actually* sync this directory. + return Status::OK(); +} + +// A basic file truncation function suitable for this test. +Status Truncate(const std::string& filename, uint64_t length) { + rocksdb::Env* env = rocksdb::Env::Default(); + + unique_ptr orig_file; + const EnvOptions options; + Status s = env->NewSequentialFile(filename, &orig_file, options); + if (!s.ok()) + return s; + + char* scratch = new char[length]; + rocksdb::Slice result; + s = orig_file->Read(length, &result, scratch); + if (s.ok()) { + std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; + unique_ptr tmp_file; + s = env->NewWritableFile(tmp_name, &tmp_file, options); + if (s.ok()) { + s = tmp_file->Append(result); + if (s.ok()) { + s = env->RenameFile(tmp_name, filename); + } else { + env->DeleteFile(tmp_name); + } + } + } + + delete[] scratch; + + return s; +} + +struct FileState { + std::string filename_; + ssize_t pos_; + ssize_t pos_at_last_sync_; + ssize_t pos_at_last_flush_; + + explicit FileState(const std::string& filename) + : filename_(filename), + pos_(-1), + pos_at_last_sync_(-1), + pos_at_last_flush_(-1) { } + + FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} + + bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } + + Status DropUnsyncedData() const; +}; + +} // anonymous namespace + +// A wrapper around WritableFile which informs another Env whenever this file +// is written to or sync'ed. 
+class TestWritableFile : public WritableFile { + public: + explicit TestWritableFile(const std::string& fname, + unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestWritableFile(); + virtual Status Append(const Slice& data); + virtual Status Close(); + virtual Status Flush(); + virtual Status Sync(); + + private: + FileState state_; + unique_ptr target_; + bool writable_file_opened_; + FaultInjectionTestEnv* env_; + + Status SyncParent(); +}; + + +class FaultInjectionTestEnv : public EnvWrapper { + public: + explicit FaultInjectionTestEnv(Env* base) + : EnvWrapper(base), + filesystem_active_(true) {} + virtual ~FaultInjectionTestEnv() { } + + Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + Status s = target()->NewWritableFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + // WritableFile doesn't append to files, so if the same file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + new_files_since_last_dir_sync_.insert(fname); + } + return s; + } + + virtual Status DeleteFile(const std::string& f) { + Status s = EnvWrapper::DeleteFile(f); + ASSERT_OK(s); + if (s.ok()) { + UntrackFile(f); + } + return s; + } + + virtual Status RenameFile(const std::string& s, const std::string& t) { + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + if (new_files_since_last_dir_sync_.erase(s) != 0) { + assert(new_files_since_last_dir_sync_.find(t) == + new_files_since_last_dir_sync_.end()); + new_files_since_last_dir_sync_.insert(t); + } + } + + return ret; + } + + void WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + db_file_state_[state.filename_] = state; + } + + Status DropUnsyncedFileData() { + Status s; + MutexLock l(&mutex_); + for (std::map::const_iterator it = + db_file_state_.begin(); + s.ok() && it != db_file_state_.end(); ++it) { + const FileState& state = it->second; + if (!state.IsFullySynced()) { + s = state.DropUnsyncedData(); + } + } + return s; + } + + Status DeleteFilesCreatedAfterLastDirSync() { + // Because DeleteFile access this container make a copy to avoid deadlock + mutex_.Lock(); + std::set new_files(new_files_since_last_dir_sync_.begin(), + new_files_since_last_dir_sync_.end()); + mutex_.Unlock(); + Status s; + std::set::const_iterator it; + for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) { + s = DeleteFile(*it); + } + return s; + } + + void DirWasSynced() { + MutexLock l(&mutex_); + new_files_since_last_dir_sync_.clear(); + } + + bool IsFileCreatedSinceLastDirSync(const std::string& filename) { + MutexLock l(&mutex_); + return new_files_since_last_dir_sync_.find(filename) != + new_files_since_last_dir_sync_.end(); + } + + void ResetState() { + MutexLock l(&mutex_); + db_file_state_.clear(); + new_files_since_last_dir_sync_.clear(); + SetFilesystemActive(true); + } + + void UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + db_file_state_.erase(f); + new_files_since_last_dir_sync_.erase(f); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. 
It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() const { return filesystem_active_; } + void SetFilesystemActive(bool active) { filesystem_active_ = active; } + + private: + port::Mutex mutex_; + std::map db_file_state_; + std::set new_files_since_last_dir_sync_; + bool filesystem_active_; // Record flushes, syncs, writes +}; + +Status FileState::DropUnsyncedData() const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + return Truncate(filename_, sync_pos); +} + +TestWritableFile::TestWritableFile(const std::string& fname, + unique_ptr&& f, + FaultInjectionTestEnv* env) + : state_(fname), + target_(std::move(f)), + writable_file_opened_(true), + env_(env) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestWritableFile::~TestWritableFile() { + if (writable_file_opened_) { + Close(); + } +} + +Status TestWritableFile::Append(const Slice& data) { + Status s = target_->Append(data); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_ += data.size(); + } + return s; +} + +Status TestWritableFile::Close() { + writable_file_opened_ = false; + Status s = target_->Close(); + if (s.ok()) { + env_->WritableFileClosed(state_); + } + return s; +} + +Status TestWritableFile::Flush() { + Status s = target_->Flush(); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return s; +} + +Status TestWritableFile::Sync() { + if (!env_->IsFilesystemActive()) { + return Status::OK(); + } + // Ensure new files referred to by the manifest are in the filesystem. + Status s = target_->Sync(); + if (s.ok()) { + state_.pos_at_last_sync_ = state_.pos_; + } + if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) { + Status ps = SyncParent(); + if (s.ok() && !ps.ok()) { + s = ps; + } + } + return s; +} + +Status TestWritableFile::SyncParent() { + Status s = SyncDir(GetDirName(state_.filename_)); + if (s.ok()) { + env_->DirWasSynced(); + } + return s; +} + +class FaultInjectionTest { + public: + enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR }; + enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES }; + + FaultInjectionTestEnv* env_; + std::string dbname_; + shared_ptr tiny_cache_; + Options options_; + DB* db_; + + FaultInjectionTest() : env_(NULL), db_(NULL) { NewDB(); } + + ~FaultInjectionTest() { ASSERT_OK(TearDown()); } + + Status NewDB() { + assert(db_ == NULL); + assert(tiny_cache_ == nullptr); + assert(env_ == NULL); + + env_ = new FaultInjectionTestEnv(Env::Default()); + + options_ = Options(); + options_.env = env_; + options_.paranoid_checks = true; + + BlockBasedTableOptions table_options; + tiny_cache_ = NewLRUCache(100); + table_options.block_cache = tiny_cache_; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + dbname_ = test::TmpDir() + "/fault_test"; + + options_.create_if_missing = true; + Status s = OpenDB(); + options_.create_if_missing = false; + return s; + } + + Status SetUp() { + Status s = TearDown(); + if (s.ok()) { + s = NewDB(); + } + return s; + } + + Status TearDown() { + CloseDB(); + + Status s = DestroyDB(dbname_, Options()); + + delete env_; + env_ = NULL; + + tiny_cache_.reset(); + + return s; + } + + void Build(int start_idx, int num_vals) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = start_idx; i < start_idx + num_vals; i++) { + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + WriteOptions options; + 
ASSERT_OK(db_->Write(options, &batch)); + } + } + + Status ReadValue(int i, std::string* val) const { + std::string key_space, value_space; + Slice key = Key(i, &key_space); + Value(i, &value_space); + ReadOptions options; + return db_->Get(options, key, val); + } + + Status Verify(int start_idx, int num_vals, + ExpectedVerifResult expected) const { + std::string val; + std::string value_space; + Status s; + for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { + Value(i, &value_space); + s = ReadValue(i, &val); + if (expected == VAL_EXPECT_NO_ERROR) { + if (s.ok()) { + ASSERT_EQ(value_space, val); + } + } else if (s.ok()) { + fprintf(stderr, "Expected an error at %d, but was OK\n", i); + s = Status::IOError(dbname_, "Expected value error:"); + } else { + s = Status::OK(); // An expected error + } + } + return s; + } + + // Return the ith key + Slice Key(int i, std::string* storage) const { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) const { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + + Status OpenDB() { + delete db_; + db_ = NULL; + env_->ResetState(); + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + db_ = NULL; + } + + void DeleteAllData() { + Iterator* iter = db_->NewIterator(ReadOptions()); + WriteOptions options; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); + } + + delete iter; + } + + void ResetDBState(ResetMethod reset_method) { + switch (reset_method) { + case RESET_DROP_UNSYNCED_DATA: + ASSERT_OK(env_->DropUnsyncedFileData()); + break; + case RESET_DELETE_UNSYNCED_FILES: + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + default: + assert(false); + } + } + + void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { + DeleteAllData(); + Build(0, num_pre_sync); + db_->CompactRange(NULL, NULL); + Build(num_pre_sync, num_post_sync); + } + + void PartialCompactTestReopenWithFault(ResetMethod reset_method, + int num_pre_sync, + int num_post_sync) { + env_->SetFilesystemActive(false); + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::VAL_EXPECT_ERROR)); + } + + void NoWriteTestPreFault() { + } + + void NoWriteTestReopenWithFault(ResetMethod reset_method) { + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + } +}; + +TEST(FaultInjectionTest, FaultTest) { + Random rnd(0); + ASSERT_OK(SetUp()); + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); + + // TODO(t6007549) Figure out why this fails and then re-enable the test. +#if 0 + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, + num_pre_sync, + num_post_sync); + + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA); +#endif + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. 
+ PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES, + num_pre_sync + num_post_sync, + 0); + + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} From f2ddb8b452bb27eceef6752b5827bd287465f540 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Thu, 15 Jan 2015 11:47:41 -0800 Subject: [PATCH 703/829] Fix for bug where GeoDB accesses key after next modification of iterator Summary: While running cross-functional tests for weak iterators, I encountered a bug in GeoDB. GeoDB reads a key from the database and tries to use it after doing a Seek. Fixing it by storing the key locally so that it is still visible after the Seek. Test Plan: Run geodb_test Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31599 --- utilities/geodb/geodb_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index 194e51232..2cb9209e1 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -84,7 +84,7 @@ Status GeoDBImpl::GetByPosition(const GeoPosition& pos, Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { Status status; - Slice quadkey; + std::string quadkey; // create an iterator so that we can get a consistent picture // of the database. @@ -97,7 +97,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { iter->Seek(key2); if (iter->Valid() && iter->status().ok()) { if (iter->key().compare(key2) == 0) { - quadkey = iter->value(); + quadkey = iter->value().ToString(); } } if (quadkey.size() == 0) { @@ -108,7 +108,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { // // Seek to the quadkey + id prefix // - std::string prefix = MakeKey1Prefix(quadkey.ToString(), id); + std::string prefix = MakeKey1Prefix(quadkey, id); iter->Seek(Slice(prefix)); assert(iter->Valid()); if (!iter->Valid() || !iter->status().ok()) { From b229f970dfd2c62450dcc089e4fecee0d8cb249f Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 15 Jan 2015 12:44:19 -0800 Subject: [PATCH 704/829] Remove Compaction::ReleaseInputs(). Summary: This patch remove the unnecessary Compaction::ReleaseInputs(). Compaction::ReleaseInputs() tries to unref its input_version and column_family. However, such unref is always done in ~Compaction(), and all current ReleaseInputs() calls are right before the destructor. 
Test Plan: ./db_test Reviewers: igor Reviewed By: igor Subscribers: igor, rven, dhruba, sdong Differential Revision: https://reviews.facebook.net/D31605 --- db/compaction.cc | 13 ------------- db/compaction.h | 4 ---- db/db_impl.cc | 3 +-- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/db/compaction.cc b/db/compaction.cc index 0d85ce486..56be34ef3 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -242,19 +242,6 @@ void Compaction::SetupBottomMostLevel(VersionStorageInfo* vstorage, } } -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } - if (cfd_ != nullptr) { - if (cfd_->Unref()) { - delete cfd_; - } - cfd_ = nullptr; - } -} - void Compaction::ReleaseCompactionFiles(Status status) { cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); } diff --git a/db/compaction.h b/db/compaction.h index 99f35abb9..37e38532b 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -135,10 +135,6 @@ class Compaction { // before processing "internal_key". bool ShouldStopBefore(const Slice& internal_key); - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. void ReleaseCompactionFiles(Status status); diff --git a/db/db_impl.cc b/db/db_impl.cc index f5d6d99f0..fa1c87aee 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1344,7 +1344,6 @@ Status DBImpl::CompactFilesImpl( *c->mutable_cf_options()); } c->ReleaseCompactionFiles(s); - c->ReleaseInputs(); c.reset(); if (status.ok()) { @@ -2203,9 +2202,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, *c->mutable_cf_options()); } c->ReleaseCompactionFiles(status); - c->ReleaseInputs(); *madeProgress = true; } + // this will unref its input_version and column_family_data c.reset(); if (status.ok()) { From c787fb50b8642d93f878c8187d3578815402eb24 Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 15 Jan 2015 21:22:51 +0100 Subject: [PATCH 705/829] [RocksJava] JavaDoc errors in Java8 Some of the latest commits included illegal JavaDoc formattings. --- java/org/rocksdb/AbstractRocksIterator.java | 6 +++--- java/org/rocksdb/RocksIteratorInterface.java | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/java/org/rocksdb/AbstractRocksIterator.java b/java/org/rocksdb/AbstractRocksIterator.java index cc7cf064f..08bd9dc23 100644 --- a/java/org/rocksdb/AbstractRocksIterator.java +++ b/java/org/rocksdb/AbstractRocksIterator.java @@ -8,13 +8,13 @@ package org.rocksdb; /** * Base class implementation for Rocks Iterators * in the Java API - *

<p/>
+ *
 * <p>Multiple threads can invoke const methods on an RocksIterator without
 * external synchronization, but if any of the threads may call a
 * non-const method, all threads accessing the same RocksIterator must use
 * external synchronization.</p>
 *
- * @param P The type of the Parent Object from which the Rocks Iterator was
+ * @param <P> The type of the Parent Object from which the Rocks Iterator was
 *          created. This is used by disposeInternal to avoid double-free
 *          issues with the underlying C++ object.
 * @see org.rocksdb.RocksObject
@@ -78,7 +78,7 @@ public abstract class AbstractRocksIterator<P extends RocksObject>
   /**
    * <p>Deletes underlying C++ iterator pointer.</p>
-   * <p/>
+   *
    * <p>Note: the underlying handle can only be safely deleted if the parent
    * instance related to a certain RocksIterator is still valid and initialized.
    * Therefore {@code disposeInternal()} checks if the parent is initialized
diff --git a/java/org/rocksdb/RocksIteratorInterface.java b/java/org/rocksdb/RocksIteratorInterface.java
index 15f3a9aa9..b5cc90afb 100644
--- a/java/org/rocksdb/RocksIteratorInterface.java
+++ b/java/org/rocksdb/RocksIteratorInterface.java
@@ -10,7 +10,7 @@ package org.rocksdb;
 * access to data one entry at a time. Multiple implementations
 * are provided by this library. In particular, iterators are provided
 * to access the contents of a DB and Write Batch.</p>
- * <p/>
+ *
 * <p>Multiple threads can invoke const methods on an RocksIterator without
 * external synchronization, but if any of the threads may call a
 * non-const method, all threads accessing the same RocksIterator must use
 * external synchronization.
@@ -43,7 +43,7 @@ public interface RocksIteratorInterface {
  /**
   * <p>Position at the first entry in the source whose key is that or
   * past target.</p>
-  * <p/>
+  *
   * <p>The iterator is valid after this call if the source contains
   * a key that comes at or past target.</p>
   *
@@ -55,7 +55,7 @@ public interface RocksIteratorInterface {
  /**
   * <p>Moves to the next entry in the source. After this call, Valid() is
   * true if the iterator was not positioned at the last entry in the source.</p>
-  * <p/>
+  *
   * <p>REQUIRES: {@link #isValid()}</p>
   */
  public void next();
@@ -63,13 +63,13 @@ public interface RocksIteratorInterface {
  /**
   * <p>Moves to the previous entry in the source. After this call, Valid() is
   * true if the iterator was not positioned at the first entry in source.</p>
-  * <p/>
+  *
   * <p>REQUIRES: {@link #isValid()}</p>
   */
  public void prev();
  /**
-  * If an error has occurred, return it. Else return an ok status.
   * If non-blocking IO is requested and this operation cannot be
   * satisfied without doing some IO, then this returns Status::Incomplete().</p>
                    * From c75c02e7a2dff2dedd72891a1652602ad5070e8f Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 15 Jan 2015 21:54:21 +0100 Subject: [PATCH 706/829] [RocksJava] WriteBatchWithIndexTest fix Previous to this commit identiy checks were performed. Now tests are performed using equals - method as intended. --- java/org/rocksdb/test/WriteBatchWithIndexTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/org/rocksdb/test/WriteBatchWithIndexTest.java b/java/org/rocksdb/test/WriteBatchWithIndexTest.java index de2b637ff..dac3f1110 100644 --- a/java/org/rocksdb/test/WriteBatchWithIndexTest.java +++ b/java/org/rocksdb/test/WriteBatchWithIndexTest.java @@ -217,19 +217,19 @@ public class WriteBatchWithIndexTest { it.seek(key); assertThat(it.isValid()).isTrue(); - assertThat(it.entry()).isEqualTo(expected[testOffset]); + assertThat(it.entry().equals(expected[testOffset])).isTrue(); } //forward iterative access int i = 0; for(it.seekToFirst(); it.isValid(); it.next()) { - assertThat(it.entry()).isEqualTo(expected[i++]); + assertThat(it.entry().equals(expected[i++])).isTrue(); } //reverse iterative access i = expected.length - 1; for(it.seekToLast(); it.isValid(); it.prev()) { - assertThat(it.entry()).isEqualTo(expected[i--]); + assertThat(it.entry().equals(expected[i--])).isTrue(); } } finally { From 155bec4cf22d20c41ce4be93e3a0080b4ef7c7a2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 15 Jan 2015 14:15:43 -0800 Subject: [PATCH 707/829] fallocate also tests FALLOC_FL_KEEP_SIZE --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c17cd3ead..5ad5552ce 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -184,7 +184,7 @@ else #include int main() { int fd = open("/dev/null", 0); - fallocate(fd, 0, 0, 1024); + fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); } EOF if [ "$?" = 0 ]; then From 3d628f8f226daec723bafa9cd276092cd634e216 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 16 Jan 2015 09:18:45 -0800 Subject: [PATCH 708/829] Update format_version comment Summary: We added a new format version. Reflect that in the comments. Test Plan: none Reviewers: sdong, rven, yhchiang, MarkCallaghan Reviewed By: MarkCallaghan Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31629 --- include/rocksdb/table.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index d4e0e156f..757edebe4 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -127,7 +127,7 @@ struct BlockBasedTableOptions { bool whole_key_filtering = true; // For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md - // We currently have two versions: + // We currently have three versions: // 0 -- This version is currently written out by all RocksDB's versions by // default. Can be read by really old RocksDB's. Doesn't support changing // checksum (default is CRC32). @@ -135,8 +135,12 @@ struct BlockBasedTableOptions { // checksum, like xxHash. It is written by RocksDB when // BlockBasedTableOptions::checksum is something other than kCRC32c. (version // 0 is silently upconverted) - // This only affects newly written tables. When reading exising tables, the - // information about version is read from the footer. + // 2 -- Can be read by RocksDB's versions since 3.10. 
Changes the way we + // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you + // don't plan to run RocksDB before version 3.10, you should probably use + // this. + // This option only affects newly written tables. When reading exising tables, + // the information about version is read from the footer. uint32_t format_version = 0; }; From ceaea2b72d388b8e711cd0b75e3ff42df4a1fafc Mon Sep 17 00:00:00 2001 From: Yoshinori Matsunobu Date: Thu, 15 Jan 2015 15:33:12 -0800 Subject: [PATCH 709/829] Adding prefix_extractor string config parameter Summary: This diff enables to configure prefix_extractor string parameter as a CF option. Test Plan: make all check, ./options_test Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31653 --- util/options_helper.cc | 10 ++++++++++ util/options_test.cc | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/util/options_helper.cc b/util/options_helper.cc index 4a169ce3f..efc028497 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -9,6 +9,7 @@ #include "rocksdb/cache.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/convenience.h" #include "util/options_helper.h" @@ -508,6 +509,15 @@ Status GetColumnFamilyOptionsFromMap( new_options->min_partial_merge_operands = ParseUint32(o.second); } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); + } else if (o.first == "prefix_extractor") { + const std::string kName = "fixed:"; + if (o.second.compare(0, kName.size(), kName) != 0) { + return Status::InvalidArgument("Invalid Prefix Extractor type: " + + o.second); + } + int prefix_length = ParseInt(trim(o.second.substr(kName.size()))); + new_options->prefix_extractor.reset( + NewFixedPrefixTransform(prefix_length)); } else { return Status::InvalidArgument("Unrecognized option: " + o.first); } diff --git a/util/options_test.cc b/util/options_test.cc index a9e609f4f..d89acfb7d 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -129,7 +129,8 @@ TEST(OptionsTest, GetOptionsFromMapTest) { {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, - {"min_partial_merge_operands", "31"} + {"min_partial_merge_operands", "31"}, + {"prefix_extractor", "fixed:31"} }; std::unordered_map db_options_map = { @@ -220,6 +221,9 @@ TEST(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); ASSERT_EQ(new_cf_opt.min_partial_merge_operands, 31U); + ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); + ASSERT_EQ(std::string(new_cf_opt.prefix_extractor->Name()), + "rocksdb.FixedPrefix.31"); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap( From 0ddf5f73ec6918adfa4b07940a59c3c1cc478ac7 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 16 Jan 2015 15:12:54 -0800 Subject: [PATCH 710/829] memenv: normalize file path Summary: Now using memenv, DB will not able to be reopened, since a "//" in the file name. Fix it by normalizing file path. Test Plan: Add a unit test that used to fail and now pass. 
Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D31767 --- helpers/memenv/memenv.cc | 57 ++++++++++++++++++++++++++--------- helpers/memenv/memenv_test.cc | 9 ++++++ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc index b6499f4e0..d13fa55eb 100644 --- a/helpers/memenv/memenv.cc +++ b/helpers/memenv/memenv.cc @@ -15,6 +15,26 @@ namespace rocksdb { namespace { +std::string NormalizeFileName(const std::string fname) { + if (fname.find("//") == std::string::npos) { + return fname; + } + std::string out_name = ""; + bool is_slash = false; + for (char c : fname) { + if (c == '/' && is_slash) { + continue; + } + out_name.append(1, c); + if (c == '/') { + is_slash = true; + } else { + is_slash = false; + } + } + return out_name; +} + class FileState { public: // FileStates are reference counted. The initial reference count is zero @@ -238,40 +258,43 @@ class InMemoryEnv : public EnvWrapper { virtual Status NewSequentialFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); if (file_map_.find(fname) == file_map_.end()) { *result = NULL; return Status::IOError(fname, "File not found"); } - result->reset(new SequentialFileImpl(file_map_[fname])); + result->reset(new SequentialFileImpl(file_map_[nfname])); return Status::OK(); } virtual Status NewRandomAccessFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + if (file_map_.find(nfname) == file_map_.end()) { *result = NULL; return Status::IOError(fname, "File not found"); } - result->reset(new RandomAccessFileImpl(file_map_[fname])); + result->reset(new RandomAccessFileImpl(file_map_[nfname])); return Status::OK(); } virtual Status NewWritableFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) != file_map_.end()) { - DeleteFileInternal(fname); + if (file_map_.find(nfname) != file_map_.end()) { + DeleteFileInternal(nfname); } FileState* file = new FileState(); file->Ref(); - file_map_[fname] = file; + file_map_[nfname] = file; result->reset(new WritableFileImpl(file)); return Status::OK(); @@ -284,8 +307,9 @@ class InMemoryEnv : public EnvWrapper { } virtual bool FileExists(const std::string& fname) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - return file_map_.find(fname) != file_map_.end(); + return file_map_.find(nfname) != file_map_.end(); } virtual Status GetChildren(const std::string& dir, @@ -315,12 +339,13 @@ class InMemoryEnv : public EnvWrapper { } virtual Status DeleteFile(const std::string& fname) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + if (file_map_.find(nfname) == file_map_.end()) { return Status::IOError(fname, "File not found"); } - DeleteFileInternal(fname); + DeleteFileInternal(nfname); return Status::OK(); } @@ -337,12 +362,14 @@ class InMemoryEnv : public EnvWrapper { } virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + + if 
(file_map_.find(nfname) == file_map_.end()) { return Status::IOError(fname, "File not found"); } - *file_size = file_map_[fname]->Size(); + *file_size = file_map_[nfname]->Size(); return Status::OK(); } @@ -352,14 +379,16 @@ class InMemoryEnv : public EnvWrapper { } virtual Status RenameFile(const std::string& src, const std::string& dest) { + std::string nsrc = NormalizeFileName(src); + std::string ndest = NormalizeFileName(dest); MutexLock lock(&mutex_); - if (file_map_.find(src) == file_map_.end()) { + if (file_map_.find(nsrc) == file_map_.end()) { return Status::IOError(src, "File not found"); } DeleteFileInternal(dest); - file_map_[dest] = file_map_[src]; - file_map_.erase(src); + file_map_[ndest] = file_map_[nsrc]; + file_map_.erase(nsrc); return Status::OK(); } diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc index ea3ed61a0..6154893f0 100644 --- a/helpers/memenv/memenv_test.cc +++ b/helpers/memenv/memenv_test.cc @@ -222,6 +222,15 @@ TEST(MemEnvTest, DBTest) { } delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; } } // namespace rocksdb From d68e83c356626664988ad28c3916c02c08ac2e78 Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 14 Jan 2015 21:36:43 +0100 Subject: [PATCH 711/829] [RocksJava] DirectSlice String termination fix DirectSlice fix for non terminated String copy. This lead sometimes to problems with DirectSliceTest. --- java/rocksjni/slice.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index e4b7cf03b..162e868d1 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -28,9 +28,11 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( JNIEnv* env, jobject jobj, jstring jstr) { const auto* str = env->GetStringUTFChars(jstr, 0); - const size_t len = strlen(str); + const size_t len = strlen(str) + 1; char* buf = new char[len]; - memcpy(buf, str, len); + memcpy(buf, str, len - 1); + buf[len-1]='\0'; + env->ReleaseStringUTFChars(jstr, str); const auto* slice = new rocksdb::Slice(buf); From ea25ff7158b26e2b0acb9f26be206b6b57af4abb Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 17 Jan 2015 01:22:29 +0100 Subject: [PATCH 712/829] [RocksJava] Integrated proposed simplificiation --- java/rocksjni/slice.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 162e868d1..811117397 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -28,11 +28,10 @@ void Java_org_rocksdb_AbstractSlice_createNewSliceFromString( JNIEnv* env, jobject jobj, jstring jstr) { const auto* str = env->GetStringUTFChars(jstr, 0); - const size_t len = strlen(str) + 1; - char* buf = new char[len]; - memcpy(buf, str, len - 1); - buf[len-1]='\0'; - + const size_t len = strlen(str); + char* buf = new char[len + 1]; + memcpy(buf, str, len); + buf[len] = 0; env->ReleaseStringUTFChars(jstr, str); const auto* slice = new rocksdb::Slice(buf); From ca47da9e6338f76b7fc1e1345d798cd2199487f2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 17 Nov 2014 19:22:44 +0100 Subject: [PATCH 713/829] [RocksJava] TTL-Support --- java/Makefile | 4 +- java/org/rocksdb/TtlDB.java | 75 ++++++++++++++++++++++++++++ java/org/rocksdb/test/TtlDBTest.java | 51 +++++++++++++++++++ java/rocksjni/ttl.cc | 42 ++++++++++++++++ 4 files changed, 171 
insertions(+), 1 deletion(-) create mode 100644 java/org/rocksdb/TtlDB.java create mode 100644 java/org/rocksdb/test/TtlDBTest.java create mode 100644 java/rocksjni/ttl.cc diff --git a/java/Makefile b/java/Makefile index c8f443f7b..32717ddd8 100644 --- a/java/Makefile +++ b/java/Makefile @@ -29,13 +29,14 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.SkipListMemTableConfig\ org.rocksdb.Slice\ org.rocksdb.Statistics\ + org.rocksdb.TtlDB\ org.rocksdb.VectorMemTableConfig\ org.rocksdb.StringAppendOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ org.rocksdb.test.WriteBatchInternal\ org.rocksdb.test.WriteBatchTest\ - org.rocksdb.WriteOptions\ + org.rocksdb.WriteOptions\ org.rocksdb.WriteBatchWithIndex\ org.rocksdb.WBWIRocksIterator @@ -79,6 +80,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.SizeUnitTest\ org.rocksdb.test.SliceTest\ org.rocksdb.test.SnapshotTest\ + org.rocksdb.test.TtlDBTest\ org.rocksdb.test.StatisticsCollectorTest\ org.rocksdb.test.WriteBatchHandlerTest\ org.rocksdb.test.WriteBatchTest\ diff --git a/java/org/rocksdb/TtlDB.java b/java/org/rocksdb/TtlDB.java new file mode 100644 index 000000000..dd6c8cfbc --- /dev/null +++ b/java/org/rocksdb/TtlDB.java @@ -0,0 +1,75 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.List; + + +public class TtlDB extends RocksDB { + + //static Status Open(const Options& options, const std::string& dbname, + // DBWithTTL** dbptr, int32_t ttl = 0, + // bool read_only = false); + public static TtlDB open(Options options, String db_path, int ttl, + boolean readOnly) throws RocksDBException { + TtlDB ttldb = new TtlDB(); + ttldb.open(options.nativeHandle_, db_path, ttl, readOnly); + + // Prevent the RocksDB object from attempting to delete + // the underly C++ DB object. + //ttldb.disOwnNativeHandle(); + return ttldb; + } + + //static Status Open(const DBOptions& db_options, const std::string& dbname, + // const std::vector& column_families, + // std::vector* handles, + // DBWithTTL** dbptr, std::vector ttls, + // bool read_only = false); + public static TtlDB open(DBOptions options, String db_path, + List columnFamilyDescriptors, + List columnFamilyHandles, + List ttlValues, boolean readOnly){ + + + return null; + } + + public ColumnFamilyHandle createColumnFamilyWithTtl( + ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) { + return null; + } + + /** + * Close the TtlDB instance and release resource. + * + * Internally, TtlDB owns the {@code rocksdb::DB} pointer to its associated + * {@link org.rocksdb.RocksDB}. The release of that RocksDB pointer is handled in the destructor + * of the c++ {@code rocksdb::TtlDB} and should be transparent to Java developers. + */ + @Override public synchronized void close() { + if (isInitialized()) { + super.close(); + } + } + + /** + * A protected construction that will be used in the static factory + * method {@link #open(DBOptions, String, java.util.List, java.util.List)} and + * {@link #open(DBOptions, String, java.util.List, java.util.List, java.util.List, boolean)}. 
+ */ + protected TtlDB() { + super(); + } + + @Override protected void finalize() throws Throwable { + close(); + super.finalize(); + } + + private native void open(long optionsHandle, String db_path, int ttl, + boolean readOnly) throws RocksDBException; +} diff --git a/java/org/rocksdb/test/TtlDBTest.java b/java/org/rocksdb/test/TtlDBTest.java new file mode 100644 index 000000000..1637558a5 --- /dev/null +++ b/java/org/rocksdb/test/TtlDBTest.java @@ -0,0 +1,51 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.Options; +import org.rocksdb.RocksDBException; +import org.rocksdb.TtlDB; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TtlDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void ttlDBOpen() throws RocksDBException, InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options().setCreateIfMissing(true); + ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), + 1, false); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). + isEqualTo("value".getBytes()); + Thread.sleep(1250); + ttlDB.compactRange(); + + assertThat(ttlDB.get("key".getBytes())).isNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc new file mode 100644 index 000000000..c8e7c44a5 --- /dev/null +++ b/java/rocksjni/ttl.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::TtlDB methods. +// from Java side. + +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_TtlDB.h" +#include "rocksjni/portal.h" +#include "rocksdb/utilities/db_ttl.h" + +/* + * Class: org_rocksdb_TtlDB + * Method: open + * Signature: (JJ)V + */ +void Java_org_rocksdb_TtlDB_open(JNIEnv* env, jobject jttldb, + jlong joptions_handle, jstring jdb_path, jint jttl, + jboolean jread_only) { + auto opt = reinterpret_cast(joptions_handle); + rocksdb::DBWithTTL* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, &db, + jttl, jread_only); + env->ReleaseStringUTFChars(jdb_path, db_path); + + // as TTLDB extends RocksDB on the java side, we can reuse + // the RocksDB portal here. 
+ if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jttldb, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} From 5ff8aec4db7534a2e6ab1118d824ce021b897fc0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 21 Nov 2014 23:38:17 +0100 Subject: [PATCH 714/829] [RocksJava] TTL Support --- java/org/rocksdb/TtlDB.java | 128 +++++++++++++++++++++++---- java/org/rocksdb/test/TtlDBTest.java | 76 ++++++++++++++-- java/rocksjni/ttl.cc | 33 +++++++ 3 files changed, 213 insertions(+), 24 deletions(-) diff --git a/java/org/rocksdb/TtlDB.java b/java/org/rocksdb/TtlDB.java index dd6c8cfbc..0dbac0a6e 100644 --- a/java/org/rocksdb/TtlDB.java +++ b/java/org/rocksdb/TtlDB.java @@ -7,20 +7,85 @@ package org.rocksdb; import java.util.List; - +/** + * Database with TTL support. + * + *

<p><strong>Use case</strong></p>
+ * <p>This API should be used to open the db when key-values inserted are
+ * meant to be removed from the db in a non-strict 'ttl' amount of time
+ * Therefore, this guarantees that key-values inserted will remain in the
+ * db for >= ttl amount of time and the db will make efforts to remove the
+ * key-values as soon as possible after ttl seconds of their insertion.
+ * </p>
+ *
+ * <p><strong>Behaviour</strong></p>
+ * <p>TTL is accepted in seconds
+ * (int32_t)Timestamp(creation) is suffixed to values in Put internally
+ * Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now)
+ * Get/Iterator may return expired entries(compaction not run on them yet)
+ * Different TTL may be used during different Opens
+ * </p>
+ *
+ * <p><strong>Example</strong></p>
+ * <ul>
+ * <li>Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2</li>
+ * <li>Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5</li>
+ * </ul>
+ *
+ * <p>
+ * read_only=true opens in the usual read-only mode. Compactions will not be
+ * triggered(neither manual nor automatic), so no expired entries removed
+ * </p>
+ *
+ * <p><strong>Constraints</strong></p>
+ * <p>Not specifying/passing or non-positive TTL behaves
+ * like TTL = infinity</p>
+ *
+ * <p><strong>!!!WARNING!!!</strong></p>
+ * <p>Calling DB::Open directly to re-open a db created by this API will get
+ * corrupt values(timestamp suffixed) and no ttl effect will be there
+ * during the second Open, so use this API consistently to open the db
+ * Be careful when passing ttl with a small positive value because the
+ * whole database may be deleted in a small amount of time.</p>
                    + */ public class TtlDB extends RocksDB { - //static Status Open(const Options& options, const std::string& dbname, - // DBWithTTL** dbptr, int32_t ttl = 0, - // bool read_only = false); + /** + *

<p>Opens a TtlDB.</p>
+ *
+ * <p>Database is opened in read-write mode without default TTL.</p>
                    + * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public static TtlDB open(Options options, String db_path) + throws RocksDBException { + return open(options, db_path, 0, false); + } + + /** + *

<p>Opens a TtlDB.</p>
                    + * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * @param ttl time to live for new entries. + * @param readOnly boolean value indicating if database if db is + * opened read-only. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ public static TtlDB open(Options options, String db_path, int ttl, boolean readOnly) throws RocksDBException { TtlDB ttldb = new TtlDB(); ttldb.open(options.nativeHandle_, db_path, ttl, readOnly); - - // Prevent the RocksDB object from attempting to delete - // the underly C++ DB object. - //ttldb.disOwnNativeHandle(); return ttldb; } @@ -38,17 +103,39 @@ public class TtlDB extends RocksDB { return null; } + /** + *

<p>Creates a new ttl based column family with a name defined
+ * in given ColumnFamilyDescriptor and allocates a
+ * ColumnFamilyHandle within an internal structure.</p>
+ *
+ * <p>The ColumnFamilyHandle is automatically disposed with DB
+ * disposal.</p>
                    + * + * @param columnFamilyDescriptor column family to be created. + * @param ttl TTL to set for this column family. + * + * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ public ColumnFamilyHandle createColumnFamilyWithTtl( - ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) { - return null; + ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) + throws RocksDBException { + assert(isInitialized()); + return new ColumnFamilyHandle(this, + createColumnFamilyWithTtl(nativeHandle_, + columnFamilyDescriptor, ttl)); } /** - * Close the TtlDB instance and release resource. + *

<p>Close the TtlDB instance and release resource.</p>
 *
- * Internally, TtlDB owns the {@code rocksdb::DB} pointer to its associated
- * {@link org.rocksdb.RocksDB}. The release of that RocksDB pointer is handled in the destructor
- * of the c++ {@code rocksdb::TtlDB} and should be transparent to Java developers.
+ * <p>Internally, TtlDB owns the {@code rocksdb::DB} pointer
+ * to its associated {@link org.rocksdb.RocksDB}. The release
+ * of that RocksDB pointer is handled in the destructor of the
+ * c++ {@code rocksdb::TtlDB} and should be transparent to
+ * Java developers.</p>
                    */ @Override public synchronized void close() { if (isInitialized()) { @@ -57,9 +144,13 @@ public class TtlDB extends RocksDB { } /** - * A protected construction that will be used in the static factory - * method {@link #open(DBOptions, String, java.util.List, java.util.List)} and - * {@link #open(DBOptions, String, java.util.List, java.util.List, java.util.List, boolean)}. + *

<p>A protected constructor that will be used in the static
+ * factory method
+ * {@link #open(DBOptions, String, java.util.List, java.util.List)}
+ * and
+ * {@link #open(DBOptions, String, java.util.List,
+ * java.util.List, java.util.List, boolean)}.
+ * </p>
                    */ protected TtlDB() { super(); @@ -72,4 +163,7 @@ public class TtlDB extends RocksDB { private native void open(long optionsHandle, String db_path, int ttl, boolean readOnly) throws RocksDBException; + private native long createColumnFamilyWithTtl(long handle, + ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) + throws RocksDBException; } diff --git a/java/org/rocksdb/test/TtlDBTest.java b/java/org/rocksdb/test/TtlDBTest.java index 1637558a5..9175226ce 100644 --- a/java/org/rocksdb/test/TtlDBTest.java +++ b/java/org/rocksdb/test/TtlDBTest.java @@ -9,9 +9,9 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.Options; -import org.rocksdb.RocksDBException; -import org.rocksdb.TtlDB; +import org.rocksdb.*; + +import java.util.concurrent.TimeUnit; import static org.assertj.core.api.Assertions.assertThat; @@ -25,19 +25,49 @@ public class TtlDBTest { public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void ttlDBOpen() throws RocksDBException, InterruptedException { + public void ttlDBOpen() throws RocksDBException, + InterruptedException { Options options = null; TtlDB ttlDB = null; try { - options = new Options().setCreateIfMissing(true); + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). + isEqualTo("value".getBytes()); + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void ttlDBOpenWithTtl() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), 1, false); ttlDB.put("key".getBytes(), "value".getBytes()); assertThat(ttlDB.get("key".getBytes())). isEqualTo("value".getBytes()); - Thread.sleep(1250); - ttlDB.compactRange(); + TimeUnit.SECONDS.sleep(2); + ttlDB.compactRange(); assertThat(ttlDB.get("key".getBytes())).isNull(); } finally { if (ttlDB != null) { @@ -48,4 +78,36 @@ public class TtlDBTest { } } } + + @Test + public void createTtlColumnFamily() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + ColumnFamilyHandle columnFamilyHandle = null; + try { + options = new Options().setCreateIfMissing(true); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + columnFamilyHandle = ttlDB.createColumnFamilyWithTtl( + new ColumnFamilyDescriptor("new_cf"), 1); + ttlDB.put(columnFamilyHandle, "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())). 
+ isEqualTo("value".getBytes()); + Thread.sleep(2500); + ttlDB.compactRange(columnFamilyHandle); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } } diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index c8e7c44a5..935fbb3b2 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -40,3 +40,36 @@ void Java_org_rocksdb_TtlDB_open(JNIEnv* env, jobject jttldb, } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } + +/* + * Class: org_rocksdb_TtlDB + * Method: createColumnFamilyWithTtl + * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;I)J; + */ +jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( + JNIEnv* env, jobject jobj, jlong jdb_handle, + jobject jcf_descriptor, jint jttl) { + rocksdb::ColumnFamilyHandle* handle; + auto db_handle = reinterpret_cast(jdb_handle); + + jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env)); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + const char* cfname = env->GetStringUTFChars(jstr, 0); + rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl( + *cfOptions, cfname, &handle, jttl); + env->ReleaseStringUTFChars(jstr, cfname); + + if (s.ok()) { + return reinterpret_cast(handle); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; +} From 859c54a03d65ca8a232aa5794b835eb368b569d3 Mon Sep 17 00:00:00 2001 From: fyrz Date: Tue, 13 Jan 2015 22:56:37 +0100 Subject: [PATCH 715/829] [RocksJava] TTL-Support Summary: TTLDB Support exposed in Java-API. It is now possible to open a datbase using the RocksDB time to live feature. Test Plan: make rocksdbjava make test mvn -f rocksjni.pom package @Adam please test mac osx compile Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba, adam Differential Revision: https://reviews.facebook.net/D31449 --- java/org/rocksdb/TtlDB.java | 50 +++-- java/org/rocksdb/test/TtlDBTest.java | 282 ++++++++++++++++----------- java/rocksjni/ttl.cc | 121 +++++++++++- 3 files changed, 319 insertions(+), 134 deletions(-) diff --git a/java/org/rocksdb/TtlDB.java b/java/org/rocksdb/TtlDB.java index 0dbac0a6e..81d93692b 100644 --- a/java/org/rocksdb/TtlDB.java +++ b/java/org/rocksdb/TtlDB.java @@ -89,18 +89,40 @@ public class TtlDB extends RocksDB { return ttldb; } - //static Status Open(const DBOptions& db_options, const std::string& dbname, - // const std::vector& column_families, - // std::vector* handles, - // DBWithTTL** dbptr, std::vector ttls, - // bool read_only = false); + /** + *

<p>Opens a TtlDB.</p>
                    + * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @param ttlValues time to live values per column family handle + * @param readOnly boolean value indicating if database if db is + * opened read-only. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + * @throws java.lang.IllegalArgumentException when there is not a ttl value + * per given column family handle. + */ public static TtlDB open(DBOptions options, String db_path, List columnFamilyDescriptors, List columnFamilyHandles, - List ttlValues, boolean readOnly){ - - - return null; + List ttlValues, boolean readOnly) throws RocksDBException { + if (columnFamilyDescriptors.size() != ttlValues.size()) { + throw new IllegalArgumentException("There must be a ttl value per column" + + "family handle."); + } + TtlDB ttlDB = new TtlDB(); + List cfReferences = ttlDB.open(options.nativeHandle_, db_path, columnFamilyDescriptors, + columnFamilyDescriptors.size(), ttlValues, readOnly); + for (int i=0; iA protected constructor that will be used in the static * factory method - * {@link #open(DBOptions, String, java.util.List, java.util.List)} + * {@link #open(Options, String, int, boolean)} * and - * {@link #open(DBOptions, String, java.util.List, - * java.util.List, java.util.List, boolean)}. + * {@link #open(DBOptions, String, java.util.List, java.util.List, + * java.util.List, boolean)}. *

                    */ protected TtlDB() { @@ -163,6 +185,10 @@ public class TtlDB extends RocksDB { private native void open(long optionsHandle, String db_path, int ttl, boolean readOnly) throws RocksDBException; + private native List open(long optionsHandle, String db_path, + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength, List ttlValues, + boolean readOnly) throws RocksDBException; private native long createColumnFamilyWithTtl(long handle, ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) throws RocksDBException; diff --git a/java/org/rocksdb/test/TtlDBTest.java b/java/org/rocksdb/test/TtlDBTest.java index 9175226ce..56f7ebc1a 100644 --- a/java/org/rocksdb/test/TtlDBTest.java +++ b/java/org/rocksdb/test/TtlDBTest.java @@ -1,113 +1,169 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb.test; - -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; - -import java.util.concurrent.TimeUnit; - -import static org.assertj.core.api.Assertions.assertThat; - -public class TtlDBTest { - - @ClassRule - public static final RocksMemoryResource rocksMemoryResource = - new RocksMemoryResource(); - - @Rule - public TemporaryFolder dbFolder = new TemporaryFolder(); - - @Test - public void ttlDBOpen() throws RocksDBException, - InterruptedException { - Options options = null; - TtlDB ttlDB = null; - try { - options = new Options(). - setCreateIfMissing(true). - setMaxGrandparentOverlapFactor(0). - setMaxMemCompactionLevel(0); - ttlDB = TtlDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - ttlDB.put("key".getBytes(), "value".getBytes()); - assertThat(ttlDB.get("key".getBytes())). - isEqualTo("value".getBytes()); - assertThat(ttlDB.get("key".getBytes())).isNotNull(); - } finally { - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } - } - } - - @Test - public void ttlDBOpenWithTtl() throws RocksDBException, - InterruptedException { - Options options = null; - TtlDB ttlDB = null; - try { - options = new Options(). - setCreateIfMissing(true). - setMaxGrandparentOverlapFactor(0). - setMaxMemCompactionLevel(0); - ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), - 1, false); - ttlDB.put("key".getBytes(), "value".getBytes()); - assertThat(ttlDB.get("key".getBytes())). - isEqualTo("value".getBytes()); - TimeUnit.SECONDS.sleep(2); - - ttlDB.compactRange(); - assertThat(ttlDB.get("key".getBytes())).isNull(); - } finally { - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } - } - } - - @Test - public void createTtlColumnFamily() throws RocksDBException, - InterruptedException { - Options options = null; - TtlDB ttlDB = null; - ColumnFamilyHandle columnFamilyHandle = null; - try { - options = new Options().setCreateIfMissing(true); - ttlDB = TtlDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - columnFamilyHandle = ttlDB.createColumnFamilyWithTtl( - new ColumnFamilyDescriptor("new_cf"), 1); - ttlDB.put(columnFamilyHandle, "key".getBytes(), - "value".getBytes()); - assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())). 
- isEqualTo("value".getBytes()); - Thread.sleep(2500); - ttlDB.compactRange(columnFamilyHandle); - assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); - } finally { - if (columnFamilyHandle != null) { - columnFamilyHandle.dispose(); - } - if (ttlDB != null) { - ttlDB.close(); - } - if (options != null) { - options.dispose(); - } - } - } -} +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TtlDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void ttlDBOpen() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). + isEqualTo("value".getBytes()); + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void ttlDBOpenWithTtl() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); + ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), + 1, false); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). + isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + + ttlDB.compactRange(); + assertThat(ttlDB.get("key".getBytes())).isNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedException { + DBOptions dbOptions = null; + TtlDB ttlDB = null; + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + List ttlValues = new ArrayList<>(); + // Default column family with infinite lifetime + ttlValues.add(0); + // new column family with 1 second ttl + ttlValues.add(1); + + try { + dbOptions = new DBOptions(). + setCreateMissingColumnFamilies(true). + setCreateIfMissing(true); + ttlDB = TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList, ttlValues, false); + + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). 
+ isEqualTo("value".getBytes()); + ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + + ttlDB.compactRange(); + ttlDB.compactRange(columnFamilyHandleList.get(1)); + + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isNull(); + + + } finally { + for (ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + if (ttlDB != null) { + ttlDB.close(); + } + if (dbOptions != null) { + dbOptions.dispose(); + } + } + } + + @Test + public void createTtlColumnFamily() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + ColumnFamilyHandle columnFamilyHandle = null; + try { + options = new Options().setCreateIfMissing(true); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + columnFamilyHandle = ttlDB.createColumnFamilyWithTtl( + new ColumnFamilyDescriptor("new_cf".getBytes()), 1); + ttlDB.put(columnFamilyHandle, "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())). + isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + ttlDB.compactRange(columnFamilyHandle); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 935fbb3b2..474539688 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -7,22 +7,22 @@ // calling c++ rocksdb::TtlDB methods. // from Java side. 
+#include #include #include -#include #include #include #include "include/org_rocksdb_TtlDB.h" -#include "rocksjni/portal.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_TtlDB * Method: open - * Signature: (JJ)V + * Signature: (JLjava/lang/String;IZ)V */ -void Java_org_rocksdb_TtlDB_open(JNIEnv* env, jobject jttldb, +void Java_org_rocksdb_TtlDB_open__JLjava_lang_String_2IZ(JNIEnv* env, jobject jttldb, jlong joptions_handle, jstring jdb_path, jint jttl, jboolean jread_only) { auto opt = reinterpret_cast(joptions_handle); @@ -41,6 +41,107 @@ void Java_org_rocksdb_TtlDB_open(JNIEnv* env, jobject jttldb, rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_TtlDB + * Method: open + * Signature: (JLjava/lang/String;Ljava/util/List;ILjava/util/List;Z)Ljava/util/List; + */ +jobject + Java_org_rocksdb_TtlDB_open__JLjava_lang_String_2Ljava_util_List_2ILjava_util_List_2Z( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list, + jboolean jread_only) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DBWithTTL* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector ttl_values; + std::vector handles; + // get iterator for ColumnFamilyDescriptors + jobject iteratorObj = env->CallObjectMethod( + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over ColumnFamilyDescriptors + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + // free allocated cfnames after call to open + cfnames_to_free.push_back(cfname); + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + reinterpret_cast(cfname), + *cfOptions)); + } + // get iterator for TTL values + iteratorObj = env->CallObjectMethod( + jttl_list, rocksdb::ListJni::getIteratorMethod(env)); + // iterate over TTL values + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get TTL object + jobject jttl_object = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get Integer value + jclass jIntClazz = env->FindClass("java/lang/Integer"); + jmethodID getVal = env->GetMethodID(jIntClazz, "intValue", "()I"); + ttl_values.push_back(env->CallIntMethod(jttl_object, getVal)); + } + rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, column_families, + &handles, &db, ttl_values, jread_only); + + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + } + + // check if 
open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, + reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + /* * Class: org_rocksdb_TtlDB * Method: createColumnFamilyWithTtl @@ -52,9 +153,11 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( rocksdb::ColumnFamilyHandle* handle; auto db_handle = reinterpret_cast(jdb_handle); - jstring jstr = (jstring) env->CallObjectMethod(jcf_descriptor, + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( - env)); + env))); // get CF Options jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( @@ -62,10 +165,10 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( rocksdb::ColumnFamilyOptions* cfOptions = rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); - const char* cfname = env->GetStringUTFChars(jstr, 0); + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl( - *cfOptions, cfname, &handle, jttl); - env->ReleaseStringUTFChars(jstr, cfname); + *cfOptions, reinterpret_cast(cfname), &handle, jttl); + env->ReleaseByteArrayElements(byteArray, cfname, 0); if (s.ok()) { return reinterpret_cast(handle); From e828567541945af1c5be057d87a77247b4ede473 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 17 Jan 2015 23:28:54 +0100 Subject: [PATCH 716/829] [RocksJava] Integrated changes from D31449 --- java/org/rocksdb/TtlDB.java | 7 ++++--- java/rocksjni/ttl.cc | 13 +++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/java/org/rocksdb/TtlDB.java b/java/org/rocksdb/TtlDB.java index 81d93692b..a78bb9435 100644 --- a/java/org/rocksdb/TtlDB.java +++ b/java/org/rocksdb/TtlDB.java @@ -117,8 +117,9 @@ public class TtlDB extends RocksDB { "family handle."); } TtlDB ttlDB = new TtlDB(); - List cfReferences = ttlDB.open(options.nativeHandle_, db_path, columnFamilyDescriptors, - columnFamilyDescriptors.size(), ttlValues, readOnly); + List cfReferences = ttlDB.openCF(options.nativeHandle_, db_path, + columnFamilyDescriptors, columnFamilyDescriptors.size(), + ttlValues, readOnly); for (int i=0; i open(long optionsHandle, String db_path, + private native List openCF(long optionsHandle, String db_path, List columnFamilyDescriptors, int columnFamilyDescriptorsLength, List ttlValues, boolean readOnly) throws RocksDBException; diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 474539688..a46a233f9 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -22,9 +22,9 @@ * Method: open * Signature: (JLjava/lang/String;IZ)V */ -void Java_org_rocksdb_TtlDB_open__JLjava_lang_String_2IZ(JNIEnv* env, jobject 
jttldb, - jlong joptions_handle, jstring jdb_path, jint jttl, - jboolean jread_only) { +void Java_org_rocksdb_TtlDB_open(JNIEnv* env, + jobject jttldb, jlong joptions_handle, jstring jdb_path, + jint jttl, jboolean jread_only) { auto opt = reinterpret_cast(joptions_handle); rocksdb::DBWithTTL* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); @@ -43,11 +43,12 @@ void Java_org_rocksdb_TtlDB_open__JLjava_lang_String_2IZ(JNIEnv* env, jobject jt /* * Class: org_rocksdb_TtlDB - * Method: open - * Signature: (JLjava/lang/String;Ljava/util/List;ILjava/util/List;Z)Ljava/util/List; + * Method: openCF + * Signature: (JLjava/lang/String;Ljava/util/List; + * ILjava/util/List;Z)Ljava/util/List; */ jobject - Java_org_rocksdb_TtlDB_open__JLjava_lang_String_2Ljava_util_List_2ILjava_util_List_2Z( + Java_org_rocksdb_TtlDB_openCF( JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list, jboolean jread_only) { From 4ffe0be41452b27191ea7cb04e090979a014fcdf Mon Sep 17 00:00:00 2001 From: fyrz Date: Mon, 19 Jan 2015 22:19:58 +0100 Subject: [PATCH 717/829] [RocksJava] Integrated changes for D31449 --- java/rocksjni/ttl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index a46a233f9..4164a0c4b 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -25,7 +25,7 @@ void Java_org_rocksdb_TtlDB_open(JNIEnv* env, jobject jttldb, jlong joptions_handle, jstring jdb_path, jint jttl, jboolean jread_only) { - auto opt = reinterpret_cast(joptions_handle); + auto* opt = reinterpret_cast(joptions_handle); rocksdb::DBWithTTL* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, &db, @@ -52,7 +52,7 @@ jobject JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list, jboolean jread_only) { - auto opt = reinterpret_cast(jopt_handle); + auto* opt = reinterpret_cast(jopt_handle); rocksdb::DBWithTTL* db = nullptr; const char* db_path = env->GetStringUTFChars(jdb_path, 0); @@ -152,7 +152,7 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( JNIEnv* env, jobject jobj, jlong jdb_handle, jobject jcf_descriptor, jint jttl) { rocksdb::ColumnFamilyHandle* handle; - auto db_handle = reinterpret_cast(jdb_handle); + auto* db_handle = reinterpret_cast(jdb_handle); // get ColumnFamilyName jbyteArray byteArray = static_cast(env->CallObjectMethod( From 206237d12126ced35688e5fa75ce1c0d059bec5b Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 16 Jan 2015 15:44:55 -0800 Subject: [PATCH 718/829] DBImpl::CheckConsistency() shouldn't create path name with double "/" Summary: GetLiveFilesMetaData() already adds a leading "/" in file name. No need to add one extra "/" in DBImpl::CheckConsistency() Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D31779 --- db/db_impl.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index fa1c87aee..b50ea9561 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3530,7 +3530,8 @@ Status DBImpl::CheckConsistency() { std::string corruption_messages; for (const auto& md : metadata) { - std::string file_path = md.db_path + "/" + md.name; + // md.name has a leading "/". 
+ std::string file_path = md.db_path + md.name; uint64_t fsize = 0; Status s = env_->GetFileSize(file_path, &fsize); From 423dee8418d2ca6b091805c39d11b3f4e4492a9f Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 21 Jan 2015 18:18:15 -0800 Subject: [PATCH 719/829] Abort db_bench if Get() returns error Summary: I saw this when running readrandom benchmark with corrupted database -- benchmark worked! If a Get() returns corruption we should probably abort. Test Plan: compiles Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31701 --- db/db_bench.cc | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index d289b8f4c..2b1f9cedd 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2392,8 +2392,13 @@ class Benchmark { int64_t key_rand = thread->rand.Next() & (pot - 1); GenerateKeyFromInt(key_rand, FLAGS_num, &key); ++read; - if (db->Get(options, key, &value).ok()) { + auto status = db->Get(options, key, &value); + if (status.ok()) { ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, + "Get returned an error: %s\n" status.ToString().c_str()); + abort(); } if (key_rand >= FLAGS_num) { ++nonexist; @@ -2440,6 +2445,9 @@ class Benchmark { } if (s.ok()) { found++; + } else if (!s.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n" s.ToString().c_str()); + abort(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); } @@ -2481,6 +2489,10 @@ class Benchmark { for (int64_t i = 0; i < entries_per_batch_; ++i) { if (statuses[i].ok()) { ++found; + } else if (!statuses[i].IsNotFound()) { + fprintf(stderr, "MultiGet returned an error: %s\n", + statuses[i].ToString().c_str()); + abort(); } } thread->stats.FinishedOps(nullptr, db, entries_per_batch_); @@ -2920,8 +2932,13 @@ class Benchmark { DB* db = SelectDB(thread); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); - if (db->Get(options, key, &value).ok()) { - found++; + auto status = db->Get(options, key, &value); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, + "Get returned an error: %s\n" status.ToString().c_str()); + abort(); } Status s = db->Put(write_options_, key, gen.Generate(value_size_)); @@ -2954,9 +2971,13 @@ class Benchmark { DB* db = SelectDB(thread); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); - // Get the existing value - if (db->Get(options, key, &value).ok()) { - found++; + auto status = db->Get(options, key, &value); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, + "Get returned an error: %s\n" status.ToString().c_str()); + abort(); } else { // If not existing, then just assume an empty string of data value.clear(); From ae82849bc9320347cced1c050089eeab8c1fc3e4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 21 Jan 2015 18:23:12 -0800 Subject: [PATCH 720/829] Fix build failure --- db/db_bench.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 2b1f9cedd..34cf6e025 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2396,8 +2396,8 @@ class Benchmark { if (status.ok()) { ++found; } else if (!status.IsNotFound()) { - fprintf(stderr, - "Get returned an error: %s\n" status.ToString().c_str()); + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); abort(); } if (key_rand >= FLAGS_num) { @@ -2446,7 +2446,7 @@ class 
Benchmark { if (s.ok()) { found++; } else if (!s.IsNotFound()) { - fprintf(stderr, "Get returned an error: %s\n" s.ToString().c_str()); + fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str()); abort(); } thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); @@ -2936,8 +2936,8 @@ class Benchmark { if (status.ok()) { ++found; } else if (!status.IsNotFound()) { - fprintf(stderr, - "Get returned an error: %s\n" status.ToString().c_str()); + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); abort(); } @@ -2975,8 +2975,8 @@ class Benchmark { if (status.ok()) { ++found; } else if (!status.IsNotFound()) { - fprintf(stderr, - "Get returned an error: %s\n" status.ToString().c_str()); + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); abort(); } else { // If not existing, then just assume an empty string of data From 96264784d94e7d5a2f46256656ae2a40e55b776d Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 15 Jan 2015 00:26:32 +0100 Subject: [PATCH 721/829] [RocksJava] ColumnFamily name JNI correction Previous to this commit there was a problem with unterminated String usage as jByteArrays are not zero terminated. --- java/rocksjni/rocksjni.cc | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 54eef7f53..3d44b8e9c 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -75,6 +75,7 @@ jobject const char* db_path = env->GetStringUTFChars(jdb_path, 0); std::vector cfnames_to_free; + std::vector c_cfnames_to_free; std::vector jcfnames_for_free; std::vector column_families; @@ -102,13 +103,17 @@ jobject rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len-1]='\0'; // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); jcfnames_for_free.push_back(byteArray); column_families.push_back(rocksdb::ColumnFamilyDescriptor( - reinterpret_cast(cfname), - *cfOptions)); + c_cfname, *cfOptions)); } rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, @@ -119,6 +124,8 @@ jobject i != cfnames_to_free.size(); i++) { // free cfnames env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete c_cfnames_to_free[i]; } // check if open operation was successful @@ -160,6 +167,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( const char* db_path = env->GetStringUTFChars(jdb_path, 0); std::vector cfnames_to_free; + std::vector c_cfnames_to_free; std::vector jcfnames_for_free; std::vector column_families; @@ -187,13 +195,17 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len-1]='\0'; // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); jcfnames_for_free.push_back(byteArray); column_families.push_back(rocksdb::ColumnFamilyDescriptor( - reinterpret_cast(cfname), - *cfOptions)); + c_cfname, *cfOptions)); } rocksdb::Status s = 
rocksdb::DB::Open(*opt, db_path, column_families, @@ -204,6 +216,8 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( i != cfnames_to_free.size(); i++) { // free cfnames env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete c_cfnames_to_free[i]; } // check if open operation was successful @@ -1231,9 +1245,15 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len-1]='\0'; + rocksdb::Status s = db_handle->CreateColumnFamily( - *cfOptions, reinterpret_cast(cfname), &handle); + *cfOptions, c_cfname, &handle); env->ReleaseByteArrayElements(byteArray, cfname, 0); + delete c_cfname; if (s.ok()) { return reinterpret_cast(handle); From e204a5a16c9fec2cc8704be67d81ae219cf958f2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 17 Jan 2015 23:19:27 +0100 Subject: [PATCH 722/829] [RocksJava] ColumnFamily name JNI correction Summary: Previous to this commit there was a problem with unterminated String usage as jByteArrays are not zero terminated. Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom package Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D31809 --- java/rocksjni/rocksjni.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 3d44b8e9c..522e22e58 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -75,6 +75,7 @@ jobject const char* db_path = env->GetStringUTFChars(jdb_path, 0); std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. std::vector c_cfnames_to_free; std::vector jcfnames_for_free; @@ -106,7 +107,7 @@ jobject const int len = env->GetArrayLength(byteArray) + 1; char* c_cfname = new char[len]; memcpy(c_cfname, cfname, len - 1); - c_cfname[len-1]='\0'; + c_cfname[len - 1] = 0; // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); @@ -167,6 +168,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( const char* db_path = env->GetStringUTFChars(jdb_path, 0); std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. std::vector c_cfnames_to_free; std::vector jcfnames_for_free; @@ -198,7 +200,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( const int len = env->GetArrayLength(byteArray) + 1; char* c_cfname = new char[len]; memcpy(c_cfname, cfname, len - 1); - c_cfname[len-1]='\0'; + c_cfname[len - 1] = 0; // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); @@ -1248,7 +1250,7 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( const int len = env->GetArrayLength(byteArray) + 1; char* c_cfname = new char[len]; memcpy(c_cfname, cfname, len - 1); - c_cfname[len-1]='\0'; + c_cfname[len - 1] = 0; rocksdb::Status s = db_handle->CreateColumnFamily( *cfOptions, c_cfname, &handle); From 4e48753b735692071beaae3cfea300c78a80ed73 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 22 Jan 2015 11:43:38 -0800 Subject: [PATCH 723/829] Sync manifest file when initializing it Summary: Now we don't sync manifest file when initializing it, so DB cannot be safely reopened before the first mem table flush. Fix it by syncing it. 
This fixes fault_injection_test. Test Plan: make all check Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32001 --- db/db_impl.cc | 3 +++ db/db_test.cc | 1 + db/fault_injection_test.cc | 3 --- db/filename.cc | 13 +++++++++++++ db/filename.h | 5 +++++ db/version_set.cc | 12 ++---------- 6 files changed, 24 insertions(+), 13 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index b50ea9561..34c3077fd 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -348,6 +348,9 @@ Status DBImpl::NewDB() { std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + if (s.ok()) { + s = SyncManifest(env_, &db_options_, log.file()); + } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. diff --git a/db/db_test.cc b/db/db_test.cc index 4d3ebb51f..1d6962b58 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9274,6 +9274,7 @@ TEST(DBTest, WriteSingleThreadEntry) { } TEST(DBTest, DisableDataSyncTest) { + env_->sync_counter_.store(0); // iter 0 -- no sync // iter 1 -- sync for (int iter = 0; iter < 2; ++iter) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 0ef0be318..6861f1525 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -511,8 +511,6 @@ TEST(FaultInjectionTest, FaultTest) { int num_pre_sync = rnd.Uniform(kMaxNumValues); int num_post_sync = rnd.Uniform(kMaxNumValues); - // TODO(t6007549) Figure out why this fails and then re-enable the test. -#if 0 PartialCompactTestPreFault(num_pre_sync, num_post_sync); PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, num_pre_sync, @@ -520,7 +518,6 @@ TEST(FaultInjectionTest, FaultTest) { NoWriteTestPreFault(); NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA); -#endif PartialCompactTestPreFault(num_pre_sync, num_post_sync); // No new files created so we expect all values since no files will be diff --git a/db/filename.cc b/db/filename.cc index e5d97bdf2..160005dda 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -19,6 +19,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" #include "util/logging.h" +#include "util/stop_watch.h" namespace rocksdb { @@ -329,4 +330,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname) { return s; } +Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) { + if (db_options->disableDataSync) { + return Status::OK(); + } else if (db_options->use_fsync) { + StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Fsync(); + } else { + StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Sync(); + } +} + } // namespace rocksdb diff --git a/db/filename.h b/db/filename.h index fda873676..33f5ace20 100644 --- a/db/filename.h +++ b/db/filename.h @@ -25,6 +25,7 @@ namespace rocksdb { class Env; class Directory; +class WritableFile; enum FileType { kLogFile, @@ -137,4 +138,8 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname, // Make the IDENTITY file for the db extern Status SetIdentityFile(Env* env, const std::string& dbname); +// Sync manifest file `file`. 
+extern Status SyncManifest(Env* env, const DBOptions* db_options, + WritableFile* file); + } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index c5956a534..a0af2decc 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1691,16 +1691,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } } - if (s.ok() && db_options_->disableDataSync == false) { - if (db_options_->use_fsync) { - StopWatch sw(env_, db_options_->statistics.get(), - MANIFEST_FILE_SYNC_MICROS); - s = descriptor_log_->file()->Fsync(); - } else { - StopWatch sw(env_, db_options_->statistics.get(), - MANIFEST_FILE_SYNC_MICROS); - s = descriptor_log_->file()->Sync(); - } + if (s.ok()) { + s = SyncManifest(env_, db_options_, descriptor_log_->file()); } if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, From 2efe22849999728d4118c28451931b76a8091d3d Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 22 Jan 2015 23:37:45 +0100 Subject: [PATCH 724/829] [RocksJava] Incorporated changes for D31809 --- java/rocksjni/rocksjni.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 522e22e58..dee6aaa14 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -126,7 +126,7 @@ jobject // free cfnames env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); // free c_cfnames - delete c_cfnames_to_free[i]; + delete[] c_cfnames_to_free[i]; } // check if open operation was successful @@ -219,7 +219,7 @@ jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( // free cfnames env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); // free c_cfnames - delete c_cfnames_to_free[i]; + delete[] c_cfnames_to_free[i]; } // check if open operation was successful @@ -1255,7 +1255,7 @@ jlong Java_org_rocksdb_RocksDB_createColumnFamily( rocksdb::Status s = db_handle->CreateColumnFamily( *cfOptions, c_cfname, &handle); env->ReleaseByteArrayElements(byteArray, cfname, 0); - delete c_cfname; + delete[] c_cfname; if (s.ok()) { return reinterpret_cast(handle); From 908258a4f2361d2c01cecbe75dfc4dcf7f9d8857 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 16 Jan 2015 23:00:30 +0100 Subject: [PATCH 725/829] [RocksJava] BlockBasedTableConfig 3.10 - Added support for format version in BlockBasedTableConfig --- java/org/rocksdb/BlockBasedTableConfig.java | 47 ++++++++++++++++++- .../test/BlockBasedTableConfigTest.java | 21 +++++++++ java/rocksjni/table.cc | 5 +- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index 76e930204..0798be2b8 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -26,6 +26,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { blockCacheCompressedNumShardBits_ = 0; checksumType_ = ChecksumType.kCRC32c; indexType_ = IndexType.kBinarySearch; + formatVersion_ = 0; } /** @@ -335,6 +336,46 @@ public class BlockBasedTableConfig extends TableFormatConfig { return indexType_; } + /** + *

+   * <p>For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md
+   * We currently have three versions:</p>
+   *
+   * <ul>
+   * <li><b>0</b> - This version is currently written
+   * out by all RocksDB's versions by default. Can be read by really old
+   * RocksDB's. Doesn't support changing checksum (default is CRC32).</li>
+   * <li><b>1</b> - Can be read by RocksDB's versions since 3.0.
+   * Supports non-default checksum, like xxHash. It is written by RocksDB when
+   * BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+   * 0 is silently upconverted)</li>
+   * <li><b>2</b> - Can be read by RocksDB's versions since 3.10.
+   * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib
+   * compression. If you don't plan to run RocksDB before version 3.10,
+   * you should probably use this.</li>
+   * </ul>
+   *
+   * <p>This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</p>
                    + * + * @param formatVersion integer representing the version to be used. + * @return the reference to the current option. + */ + public BlockBasedTableConfig setFormatVersion(int formatVersion) { + assert(formatVersion>=0 && formatVersion <=2); + formatVersion_ = formatVersion; + return this; + } + + /** + * + * @return the currently configured format version. + * See also: {@link #setFormatVersion(int)}. + */ + public int formatVersion() { + return formatVersion_; + } + + + @Override protected long newTableFactoryHandle() { long filterHandle = 0; if (filter_ != null) { @@ -347,7 +388,8 @@ public class BlockBasedTableConfig extends TableFormatConfig { filterHandle, cacheIndexAndFilterBlocks_, hashIndexAllowCollision_, blockCacheCompressedSize_, blockCacheCompressedNumShardBits_, - checksumType_.getValue(), indexType_.getValue()); + checksumType_.getValue(), indexType_.getValue(), + formatVersion_); } private native long newTableFactoryHandle( @@ -356,7 +398,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { boolean wholeKeyFiltering, long filterPolicyHandle, boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, long blockCacheCompressedSize, int blockCacheCompressedNumShardBits, - byte checkSumType, byte indexType); + byte checkSumType, byte indexType, int formatVersion); private boolean cacheIndexAndFilterBlocks_; private IndexType indexType_; @@ -372,4 +414,5 @@ public class BlockBasedTableConfig extends TableFormatConfig { private int blockRestartInterval_; private Filter filter_; private boolean wholeKeyFiltering_; + private int formatVersion_; } diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/org/rocksdb/test/BlockBasedTableConfigTest.java index 5e0b96f29..1172effc8 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/org/rocksdb/test/BlockBasedTableConfigTest.java @@ -162,4 +162,25 @@ public class BlockBasedTableConfigTest { } } } + + @Test + public void blockBasedTableFormatVersion() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + for (int version=0; version<=2; version++) { + config.setFormatVersion(version); + assertThat(config.formatVersion()).isEqualTo(version); + } + } + + @Test(expected = AssertionError.class) + public void blockBasedTableFormatVersionFailNegative() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + config.setFormatVersion(-1); + } + + @Test(expected = AssertionError.class) + public void blockBasedTableFormatVersionFailIllegalVersion() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + config.setFormatVersion(3); + } } diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1b576a754..e78e7e0d7 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -38,7 +38,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZJIBB)J + * Signature: (ZJIJIIZIZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, @@ -47,7 +47,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, - jbyte jindex_type) { + jbyte jindex_type, jint jformat_version) { rocksdb::BlockBasedTableOptions 
options; options.no_block_cache = no_block_cache; @@ -83,6 +83,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.checksum = static_cast(jchecksum_type); options.index_type = static_cast< rocksdb::BlockBasedTableOptions::IndexType>(jindex_type); + options.format_version = jformat_version; return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } From dd53428f8bcec60e34d762841ceb1d322a4cc33d Mon Sep 17 00:00:00 2001 From: fyrz Date: Thu, 22 Jan 2015 23:47:31 +0100 Subject: [PATCH 726/829] Incorporated review comments - added spaces between operators - removed obsolete text in JavaDoc --- java/org/rocksdb/BlockBasedTableConfig.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java index 0798be2b8..302fc8a0b 100644 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/org/rocksdb/BlockBasedTableConfig.java @@ -337,8 +337,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { } /** - *

-   * <p>For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md
-   * We currently have three versions:</p>
+   * <p>We currently have three versions:</p>
    *
    * <ul>
                    • 0 - This version is currently written @@ -360,7 +359,7 @@ public class BlockBasedTableConfig extends TableFormatConfig { * @return the reference to the current option. */ public BlockBasedTableConfig setFormatVersion(int formatVersion) { - assert(formatVersion>=0 && formatVersion <=2); + assert(formatVersion >= 0 && formatVersion <= 2); formatVersion_ = formatVersion; return this; } From e5df90f5d06b891ead0adc1205c5d3b6e2400129 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 22 Jan 2015 14:59:36 -0800 Subject: [PATCH 727/829] Fix comment (minor) --- include/rocksdb/table.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 757edebe4..b67eeffef 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -126,7 +126,6 @@ struct BlockBasedTableOptions { // This must generally be true for gets to be efficient. bool whole_key_filtering = true; - // For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md // We currently have three versions: // 0 -- This version is currently written out by all RocksDB's versions by // default. Can be read by really old RocksDB's. Doesn't support changing From 46a7048dcdd1ccc29058d1ed1f16dd19fb532721 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 22 Jan 2015 15:42:53 -0800 Subject: [PATCH 728/829] Reduce false alarm in ThreadStatusMultipleCompaction test --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 1d6962b58..c734351b5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -9607,9 +9607,9 @@ TEST(DBTest, ThreadStatusMultipleCompaction) { } if (options.enable_thread_tracking) { - // Expect rocksdb to at least utilize 80% of the compaction threads. + // Expect rocksdb to at least utilize 60% of the compaction threads. ASSERT_GE(1.0 * max_compaction_count, - 0.8 * options.max_background_compactions); + 0.6 * options.max_background_compactions); } else { // If thread tracking is not enabled, compaction count should be 0. ASSERT_EQ(max_compaction_count, 0); From cd4c0719737046a7da6cc281b754a748cf9d3f88 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 22 Jan 2015 15:46:56 -0800 Subject: [PATCH 729/829] Update HISTORY.md for GetThreadStatus() support on compaction. --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index 1040794fe..bce856f4f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,7 @@ * Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) * Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB. * Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. +* GetThreadStatus() is now able to report compaction activity. 
### Public API changes * Deprecated skip_log_error_on_recovery option From 3b494a6103d1c6585f9fe96796c907bc0d1e096d Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Thu, 22 Jan 2015 16:57:16 -0800 Subject: [PATCH 730/829] Make options_test runnable on ROCKSDB_LITE Summary: Make options_test runnable on ROCKSDB_LITE by blocking those tests that require non-ROCKSDB_LITE feature. Test Plan: make options_test OPT=-DROCKSDB_LITE -j32 ./options_test make clean make options_test -j32 ./options_test Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32025 --- util/options_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/util/options_test.cc b/util/options_test.cc index d89acfb7d..cd26b0211 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -78,14 +78,17 @@ TEST(OptionsTest, LooseCondition) { options = PrintAndGetOptions(128 * 1024 * 1024, 8, 100); ASSERT_EQ(options.compaction_style, kCompactionStyleLevel); +#ifndef ROCKSDB_LITE // Universal compaction is not supported in ROCKSDB_LITE // Tight write amplification options = PrintAndGetOptions(128 * 1024 * 1024, 64, 10); ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); +#endif // !ROCKSDB_LITE // Both tight amplifications PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE TEST(OptionsTest, GetOptionsFromMapTest) { std::unordered_map cf_options_map = { {"write_buffer_size", "1"}, @@ -271,7 +274,9 @@ TEST(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast(47)); } +#endif // !ROCKSDB_LITE +#ifndef ROCKSDB_LITE // GetOptionsFromString is not supported in ROCKSDB_LITE TEST(OptionsTest, GetOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; @@ -382,7 +387,9 @@ TEST(OptionsTest, GetOptionsFromStringTest) { "block_based_table_factory={xx_block_size=4;}", &new_cf_opt)); } +#endif // !ROCKSDB_LITE +#ifndef ROCKSDB_LITE // GetBlockBasedTableOptionsFromString is not supported TEST(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; @@ -435,11 +442,13 @@ TEST(OptionsTest, GetBlockBasedTableOptionsFromString) { "filter_policy=bloomfilter:4", &new_opt)); } +#endif // !ROCKSDB_LITE Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); +#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE TEST(OptionsTest, StringToMapTest) { std::unordered_map opts_map; // Regular options @@ -556,7 +565,9 @@ TEST(OptionsTest, StringToMapTest) { ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map)); ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map)); } +#endif // ROCKSDB_LITE +#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE TEST(OptionsTest, StringToMapRandomTest) { std::unordered_map opts_map; // Make sure segfault is not hit by semi-random strings @@ -601,6 +612,7 @@ TEST(OptionsTest, StringToMapRandomTest) { opts_map.clear(); } } +#endif // !ROCKSDB_LITE TEST(OptionsTest, ConvertOptionsTest) { LevelDBOptions leveldb_opt; From bef7821f07025c65f1bb98afeb8034c2272d4dc0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 16 Jan 2015 23:35:21 +0100 Subject: [PATCH 731/829] [RocksJava] ReadOptions support in Iterators The methods: - newIterator - iterators support now also ReadOptions. 
That allows a user of the Java API to retrieve RocksIterator instances on a snapshot. --- java/org/rocksdb/RocksDB.java | 97 +++++++++++++++--- java/org/rocksdb/test/SnapshotTest.java | 129 +++++++++++++++++++++++- java/rocksjni/rocksjni.cc | 69 ++++++++++--- 3 files changed, 263 insertions(+), 32 deletions(-) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 089882532..ac02860e8 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1107,21 +1107,40 @@ public class RocksDB extends RocksObject { } /** - * Return a heap-allocated iterator over the contents of the database. - * The result of newIterator() is initially invalid (caller must - * call one of the Seek methods on the iterator before using it). + *

+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
    *
-   * Caller should close the iterator when it is no longer needed.
+   * <p>Caller should close the iterator when it is no longer needed.
    * The returned iterator should be closed before this db is closed.
+   * </p>
    *
    * @return instance of iterator object.
    */
   public RocksIterator newIterator() {
-    return new RocksIterator(this, iterator0(nativeHandle_));
+    return new RocksIterator(this, iterator(nativeHandle_));
   }
 
-  /**
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param readOptions {@link ReadOptions} instance.
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(ReadOptions readOptions) {
+    return new RocksIterator(this, iterator(nativeHandle_,
+        readOptions.nativeHandle_));
+  }
+
+  /**
    * <p>Return a handle to the current DB state. Iterators created with
    * this handle will all observe a stable snapshot of the current DB
    * state. The caller must call ReleaseSnapshot(result) when the
@@ -1153,22 +1172,45 @@ public class RocksDB extends RocksObject {
   }
 
   /**
-   * Return a heap-allocated iterator over the contents of the database.
-   * The result of newIterator() is initially invalid (caller must
-   * call one of the Seek methods on the iterator before using it).
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
    *
-   * Caller should close the iterator when it is no longer needed.
+   * <p>Caller should close the iterator when it is no longer needed.
    * The returned iterator should be closed before this db is closed.
+   * </p>
    *
    * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
    *     instance
    * @return instance of iterator object.
    */
   public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) {
-    return new RocksIterator(this, iterator0(nativeHandle_,
+    return new RocksIterator(this, iteratorCF(nativeHandle_,
         columnFamilyHandle.nativeHandle_));
   }
 
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
                      + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param readOptions {@link ReadOptions} instance. + * @return instance of iterator object. + */ + public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle, + ReadOptions readOptions) { + return new RocksIterator(this, iteratorCF(nativeHandle_, + columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_)); + } + /** * Returns iterators from a consistent database state across multiple * column families. Iterators are heap allocated and need to be deleted @@ -1184,10 +1226,31 @@ public class RocksDB extends RocksObject { */ public List newIterators( List columnFamilyHandleList) throws RocksDBException { + return newIterators(columnFamilyHandleList, new ReadOptions()); + } + + /** + * Returns iterators from a consistent database state across multiple + * column families. Iterators are heap allocated and need to be deleted + * before the db is deleted + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param readOptions {@link ReadOptions} instance. + * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} + * instances + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public List newIterators( + List columnFamilyHandleList, + ReadOptions readOptions) throws RocksDBException { List iterators = new ArrayList<>(columnFamilyHandleList.size()); - long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList); + long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList, + readOptions.nativeHandle_); for (int i=0; i columnFamilyNames) throws RocksDBException; + List columnFamilyNames, long readOptHandle) + throws RocksDBException; protected native long getSnapshot(long nativeHandle); protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index 1b45c517e..b6dd2a360 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -8,11 +8,7 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.Options; -import org.rocksdb.ReadOptions; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; -import org.rocksdb.Snapshot; +import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; @@ -94,4 +90,127 @@ public class SnapshotTest { } } } + + @Test + public void iteratorWithSnapshot() throws RocksDBException { + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + RocksIterator iterator = null; + RocksIterator snapshotIterator = null; + try { + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of db + iterator = db.newIterator(); + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + 
assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + + // iterate using a snapshot + snapshotIterator = db.newIterator(readOptions); + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (iterator != null) { + iterator.dispose(); + } + if (snapshotIterator != null) { + snapshotIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + readOptions.dispose(); + } + } + } + + @Test + public void iteratorWithSnapshotOnColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + RocksIterator iterator = null; + RocksIterator snapshotIterator = null; + try { + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of column family + iterator = db.newIterator(db.getDefaultColumnFamily()); + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + + // iterate using a snapshot on default column family + snapshotIterator = db.newIterator(db.getDefaultColumnFamily(), + readOptions); + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (iterator != null) { + iterator.dispose(); + } + if (snapshotIterator != null) { + snapshotIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + readOptions.dispose(); + } + } + } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index dee6aaa14..1055f87fe 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1142,40 +1142,85 @@ void Java_org_rocksdb_RocksDB_disposeInternal( delete reinterpret_cast(jhandle); } +jlong rocksdb_iterator_helper( + rocksdb::DB* db, rocksdb::ReadOptions read_options, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::Iterator* iterator = nullptr; + if (cf_handle != nullptr) { + iterator = db->NewIterator(read_options, cf_handle); + } else { + iterator = db->NewIterator(read_options); + } + return reinterpret_cast(iterator); +} + /* * Class: org_rocksdb_RocksDB - * Method: iterator0 + * Method: iterator * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_iterator0__J( +jlong Java_org_rocksdb_RocksDB_iterator__J( JNIEnv* env, jobject jdb, jlong db_handle) { auto db = reinterpret_cast(db_handle); - rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions()); - return 
reinterpret_cast(iterator); + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), + nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iterator + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_RocksDB_iterator__JJ( + JNIEnv* env, jobject jdb, jlong db_handle, + jlong jread_options_handle) { + auto db = reinterpret_cast(db_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); + return rocksdb_iterator_helper(db, read_options, + nullptr); } /* * Class: org_rocksdb_RocksDB - * Method: iterator0 + * Method: iteratorCF * Signature: (JJ)J */ -jlong Java_org_rocksdb_RocksDB_iterator0__JJ( +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ( JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle) { auto db = reinterpret_cast(db_handle); auto cf_handle = reinterpret_cast(jcf_handle); - rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions(), - cf_handle); - return reinterpret_cast(iterator); + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), + cf_handle); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iteratorCF + * Signature: (JJJ)J + */ +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jlong jread_options_handle) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); + return rocksdb_iterator_helper(db, read_options, + cf_handle); } /* * Class: org_rocksdb_RocksDB * Method: iterators - * Signature: (JLjava/util/List;)[J + * Signature: (JLjava/util/List;J)[J */ jlongArray Java_org_rocksdb_RocksDB_iterators( - JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list) { + JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list, + jlong jread_options_handle) { auto db = reinterpret_cast(db_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); std::vector cf_handles; std::vector iterators; @@ -1195,7 +1240,7 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( } } - rocksdb::Status s = db->NewIterators(rocksdb::ReadOptions(), + rocksdb::Status s = db->NewIterators(read_options, cf_handles, &iterators); if (s.ok()) { jlongArray jLongArray = From b068f0a673cb074c366a2524e96eb3503eb80622 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 11:12:37 -0800 Subject: [PATCH 732/829] Upgrade our compilers Summary: Upgrade gcc to 4.9.1 and clang to dev. With new compilers I succeeded to run thread sanitizer, too. I'll post output (doesn't look good) and fix some things in separate diffs. 
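A minimal sketch of how either toolchain might be selected on an fbcode host (the exact make target is only illustrative; both compilers are wired up by the build_tools scripts in this diff):

    # default path: the fbcode gcc 4.9 toolchain picked up by build_tools/fbcode_config.sh
    make check
    # hypothetical clang run; USE_CLANG is the switch fbcode_config.sh checks
    USE_CLANG=1 make check

Either way the dependencies (snappy, zlib, gflags, jemalloc, ...) come from the third-party2 locations listed below.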
Test Plan: compiles with both g++ and clang Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32007 --- build_tools/fbcode_config.sh | 64 +++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 7c1ff5147..a3bfd58b8 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -5,57 +5,54 @@ # uses jemalloc # location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e/" LIBGCC_INCLUDE="$LIBGCC_BASE/include" LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" # location of glibc -GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa -GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" -GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" +GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720 +GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include" +GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib" # location of snappy headers and libraries -SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include" -SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a" +SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/" +SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" # location of zlib headers and libraries -ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include" -ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" +ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" +ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" # location of bzip headers and libraries -BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" -BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" +BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" +BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" -LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b -LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" -LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" +LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/" +LZ4_LIBS=" 
/mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a" # location of gflags headers and libraries -GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" -GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" +GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/" +GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" # location of jemalloc -JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include" -JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a" +JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/" +JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a" # location of numa -NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" +NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" +NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" # location of libunwind -LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc -LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" +LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" # use Intel SSE support for checksum calculations export USE_SSE=1 -BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" +BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin" AR="$BINUTILS/ar" DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc" +GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4" STDLIBS="-L $GCC_BASE/lib64" if [ -z "$USE_CLANG" ]; then @@ -68,16 +65,16 @@ if [ -z "$USE_CLANG" ]; then CFLAGS+=" -isystem $LIBGCC_INCLUDE" else # clang - CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" - CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include" - CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang" - CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++" + CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev" + CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/" + CC="$CLANG_BASE/centos6-native/af4b1a0/bin/clang" + CXX="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++" - 
KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" + KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include" CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" - CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " - CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux " CFLAGS+=" -isystem $GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" CFLAGS+=" -isystem $CLANG_INCLUDE" @@ -92,14 +89,13 @@ CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" -EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so" EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" -VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0 -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" +VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE From 401d4205efb24c03739e1dc7a7f60b228620fb8b Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 11:22:20 -0800 Subject: [PATCH 733/829] Add thread sanitizer Summary: When you compile with COMPILE_WITH_TSAN=1, we will compile the code with -fsanitize=thread. This will resolve bunch of data race issues we might have. Test Plan: COMPILE_WITH_TSAN=1 m db_test Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32019 --- Makefile | 15 +++++-- build_tools/build_detect_platform | 2 + build_tools/fbcode_config.sh | 67 ++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 4c7a562b5..bc23295b2 100644 --- a/Makefile +++ b/Makefile @@ -51,12 +51,21 @@ endif # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. ifdef COMPILE_WITH_ASAN - # ASAN compile flags + DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=address PLATFORM_CCFLAGS += -fsanitize=address PLATFORM_CXXFLAGS += -fsanitize=address -else - # if we're not compiling with ASAN, use jemalloc +endif + +# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc. 
+ifdef COMPILE_WITH_TSAN + DISABLE_JEMALLOC=1 + EXEC_LDFLAGS += -fsanitize=thread -pie + PLATFORM_CCFLAGS += -fsanitize=thread -fPIC + PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC +endif + +ifndef DISABLE_JEMALLOC EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 5ad5552ce..01a53b31a 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -48,6 +48,8 @@ COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" + # If we're compiling with TSAN we need pic build + PIC_BUILD=$COMPILE_WITH_TSAN source "$PWD/build_tools/fbcode_config.sh" fi diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index a3bfd58b8..ecc0cbad6 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -1,11 +1,15 @@ #!/bin/sh # # Set environment variables so that we can compile rocksdb using -# fbcode settings. It uses the latest g++ compiler and also +# fbcode settings. It uses the latest g++ and clang compilers and also # uses jemalloc +# Environment variables that change the behavior of this script: +# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included + +CFLAGS="" # location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e/" +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e" LIBGCC_INCLUDE="$LIBGCC_BASE/include" LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" @@ -16,33 +20,51 @@ GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/ # location of snappy headers and libraries SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/" -SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" -ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" -BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" - -LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/" -LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a" +if test -z $PIC_BUILD; then + SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" +else + SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a" +fi +CFLAGS+=" -DSNAPPY" + +if test -z $PIC_BUILD; then + # location of zlib headers and 
libraries + ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" + ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" + CFLAGS+=" -DZLIB" + + # location of bzip headers and libraries + BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" + BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" + CFLAGS+=" -DBZIP2" + + LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/" + LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a" + CFLAGS+=" -DLZ4" +fi # location of gflags headers and libraries GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/" -GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" +if test -z $PIC_BUILD; then + GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" +else + GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a" +fi +CFLAGS+=" -DGFLAGS=google" # location of jemalloc JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/" JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a" -# location of numa -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" +if test -z $PIC_BUILD; then + # location of numa + NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" + NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" + CFLAGS+=" -DNUMA" -# location of libunwind -LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" + # location of libunwind + LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" +fi # use Intel SSE support for checksum calculations export USE_SSE=1 @@ -60,7 +82,7 @@ if [ -z "$USE_CLANG" ]; then CC="$GCC_BASE/bin/gcc" CXX="$GCC_BASE/bin/g++" - CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" + CFLAGS+=" -B$BINUTILS/gold" CFLAGS+=" -isystem $GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" else @@ -72,7 +94,7 @@ else KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include" - CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x " CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux " CFLAGS+=" 
-isystem $GLIBC_INCLUDE" @@ -85,7 +107,6 @@ fi CFLAGS+=" $DEPS_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" -CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" From 910186c2789b096f4a7d5be5df42818af0b2b4db Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 14:51:27 -0800 Subject: [PATCH 734/829] Return the build with 4.8.1 Summary: We need this because we build MySQL with 4.8.1. Test Plan: ROCKSDB_FBCODE_BUILD_WITH_481=1 make check Reviewers: sdong, yhchiang, rven, yoshinorim Reviewed By: yoshinorim Subscribers: jonahcohen, yoshinorim, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32073 --- build_tools/build_detect_platform | 7 +- build_tools/fbcode_config4.8.1.sh | 105 ++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 build_tools/fbcode_config4.8.1.sh diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 01a53b31a..c9ce01eab 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -50,7 +50,12 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" # If we're compiling with TSAN we need pic build PIC_BUILD=$COMPILE_WITH_TSAN - source "$PWD/build_tools/fbcode_config.sh" + if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then + source "$PWD/build_tools/fbcode_config.sh" + else + # we need this to build with MySQL. Don't use for other purposes. + source "$PWD/build_tools/fbcode_config4.8.1.sh" + fi fi # Delete existing output, if it exists diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh new file mode 100644 index 000000000..7c1ff5147 --- /dev/null +++ b/build_tools/fbcode_config4.8.1.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# +# Set environment variables so that we can compile rocksdb using +# fbcode settings. 
It uses the latest g++ compiler and also +# uses jemalloc + +# location of libgcc +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" +LIBGCC_INCLUDE="$LIBGCC_BASE/include" +LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" + +# location of glibc +GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa +GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" +GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" + +# location of snappy headers and libraries +SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include" +SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a" + +# location of zlib headers and libraries +ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include" +ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" + +# location of bzip headers and libraries +BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" + +LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b +LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" +LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" + +# location of gflags headers and libraries +GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" + +# location of jemalloc +JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include" +JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a" + +# location of numa +NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 +NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" +NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" + +# location of libunwind +LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc +LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" + +# use Intel SSE support for checksum calculations +export USE_SSE=1 + +BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" +AR="$BINUTILS/ar" + +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" + +GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc" +STDLIBS="-L $GCC_BASE/lib64" + +if [ -z "$USE_CLANG" ]; then + # gcc + CC="$GCC_BASE/bin/gcc" + CXX="$GCC_BASE/bin/g++" + + CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" +else + # clang + 
CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" + CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include" + CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang" + CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++" + + KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" + + CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux " + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $CLANG_INCLUDE" + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CXXFLAGS="-nostdinc++" +fi + +CFLAGS+=" $DEPS_INCLUDE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" +CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" +CXXFLAGS+=" $CFLAGS" + +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" +EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" + +PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" + +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" + +VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0 +VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" + +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE From a52dd00243ae939ab6dbd1e72c0585ae840066b4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 15:02:43 -0800 Subject: [PATCH 735/829] Fix ASAN failure with backupable DB Summary: It looks like ASAN with gcc 4.9 works better than 4.8.1. It detected this possibility of heap buffer overflow. This was in our codebase for a year :) Test Plan: COMPILE_WITH_ASAN=1 make backupable_db && ./backupable_db Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32085 --- utilities/backupable/backupable_db.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 2a526c940..6338df4c1 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -1207,8 +1207,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( line.remove_prefix(checksum_prefix.size()); checksum_value = static_cast( strtoul(line.data(), nullptr, 10)); - if (memcmp(line.data(), std::to_string(checksum_value).c_str(), - line.size() - 1) != 0) { + if (line != std::to_string(checksum_value)) { return Status::Corruption("Invalid checksum value"); } } else { From b4c13a868a57cda00c12b7d8a202fa0499c21578 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 22 Jan 2015 18:34:23 -0800 Subject: [PATCH 736/829] fault_injection_test: improvements and add new tests Summary: Wrapper classes in fault_injection_test doesn't simulate RocksDB Env behavior close enough. Improve it by: (1) when fsync, don't sync parent (2) support directory fsync (3) support multiple directories Add test cases of (1) persisting by WAL fsync, not just compact range (2) different WAL dir (3) combination of (1) and (2) (4) data directory is not the same as db name. 
Test Plan: Run the test and make sure it passes. Reviewers: rven, yhchiang, igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32031 --- db/fault_injection_test.cc | 316 ++++++++++++++++++++++++++----------- 1 file changed, 221 insertions(+), 95 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 6861f1525..4aa459fe2 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -48,9 +48,21 @@ static std::string GetDirName(const std::string filename) { } } -Status SyncDir(const std::string& dir) { - // As this is a test it isn't required to *actually* sync this directory. - return Status::OK(); +// Trim the tailing "/" in the end of `str` +static std::string TrimDirname(const std::string& str) { + size_t found = str.find_last_not_of("/"); + if (found == std::string::npos) { + return str; + } + return str.substr(0, found + 1); +} + +// Return pair of a full path. +static std::pair GetDirAndName( + const std::string& name) { + std::string dirname = GetDirName(name); + std::string fname = name.substr(dirname.size() + 1); + return std::make_pair(dirname, fname); } // A basic file truncation function suitable for this test. @@ -124,10 +136,22 @@ class TestWritableFile : public WritableFile { unique_ptr target_; bool writable_file_opened_; FaultInjectionTestEnv* env_; - - Status SyncParent(); }; +class TestDirectory : public Directory { + public: + explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, + Directory* dir) + : env_(env), dirname_(dirname), dir_(dir) {} + ~TestDirectory() {} + + virtual Status Fsync() override; + + private: + FaultInjectionTestEnv* env_; + std::string dirname_; + unique_ptr dir_; +}; class FaultInjectionTestEnv : public EnvWrapper { public: @@ -136,6 +160,18 @@ class FaultInjectionTestEnv : public EnvWrapper { filesystem_active_(true) {} virtual ~FaultInjectionTestEnv() { } + Status NewDirectory(const std::string& name, + unique_ptr* result) override { + unique_ptr r; + Status s = target()->NewDirectory(name, &r); + ASSERT_OK(s); + if (!s.ok()) { + return s; + } + result->reset(new TestDirectory(this, TrimDirname(name), r.release())); + return Status::OK(); + } + Status NewWritableFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { @@ -146,7 +182,10 @@ class FaultInjectionTestEnv : public EnvWrapper { // again then it will be truncated - so forget our saved state. 
UntrackFile(fname); MutexLock l(&mutex_); - new_files_since_last_dir_sync_.insert(fname); + + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); } return s; } @@ -170,10 +209,12 @@ class FaultInjectionTestEnv : public EnvWrapper { db_file_state_.erase(s); } - if (new_files_since_last_dir_sync_.erase(s) != 0) { - assert(new_files_since_last_dir_sync_.find(t) == - new_files_since_last_dir_sync_.end()); - new_files_since_last_dir_sync_.insert(t); + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); } } @@ -201,40 +242,37 @@ class FaultInjectionTestEnv : public EnvWrapper { Status DeleteFilesCreatedAfterLastDirSync() { // Because DeleteFile access this container make a copy to avoid deadlock - mutex_.Lock(); - std::set new_files(new_files_since_last_dir_sync_.begin(), - new_files_since_last_dir_sync_.end()); - mutex_.Unlock(); - Status s; - std::set::const_iterator it; - for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) { - s = DeleteFile(*it); + std::map> map_copy; + { + MutexLock l(&mutex_); + map_copy.insert(dir_to_new_files_since_last_sync_.begin(), + dir_to_new_files_since_last_sync_.end()); } - return s; - } - void DirWasSynced() { - MutexLock l(&mutex_); - new_files_since_last_dir_sync_.clear(); - } - - bool IsFileCreatedSinceLastDirSync(const std::string& filename) { - MutexLock l(&mutex_); - return new_files_since_last_dir_sync_.find(filename) != - new_files_since_last_dir_sync_.end(); + for (auto& pair : map_copy) { + for (std::string name : pair.second) { + Status s = DeleteFile(pair.first + "/" + name); + } + } + return Status::OK(); } - void ResetState() { MutexLock l(&mutex_); db_file_state_.clear(); - new_files_since_last_dir_sync_.clear(); + dir_to_new_files_since_last_sync_.clear(); SetFilesystemActive(true); } void UntrackFile(const std::string& f) { MutexLock l(&mutex_); + auto dir_and_name = GetDirAndName(f); + dir_to_new_files_since_last_sync_[dir_and_name.first].erase( + dir_and_name.second); db_file_state_.erase(f); - new_files_since_last_dir_sync_.erase(f); + } + + void SyncDir(const std::string& dirname) { + dir_to_new_files_since_last_sync_.erase(dirname); } // Setting the filesystem to inactive is the test equivalent to simulating a @@ -247,7 +285,8 @@ class FaultInjectionTestEnv : public EnvWrapper { private: port::Mutex mutex_; std::map db_file_state_; - std::set new_files_since_last_dir_sync_; + std::unordered_map> + dir_to_new_files_since_last_sync_; bool filesystem_active_; // Record flushes, syncs, writes }; @@ -256,6 +295,11 @@ Status FileState::DropUnsyncedData() const { return Truncate(filename_, sync_pos); } +Status TestDirectory::Fsync() { + env_->SyncDir(dirname_); + return dir_->Fsync(); +} + TestWritableFile::TestWritableFile(const std::string& fname, unique_ptr&& f, FaultInjectionTestEnv* env) @@ -302,32 +346,35 @@ Status TestWritableFile::Sync() { if (!env_->IsFilesystemActive()) { return Status::OK(); } - // Ensure new files referred to by the manifest are in the filesystem. 
- Status s = target_->Sync(); - if (s.ok()) { - state_.pos_at_last_sync_ = state_.pos_; - } - if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) { - Status ps = SyncParent(); - if (s.ok() && !ps.ok()) { - s = ps; - } - } - return s; -} - -Status TestWritableFile::SyncParent() { - Status s = SyncDir(GetDirName(state_.filename_)); - if (s.ok()) { - env_->DirWasSynced(); - } - return s; + // No need to actual sync. + state_.pos_at_last_sync_ = state_.pos_; + return Status::OK(); } class FaultInjectionTest { + protected: + enum OptionConfig { + kDefault, + kDifferentDataDir, + kWalDir, + kSyncWal, + kWalDirSyncWal, + kEnd, + }; + int option_config_; + // When need to make sure data is persistent, sync WAL + bool sync_use_wal; + // When need to make sure data is persistent, call DB::CompactRange() + bool sync_use_compact; + + protected: public: - enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR }; - enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES }; + enum ExpectedVerifResult { kValExpectFound, kValExpectNoError }; + enum ResetMethod { + kResetDropUnsyncedData, + kResetDeleteUnsyncedFiles, + kResetDropAndDeleteUnsynced + }; FaultInjectionTestEnv* env_; std::string dbname_; @@ -335,10 +382,54 @@ class FaultInjectionTest { Options options_; DB* db_; - FaultInjectionTest() : env_(NULL), db_(NULL) { NewDB(); } + FaultInjectionTest() + : option_config_(kDefault), + sync_use_wal(false), + sync_use_compact(true), + env_(NULL), + db_(NULL) { + NewDB(); + } ~FaultInjectionTest() { ASSERT_OK(TearDown()); } + bool ChangeOptions() { + option_config_++; + if (option_config_ >= kEnd) { + return false; + } else { + return true; + } + } + + // Return the current option configuration. + Options CurrentOptions() { + sync_use_wal = false; + sync_use_compact = true; + Options options; + switch (option_config_) { + case kWalDir: + options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + break; + case kDifferentDataDir: + options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data", + 1000000U); + break; + case kSyncWal: + sync_use_wal = true; + sync_use_compact = false; + break; + case kWalDirSyncWal: + options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + sync_use_wal = true; + sync_use_compact = false; + break; + default: + break; + } + return options; + } + Status NewDB() { assert(db_ == NULL); assert(tiny_cache_ == nullptr); @@ -346,7 +437,7 @@ class FaultInjectionTest { env_ = new FaultInjectionTestEnv(Env::Default()); - options_ = Options(); + options_ = CurrentOptions(); options_.env = env_; options_.paranoid_checks = true; @@ -357,6 +448,8 @@ class FaultInjectionTest { dbname_ = test::TmpDir() + "/fault_test"; + ASSERT_OK(DestroyDB(dbname_, options_)); + options_.create_if_missing = true; Status s = OpenDB(); options_.create_if_missing = false; @@ -374,7 +467,7 @@ class FaultInjectionTest { Status TearDown() { CloseDB(); - Status s = DestroyDB(dbname_, Options()); + Status s = DestroyDB(dbname_, options_); delete env_; env_ = NULL; @@ -384,15 +477,14 @@ class FaultInjectionTest { return s; } - void Build(int start_idx, int num_vals) { + void Build(const WriteOptions& write_options, int start_idx, int num_vals) { std::string key_space, value_space; WriteBatch batch; for (int i = start_idx; i < start_idx + num_vals; i++) { Slice key = Key(i, &key_space); batch.Clear(); batch.Put(key, Value(i, &value_space)); - WriteOptions options; - ASSERT_OK(db_->Write(options, &batch)); + ASSERT_OK(db_->Write(write_options, &batch)); } } @@ -412,18 +504,22 
@@ class FaultInjectionTest { for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { Value(i, &value_space); s = ReadValue(i, &val); - if (expected == VAL_EXPECT_NO_ERROR) { - if (s.ok()) { - ASSERT_EQ(value_space, val); + if (s.ok()) { + ASSERT_EQ(value_space, val); + } + if (expected == kValExpectFound) { + if (!s.ok()) { + fprintf(stderr, "Error when read %dth record (expect found): %s\n", i, + s.ToString().c_str()); + return s; } - } else if (s.ok()) { - fprintf(stderr, "Expected an error at %d, but was OK\n", i); - s = Status::IOError(dbname_, "Expected value error:"); - } else { - s = Status::OK(); // An expected error + } else if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "Error when read %dth record: %s\n", i, + s.ToString().c_str()); + return s; } } - return s; + return Status::OK(); } // Return the ith key @@ -460,14 +556,22 @@ class FaultInjectionTest { } delete iter; + + FlushOptions flush_options; + flush_options.wait = true; + db_->Flush(flush_options); } void ResetDBState(ResetMethod reset_method) { switch (reset_method) { - case RESET_DROP_UNSYNCED_DATA: + case kResetDropUnsyncedData: ASSERT_OK(env_->DropUnsyncedFileData()); break; - case RESET_DELETE_UNSYNCED_FILES: + case kResetDeleteUnsyncedFiles: + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + case kResetDropAndDeleteUnsynced: + ASSERT_OK(env_->DropUnsyncedFileData()); ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); break; default: @@ -477,9 +581,16 @@ class FaultInjectionTest { void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { DeleteAllData(); - Build(0, num_pre_sync); - db_->CompactRange(NULL, NULL); - Build(num_pre_sync, num_post_sync); + + WriteOptions write_options; + write_options.sync = sync_use_wal; + + Build(write_options, 0, num_pre_sync); + if (sync_use_compact) { + db_->CompactRange(nullptr, nullptr); + } + write_options.sync = false; + Build(write_options, num_pre_sync, num_post_sync); } void PartialCompactTestReopenWithFault(ResetMethod reset_method, @@ -489,9 +600,9 @@ class FaultInjectionTest { CloseDB(); ResetDBState(reset_method); ASSERT_OK(OpenDB()); - ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR)); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); ASSERT_OK(Verify(num_pre_sync, num_post_sync, - FaultInjectionTest::VAL_EXPECT_ERROR)); + FaultInjectionTest::kValExpectNoError)); } void NoWriteTestPreFault() { @@ -505,30 +616,45 @@ class FaultInjectionTest { }; TEST(FaultInjectionTest, FaultTest) { - Random rnd(0); - ASSERT_OK(SetUp()); - for (size_t idx = 0; idx < kNumIterations; idx++) { + do { + Random rnd(301); + ASSERT_OK(SetUp()); + int num_pre_sync = rnd.Uniform(kMaxNumValues); int num_post_sync = rnd.Uniform(kMaxNumValues); PartialCompactTestPreFault(num_pre_sync, num_post_sync); - PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, - num_pre_sync, + PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, num_post_sync); - - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA); - - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // No new files created so we expect all values since no files will be - // dropped. - PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES, - num_pre_sync + num_post_sync, - 0); - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES); - } + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + // TODO(t6070540) Need to sync WAL Dir and other DB paths too. 
+ + // Setting a separate data path won't pass the test as we don't sync + // it after creating new files, + if (option_config_ != kDifferentDataDir) { + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // Since we don't sync WAL Dir, this test dosn't pass. + if (option_config_ != kWalDirSyncWal) { + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + } + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + // WAL Dir is not synced for now. + if (option_config_ != kWalDir && option_config_ != kWalDirSyncWal) { + PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, + num_pre_sync + num_post_sync, 0); + } + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); + } + } while (ChangeOptions()); } } // namespace rocksdb From c2e8e8c1c0751246da71a9256ae5237484f29857 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 23 Jan 2015 16:03:24 -0800 Subject: [PATCH 737/829] Fix two namings in fault_injection_test.cc Summary: fault_injection_test.cc has two variable names not following the convention fix it. Test Plan: run the test Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32097 --- db/fault_injection_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 4aa459fe2..41664e728 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -363,9 +363,9 @@ class FaultInjectionTest { }; int option_config_; // When need to make sure data is persistent, sync WAL - bool sync_use_wal; + bool sync_use_wal_; // When need to make sure data is persistent, call DB::CompactRange() - bool sync_use_compact; + bool sync_use_compact_; protected: public: @@ -384,8 +384,8 @@ class FaultInjectionTest { FaultInjectionTest() : option_config_(kDefault), - sync_use_wal(false), - sync_use_compact(true), + sync_use_wal_(false), + sync_use_compact_(true), env_(NULL), db_(NULL) { NewDB(); @@ -404,8 +404,8 @@ class FaultInjectionTest { // Return the current option configuration. Options CurrentOptions() { - sync_use_wal = false; - sync_use_compact = true; + sync_use_wal_ = false; + sync_use_compact_ = true; Options options; switch (option_config_) { case kWalDir: @@ -416,13 +416,13 @@ class FaultInjectionTest { 1000000U); break; case kSyncWal: - sync_use_wal = true; - sync_use_compact = false; + sync_use_wal_ = true; + sync_use_compact_ = false; break; case kWalDirSyncWal: options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; - sync_use_wal = true; - sync_use_compact = false; + sync_use_wal_ = true; + sync_use_compact_ = false; break; default: break; @@ -583,10 +583,10 @@ class FaultInjectionTest { DeleteAllData(); WriteOptions write_options; - write_options.sync = sync_use_wal; + write_options.sync = sync_use_wal_; Build(write_options, 0, num_pre_sync); - if (sync_use_compact) { + if (sync_use_compact_) { db_->CompactRange(nullptr, nullptr); } write_options.sync = false; From 43ec4e68ba609b246993f1d8dfb33f4657382ecf Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 23 Jan 2015 16:26:38 -0800 Subject: [PATCH 738/829] fault_injection_test: bring back 3 iteration runs Summary: 3 iterations were disabled by mistake by one recent commit, causing CLANG build error. 
Fix it Test Plan: USE_CLANG=1 make fault_injection_test and run the test Reviewers: igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32109 --- db/fault_injection_test.cc | 60 ++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 41664e728..a014726ab 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -620,39 +620,41 @@ TEST(FaultInjectionTest, FaultTest) { Random rnd(301); ASSERT_OK(SetUp()); - int num_pre_sync = rnd.Uniform(kMaxNumValues); - int num_post_sync = rnd.Uniform(kMaxNumValues); + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, - num_post_sync); - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(kResetDropUnsyncedData); - - // TODO(t6070540) Need to sync WAL Dir and other DB paths too. - - // Setting a separate data path won't pass the test as we don't sync - // it after creating new files, - if (option_config_ != kDifferentDataDir) { PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // Since we don't sync WAL Dir, this test dosn't pass. - if (option_config_ != kWalDirSyncWal) { - PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, - num_pre_sync, num_post_sync); - } + PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, + num_post_sync); NoWriteTestPreFault(); - NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); - - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // No new files created so we expect all values since no files will be - // dropped. - // WAL Dir is not synced for now. - if (option_config_ != kWalDir && option_config_ != kWalDirSyncWal) { - PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, - num_pre_sync + num_post_sync, 0); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + // TODO(t6070540) Need to sync WAL Dir and other DB paths too. + + // Setting a separate data path won't pass the test as we don't sync + // it after creating new files, + if (option_config_ != kDifferentDataDir) { + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // Since we don't sync WAL Dir, this test dosn't pass. + if (option_config_ != kWalDirSyncWal) { + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + } + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + // WAL Dir is not synced for now. + if (option_config_ != kWalDir && option_config_ != kWalDirSyncWal) { + PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, + num_pre_sync + num_post_sync, 0); + } + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); } - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); } } while (ChangeOptions()); } From f5a839835263ec3a296b731a89ddbdb65d580a80 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 17:35:12 -0800 Subject: [PATCH 739/829] Fix archive WAL race conditions Summary: More race condition bugs with our archive WAL files. 
I do believe this caused t5988326, but can't reproduce the failure unfortunately. Test Plan: make check Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32103 --- db/wal_manager.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 7fac575f2..aa79b0280 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -310,9 +310,15 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, uint64_t size_bytes; s = env_->GetFileSize(LogFileName(path, number), &size_bytes); // re-try in case the alive log file has been moved to archive. + std::string archived_file = ArchivedLogFileName(path, number); if (!s.ok() && log_type == kAliveLogFile && - env_->FileExists(ArchivedLogFileName(path, number))) { - s = env_->GetFileSize(ArchivedLogFileName(path, number), &size_bytes); + env_->FileExists(archived_file)) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && !env_->FileExists(archived_file)) { + // oops, the file just got deleted from archived dir! move on + s = Status::OK(); + continue; + } } if (!s.ok()) { return s; @@ -354,6 +360,7 @@ Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, Status WalManager::ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence) { + *sequence = 0; if (type != kAliveLogFile && type != kArchivedLogFile) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "[WalManger] Unknown file type %s", ToString(type).c_str()); @@ -383,6 +390,12 @@ Status WalManager::ReadFirstRecord(const WalFileType type, std::string archived_file = ArchivedLogFileName(db_options_.wal_dir, number); s = ReadFirstLine(archived_file, sequence); + // maybe the file was deleted from archive dir. If that's the case, return + // Status::OK(). The caller with identify this as empty file because + // *sequence == 0 + if (!s.ok() && !env_->FileExists(archived_file)) { + return Status::OK(); + } } if (s.ok() && *sequence != 0) { From 42189612c3b6ce6557337b8e1b78dbfb47485764 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 18:04:39 -0800 Subject: [PATCH 740/829] Fix data race #2 Summary: We should not be calling InternalStats methods outside of the mutex. 
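A minimal sketch of the pattern behind this fix, using a simplified stand-in for the stats object (hypothetical names, not the actual DBImpl/InternalStats API): a plain, non-atomic counter shared between writer threads must be bumped only after the protecting mutex is taken; bumping it before the lock is the data race that -fsanitize=thread reports, and the diff below removes it by moving the AddDBStats() call after mutex_.Lock().

// Illustrative sketch only -- hypothetical names, not RocksDB code.
#include <iostream>
#include <mutex>
#include <thread>

struct Stats {
  long write_with_wal = 0;  // protected by mu
};

std::mutex mu;
Stats stats;

void Write(bool disable_wal) {
  // BAD (racy): stats.write_with_wal++;  // update outside the lock
  std::lock_guard<std::mutex> l(mu);
  if (!disable_wal) {
    stats.write_with_wal++;  // GOOD: updated under the same mutex readers use
  }
  // ... rest of the write path would run here ...
}

int main() {
  std::thread a([] { for (int i = 0; i < 1000; i++) Write(false); });
  std::thread b([] { for (int i = 0; i < 1000; i++) Write(false); });
  a.join();
  b.join();
  std::cout << stats.write_with_wal << "\n";  // always 2000
  return 0;
}
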
Test Plan: COMPILE_WITH_TSAN=1 m db_test && ROCKSDB_TESTS=CompactionTrigger ./db_test failing before the diff, works now Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32127 --- db/db_impl.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 34c3077fd..350dcde16 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2908,11 +2908,15 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); } WriteContext context; mutex_.Lock(); + + if (!write_options.disableWAL) { + default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); + } + Status status = write_thread_.EnterWriteThread(&w, expiration_time); assert(status.ok() || status.IsTimedOut()); if (status.IsTimedOut()) { From 26b50783d30c54205aba4a2e0693efdec6e0798d Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 23 Jan 2015 18:10:52 -0800 Subject: [PATCH 741/829] Fix assert in histogramData --- util/statistics.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/statistics.cc b/util/statistics.cc index 9d828a6fe..ba7670bb4 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -44,8 +44,8 @@ void StatisticsImpl::histogramData(uint32_t histogramType, HistogramData* const data) const { assert( enable_internal_stats_ ? - histogramType < INTERNAL_TICKER_ENUM_MAX : - histogramType < TICKER_ENUM_MAX); + histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : + histogramType < HISTOGRAM_ENUM_MAX); // Return its own ticker version histograms_[histogramType].Data(data); } From e61f38e5a0871bf0a9ea179c92567cac32f4b7ba Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 24 Jan 2015 16:27:19 +0100 Subject: [PATCH 742/829] [RocksJava] Fix native library loader Summary: Prior to this the native library loader instance didn`t care about a state. So if library loading was called multiple times, multiple copies of the shared object were put into the tmp folder and loaded into the JVM. 
This changed within this commit to the following behavior: - library loading is now synchronized - library is loaded within the first call - if loading was successful the library loaded sets a flag - every subsequent call checks for a boolean flag indicating if there was already a successful attempt Test Plan: - Execute example and watch tmp folder while the example is running - After this patch only one shared object will be in the tmp folder Usual tests: - make rocksdbjava jtest - mvn -f rocksjni.pom package Reviewers: adamretter, ankgup87, yhchiang Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32133 --- java/org/rocksdb/NativeLibraryLoader.java | 45 +++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/org/rocksdb/NativeLibraryLoader.java index 1aa9a8b16..fb09d3600 100644 --- a/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/org/rocksdb/NativeLibraryLoader.java @@ -13,6 +13,7 @@ import org.rocksdb.util.Environment; public class NativeLibraryLoader { //singleton private static final NativeLibraryLoader instance = new NativeLibraryLoader(); + private static boolean initialized = false; private static final String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; @@ -41,31 +42,35 @@ public class NativeLibraryLoader { * * @throws java.io.IOException if a filesystem operation fails. */ - public void loadLibraryFromJar(final String tmpDir) + public synchronized void loadLibraryFromJar(final String tmpDir) throws IOException { - final File temp; - if(tmpDir == null || tmpDir.equals("")) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - temp = new File(tmpDir, sharedLibraryName); - } - - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); - } + if (!initialized) { + final File temp; + if (tmpDir == null || tmpDir.equals("")) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + temp = new File(tmpDir, sharedLibraryName); + } - // attempt to copy the library from the Jar file to the temp destination - try(final InputStream is = getClass().getClassLoader().getResourceAsStream(sharedLibraryName)) { - if (is == null) { - throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + temp.deleteOnExit(); + } + + // attempt to copy the library from the Jar file to the temp destination + try (final InputStream is = getClass().getClassLoader(). + getResourceAsStream(sharedLibraryName)) { + if (is == null) { + throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } } - } - System.load(temp.getAbsolutePath()); + System.load(temp.getAbsolutePath()); + initialized = true; + } } /** * Private constructor to disallow instantiation From f1c8862479a9eebbfca7ca27ff51c8b3014c14dc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 26 Jan 2015 11:48:07 -0800 Subject: [PATCH 743/829] Fix data race #1 Summary: This is first in a series of diffs that fixes data races detected by thread sanitizer. 
Here the problem is that we call Ref() on a column family during a single-threaded write, without holding a mutex. Test Plan: TSAN is no longer complaining about LevelLimitReopen. Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32121 --- db/column_family.cc | 4 ++-- db/column_family.h | 15 ++++++++++----- db/flush_scheduler.h | 1 + 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 19bb09564..e6e75aad9 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -299,7 +299,7 @@ ColumnFamilyData::ColumnFamilyData( // DB mutex held ColumnFamilyData::~ColumnFamilyData() { - assert(refs_ == 0); + assert(refs_.load(std::memory_order_relaxed) == 0); // remove from linked list auto prev = prev_; auto next = next_; @@ -731,7 +731,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( void ColumnFamilySet::FreeDeadColumnFamilies() { autovector to_delete; for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_ == 0) { + if (cfd->refs_.load(std::memory_order_relaxed) == 0) { to_delete.push_back(cfd); } } diff --git a/db/column_family.h b/db/column_family.h index 1c987a3f0..a1a9e8034 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -134,14 +134,18 @@ class ColumnFamilyData { // thread-safe const std::string& GetName() const { return name_; } - void Ref() { ++refs_; } + // Ref() can only be called whily holding a DB mutex or during a + // single-threaded write. + void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); } // will just decrease reference count to 0, but will not delete it. returns // true if the ref count was decreased to zero. in that case, it can be // deleted by the caller immediately, or later, by calling // FreeDeadColumnFamilies() + // Unref() can only be called while holding a DB mutex bool Unref() { - assert(refs_ > 0); - return --refs_ == 0; + int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed); + assert(old_refs > 0); + return old_refs == 1; } // SetDropped() can only be called under following conditions: @@ -290,7 +294,7 @@ class ColumnFamilyData { Version* dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions->prev_ - int refs_; // outstanding references to ColumnFamilyData + std::atomic refs_; // outstanding references to ColumnFamilyData bool dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; @@ -373,7 +377,8 @@ class ColumnFamilySet { // dummy is never dead or dropped, so this will never be infinite do { current_ = current_->next_; - } while (current_->refs_ == 0 || current_->IsDropped()); + } while (current_->refs_.load(std::memory_order_relaxed) == 0 || + current_->IsDropped()); return *this; } bool operator!=(const iterator& other) { diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index 201e4a13c..0c96709b9 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -23,6 +23,7 @@ class FlushScheduler { void ScheduleFlush(ColumnFamilyData* cfd); // Returns Ref()-ed column family. 
Client needs to Unref() + // REQUIRES: db mutex is held (exception is single-threaded recovery) ColumnFamilyData* GetNextColumnFamily(); bool Empty(); From 58f34edfcee00f57754ccb796f159ea102af5499 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 26 Jan 2015 13:17:28 -0800 Subject: [PATCH 744/829] Fix valgrind --- build_tools/fbcode_config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index ecc0cbad6..afc3de40b 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -117,6 +117,6 @@ PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin" +VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE From d888c95748463392a70f593c08ace89413dcc408 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 26 Jan 2015 13:59:38 -0800 Subject: [PATCH 745/829] Sync WAL Directory and DB Path if different from DB directory Summary: 1. If WAL directory is different from db directory. Sync the directory after creating a log file under it. 2. After creating an SST file, sync its parent directory instead of DB directory. 3. change the check of kResetDeleteUnsyncedFiles in fault_injection_test. Since we changed the behavior to sync log files' parent directory after first WAL sync, instead of creating, kResetDeleteUnsyncedFiles will not guarantee to show post sync updates. Test Plan: make all check Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32067 --- Makefile | 1 + db/compaction_job.cc | 9 +-- db/compaction_job.h | 7 +- db/compaction_job_test.cc | 9 +-- db/db_impl.cc | 136 +++++++++++++++++++++++++------------ db/db_impl.h | 30 +++++++- db/fault_injection_test.cc | 37 ++++------ db/flush_job.cc | 10 +-- db/flush_job.h | 4 +- db/flush_job_test.cc | 4 +- 10 files changed, 161 insertions(+), 86 deletions(-) diff --git a/Makefile b/Makefile index bc23295b2..237eebdf5 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,7 @@ BENCHHARNESS = ./util/benchharness.o VALGRIND_ERROR = 2 VALGRIND_DIR = build_tools/VALGRIND_LOGS VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) + VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 7b786c116..09b21a237 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -205,8 +205,8 @@ CompactionJob::CompactionJob( Compaction* compaction, const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, - LogBuffer* log_buffer, Directory* db_directory, Statistics* stats, - SnapshotList* snapshots, bool is_snapshot_supported, + LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, + Statistics* stats, SnapshotList* snapshots, bool is_snapshot_supported, std::shared_ptr table_cache, std::function yield_callback) : compact_(new CompactionState(compaction)), @@ -219,6 +219,7 @@ CompactionJob::CompactionJob( shutting_down_(shutting_down), log_buffer_(log_buffer), db_directory_(db_directory), + 
output_directory_(output_directory), stats_(stats), snapshots_(snapshots), is_snapshot_supported_(is_snapshot_supported), @@ -422,8 +423,8 @@ Status CompactionJob::Run() { } input.reset(); - if (db_directory_ && !db_options_.disableDataSync) { - db_directory_->Fsync(); + if (output_directory_ && !db_options_.disableDataSync) { + output_directory_->Fsync(); } compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros; diff --git a/db/compaction_job.h b/db/compaction_job.h index 4ce440a36..705ba7c64 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -57,9 +57,9 @@ class CompactionJob { const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, LogBuffer* log_buffer, - Directory* db_directory, Statistics* stats, - SnapshotList* snapshot_list, bool is_snapshot_supported, - std::shared_ptr table_cache, + Directory* db_directory, Directory* output_directory, + Statistics* stats, SnapshotList* snapshot_list, + bool is_snapshot_supported, std::shared_ptr table_cache, std::function yield_callback); ~CompactionJob() { assert(compact_ == nullptr); } @@ -114,6 +114,7 @@ class CompactionJob { std::atomic* shutting_down_; LogBuffer* log_buffer_; Directory* db_directory_; + Directory* output_directory_; Statistics* stats_; SnapshotList* snapshots_; bool is_snapshot_supported_; diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 1db802813..54217cc37 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -160,10 +160,11 @@ TEST(CompactionJobTest, Simple) { }; LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); mutex_.Lock(); - CompactionJob compaction_job( - compaction.get(), db_options_, *cfd->GetLatestMutableCFOptions(), - env_options_, versions_.get(), &shutting_down_, &log_buffer, nullptr, - nullptr, &snapshots, true, table_cache_, std::move(yield_callback)); + CompactionJob compaction_job(compaction.get(), db_options_, + *cfd->GetLatestMutableCFOptions(), env_options_, + versions_.get(), &shutting_down_, &log_buffer, + nullptr, nullptr, nullptr, &snapshots, true, + table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); ASSERT_OK(compaction_job.Run()); diff --git a/db/db_impl.cc b/db/db_impl.cc index 350dcde16..27c5a998b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -201,6 +201,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), + log_dir_unsynced_(true), log_empty_(true), default_cf_handle_(nullptr), total_log_size_(0), @@ -354,7 +355,7 @@ Status DBImpl::NewDB() { } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1, db_directory_.get()); + s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); } else { env_->DeleteFile(manifest); } @@ -701,34 +702,75 @@ void DBImpl::DeleteObsoleteFiles() { job_context.Clean(); } -Status DBImpl::Recover( - const std::vector& column_families, bool read_only, - bool error_if_log_file_exist) { - mutex_.AssertHeld(); +Status DBImpl::Directories::CreateAndNewDirectory( + Env* env, const std::string& dirname, + std::unique_ptr* directory) const { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. 
However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. + // when dbname_ is "dir/db" but when "dir" doesn't exist. + Status s = env->CreateDirIfMissing(dirname); + if (!s.ok()) { + return s; + } + return env->NewDirectory(dirname, directory); +} - bool is_new_db = false; - assert(db_lock_ == nullptr); - if (!read_only) { - // We call CreateDirIfMissing() as the directory may already exist (if we - // are reopening a DB), when this happens we don't want creating the - // directory to cause an error. However, we need to check if creating the - // directory fails or else we may get an obscure message about the lock - // file not existing. One real-world example of this occurring is if - // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. - // when dbname_ is "dir/db" but when "dir" doesn't exist. - Status s = env_->CreateDirIfMissing(dbname_); +Status DBImpl::Directories::SetDirectories( + Env* env, const std::string& dbname, const std::string& wal_dir, + const std::vector& data_paths) { + Status s = CreateAndNewDirectory(env, dbname, &db_dir_); + if (!s.ok()) { + return s; + } + if (!wal_dir.empty() && dbname != wal_dir) { + s = CreateAndNewDirectory(env, wal_dir, &wal_dir_); if (!s.ok()) { return s; } + } - for (auto& db_path : db_options_.db_paths) { - s = env_->CreateDirIfMissing(db_path.path); + data_dirs_.clear(); + for (auto& p : data_paths) { + const std::string db_path = p.path; + if (db_path == dbname) { + data_dirs_.emplace_back(nullptr); + } else { + std::unique_ptr path_directory; + s = CreateAndNewDirectory(env, db_path, &path_directory); if (!s.ok()) { return s; } + data_dirs_.emplace_back(path_directory.release()); } + } + assert(data_dirs_.size() == data_paths.size()); + return Status::OK(); +} + +Directory* DBImpl::Directories::GetDataDir(size_t path_id) { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; +} + +Status DBImpl::Recover( + const std::vector& column_families, bool read_only, + bool error_if_log_file_exist) { + mutex_.AssertHeld(); - s = env_->NewDirectory(dbname_, &db_directory_); + bool is_new_db = false; + assert(db_lock_ == nullptr); + if (!read_only) { + Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir, + db_options_.db_paths); if (!s.ok()) { return s; } @@ -1088,8 +1130,8 @@ Status DBImpl::FlushMemTableToOutputFile( FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots_.GetNewest(), job_context, log_buffer, - db_directory_.get(), GetCompressionFlush(*cfd->ioptions()), - stats_); + directories_.GetDbDir(), directories_.GetDataDir(0U), + GetCompressionFlush(*cfd->ioptions()), stats_); uint64_t file_number; Status s = flush_job.Run(&file_number); @@ -1331,11 +1373,11 @@ Status DBImpl::CompactFilesImpl( *c->mutable_cf_options(), &job_context, &log_buffer); }; - CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), - env_options_, versions_.get(), &shutting_down_, - &log_buffer, db_directory_.get(), stats_, - &snapshots_, is_snapshot_supported_, - table_cache_, std::move(yield_callback)); + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), 
env_options_, + versions_.get(), &shutting_down_, &log_buffer, directories_.GetDbDir(), + directories_.GetDataDir(c->GetOutputPathId()), stats_, &snapshots_, + is_snapshot_supported_, table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); @@ -1510,8 +1552,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, - mutable_cf_options, &edit, &mutex_, db_directory_.get()); + status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, + directories_.GetDbDir()); superversion_to_free = InstallSuperVersion( cfd, new_superversion, mutable_cf_options); new_superversion = nullptr; @@ -2136,9 +2178,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply( - c->column_family_data(), *c->mutable_cf_options(), c->edit(), - &mutex_, db_directory_.get()); + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", @@ -2164,8 +2206,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), - c->edit(), &mutex_, db_directory_.get()); + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); // Use latest MutableCFOptions InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); @@ -2190,11 +2232,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, *c->mutable_cf_options(), job_context, log_buffer); }; - CompactionJob compaction_job(c.get(), db_options_, *c->mutable_cf_options(), - env_options_, versions_.get(), &shutting_down_, - log_buffer, db_directory_.get(), stats_, - &snapshots_, is_snapshot_supported_, - table_cache_, std::move(yield_callback)); + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), env_options_, + versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), + directories_.GetDataDir(c->GetOutputPathId()), stats_, &snapshots_, + is_snapshot_supported_, table_cache_, std::move(yield_callback)); compaction_job.Prepare(); mutex_.Unlock(); status = compaction_job.Run(); @@ -2600,7 +2642,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, // ColumnFamilyData object s = versions_->LogAndApply( nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, - &mutex_, db_directory_.get(), false, &cf_options); + &mutex_, directories_.GetDbDir(), false, &cf_options); write_thread_.ExitWriteThread(&w, &w, s); } if (s.ok()) { @@ -3059,6 +3101,13 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } else { status = log_->file()->Sync(); } + if (status.ok() && log_dir_unsynced_) { + // We only sync WAL directory the first time WAL syncing is + // requested, so that in case users never turn on WAL sync, + // we can avoid the disk I/O in the write code path. 
+ status = directories_.GetWalDir()->Fsync(); + } + log_dir_unsynced_ = false; } } if (status.ok()) { @@ -3193,14 +3242,15 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, { if (creating_new_log) { s = env_->NewWritableFile( - LogFileName(db_options_.wal_dir, new_log_number), - &lfile, env_->OptimizeForLogWrite(env_options_)); + LogFileName(db_options_.wal_dir, new_log_number), &lfile, + env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize( 1.1 * mutable_cf_options.write_buffer_size); new_log = new log::Writer(std::move(lfile)); + log_dir_unsynced_ = true; } } @@ -3497,7 +3547,7 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, db_directory_.get()); + &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionBackground(cfd, &job_context, *cfd->GetLatestMutableCFOptions()); @@ -3745,7 +3795,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); impl->DeleteObsoleteFiles(); - s = impl->db_directory_->Fsync(); + s = impl->directories_.GetDbDir()->Fsync(); } } diff --git a/db/db_impl.h b/db/db_impl.h index 4664a3d60..24c952b1d 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -422,6 +422,7 @@ class DBImpl : public DB { port::CondVar bg_cv_; uint64_t logfile_number_; unique_ptr log_; + bool log_dir_unsynced_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; @@ -445,7 +446,34 @@ class DBImpl : public DB { bool is_snapshot_supported_; - std::unique_ptr db_directory_; + // Class to maintain directories for all database paths other than main one. + class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id); + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; + + Status CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) const; + }; + + Directories directories_; WriteBuffer write_buffer_; diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index a014726ab..0ca21e6ce 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -272,6 +272,7 @@ class FaultInjectionTestEnv : public EnvWrapper { } void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); dir_to_new_files_since_last_sync_.erase(dirname); } @@ -630,31 +631,21 @@ TEST(FaultInjectionTest, FaultTest) { NoWriteTestPreFault(); NoWriteTestReopenWithFault(kResetDropUnsyncedData); - // TODO(t6070540) Need to sync WAL Dir and other DB paths too. - // Setting a separate data path won't pass the test as we don't sync // it after creating new files, - if (option_config_ != kDifferentDataDir) { - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // Since we don't sync WAL Dir, this test dosn't pass. 
- if (option_config_ != kWalDirSyncWal) { - PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, - num_pre_sync, num_post_sync); - } - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); - - PartialCompactTestPreFault(num_pre_sync, num_post_sync); - // No new files created so we expect all values since no files will be - // dropped. - // WAL Dir is not synced for now. - if (option_config_ != kWalDir && option_config_ != kWalDirSyncWal) { - PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, - num_pre_sync + num_post_sync, 0); - } - NoWriteTestPreFault(); - NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); - } + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); } } while (ChangeOptions()); } diff --git a/db/flush_job.cc b/db/flush_job.cc index ccc0245a3..8cf4daa49 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -58,6 +58,7 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, port::Mutex* db_mutex, std::atomic* shutting_down, SequenceNumber newest_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, CompressionType output_compression, Statistics* stats) : dbname_(dbname), cfd_(cfd), @@ -71,6 +72,7 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, job_context_(job_context), log_buffer_(log_buffer), db_directory_(db_directory), + output_file_directory_(output_file_directory), output_compression_(output_compression), stats_(stats) {} @@ -125,10 +127,9 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, db_mutex_->AssertHeld(); const uint64_t start_micros = db_options_.env->NowMicros(); FileMetaData meta; - + // path 0 for level 0 file. meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); *filenumber = meta.fd.GetNumber(); - // path 0 for level 0 file. 
const SequenceNumber earliest_seqno_in_memtable = mems[0]->GetFirstSequenceNumber(); @@ -169,9 +170,8 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); - - if (!db_options_.disableDataSync && db_directory_ != nullptr) { - db_directory_->Fsync(); + if (!db_options_.disableDataSync && output_file_directory_ != nullptr) { + output_file_directory_->Fsync(); } db_mutex_->Lock(); } diff --git a/db/flush_job.h b/db/flush_job.h index 394a7a45e..0b8491484 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -57,7 +57,8 @@ class FlushJob { port::Mutex* db_mutex, std::atomic* shutting_down, SequenceNumber newest_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, - CompressionType output_compression, Statistics* stats); + Directory* output_file_directory, CompressionType output_compression, + Statistics* stats); ~FlushJob() {} Status Run(uint64_t* file_number = nullptr); @@ -77,6 +78,7 @@ class FlushJob { JobContext* job_context_; LogBuffer* log_buffer_; Directory* db_directory_; + Directory* output_file_directory_; CompressionType output_compression_; Statistics* stats_; }; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 7d779b58f..2f4f08b2e 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -86,7 +86,7 @@ TEST(FlushJobTest, Empty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - SequenceNumber(), &job_context, nullptr, nullptr, + SequenceNumber(), &job_context, nullptr, nullptr, nullptr, kNoCompression, nullptr); ASSERT_OK(flush_job.Run()); job_context.Clean(); @@ -110,7 +110,7 @@ TEST(FlushJobTest, NonEmpty) { FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, &shutting_down_, - SequenceNumber(), &job_context, nullptr, nullptr, + SequenceNumber(), &job_context, nullptr, nullptr, nullptr, kNoCompression, nullptr); mutex_.Lock(); ASSERT_OK(flush_job.Run()); From c1de6c42a0cabd1180b52f34b4b52d00290b7446 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 26 Jan 2015 15:22:18 -0800 Subject: [PATCH 746/829] fault_injection_test: add a test case to drop random number of unsynced data Summary: Currently fault_injection_test has a test case to drop all the unsynced data. Add one more case to take a randomized bytes from it. 
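
Illustration only, not part of the patch: the new reset mode keeps everything written before the last sync plus a uniformly random prefix of the unsynced tail. A minimal standalone sketch of that choice, with std::mt19937_64 standing in for the test's shared rocksdb Random object and PickTruncationPoint being an invented name:

  #include <cstdint>
  #include <cstdio>
  #include <random>

  // Pick how much of a file survives a simulated crash: everything up to the
  // last sync is durable, plus a random prefix of the unsynced tail.
  uint64_t PickTruncationPoint(int64_t pos_at_last_sync, uint64_t file_size,
                               std::mt19937_64& rng) {
    uint64_t sync_pos =
        pos_at_last_sync < 0 ? 0 : static_cast<uint64_t>(pos_at_last_sync);
    if (file_size <= sync_pos) {
      return file_size;  // nothing unsynced to drop
    }
    // Choose a size uniformly in [sync_pos, file_size).
    std::uniform_int_distribution<uint64_t> dist(sync_pos, file_size - 1);
    return dist(rng);
  }

  int main() {
    std::mt19937_64 rng(301);  // any seed
    printf("truncate to %llu of 4096 bytes (1024 synced)\n",
           (unsigned long long)PickTruncationPoint(1024, 4096, rng));
    return 0;
  }

The patch expresses the same idea inside FileState::DropRandomUnsyncedData() as sync_pos + rand->Uniform(pos_ - sync_pos), followed by a call to Truncate().
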
Test Plan: Run the test Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32229 --- db/fault_injection_test.cc | 47 +++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 0ca21e6ce..e2c4629d2 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -114,6 +114,8 @@ struct FileState { bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } Status DropUnsyncedData() const; + + Status DropRandomUnsyncedData(Random* rand) const; }; } // anonymous namespace @@ -226,7 +228,9 @@ class FaultInjectionTestEnv : public EnvWrapper { db_file_state_[state.filename_] = state; } - Status DropUnsyncedFileData() { + // For every file that is not fully synced, make a call to `func` with + // FileState of the file as the parameter. + Status DropFileData(std::function func) { Status s; MutexLock l(&mutex_); for (std::map::const_iterator it = @@ -234,12 +238,23 @@ class FaultInjectionTestEnv : public EnvWrapper { s.ok() && it != db_file_state_.end(); ++it) { const FileState& state = it->second; if (!state.IsFullySynced()) { - s = state.DropUnsyncedData(); + s = func(state); } } return s; } + Status DropUnsyncedFileData() { + return DropFileData( + [&](const FileState& state) { return state.DropUnsyncedData(); }); + } + + Status DropRandomUnsyncedFileData(Random* rnd) { + return DropFileData([&](const FileState& state) { + return state.DropRandomUnsyncedData(rnd); + }); + } + Status DeleteFilesCreatedAfterLastDirSync() { // Because DeleteFile access this container make a copy to avoid deadlock std::map> map_copy; @@ -296,6 +311,15 @@ Status FileState::DropUnsyncedData() const { return Truncate(filename_, sync_pos); } +Status FileState::DropRandomUnsyncedData(Random* rand) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 
0 : pos_at_last_sync_; + assert(pos_ >= sync_pos); + int range = static_cast(pos_ - sync_pos); + uint64_t truncated_size = + static_cast(sync_pos) + rand->Uniform(range); + return Truncate(filename_, truncated_size); +} + Status TestDirectory::Fsync() { env_->SyncDir(dirname_); return dir_->Fsync(); @@ -373,6 +397,7 @@ class FaultInjectionTest { enum ExpectedVerifResult { kValExpectFound, kValExpectNoError }; enum ResetMethod { kResetDropUnsyncedData, + kResetDropRandomUnsyncedData, kResetDeleteUnsyncedFiles, kResetDropAndDeleteUnsynced }; @@ -563,11 +588,15 @@ class FaultInjectionTest { db_->Flush(flush_options); } - void ResetDBState(ResetMethod reset_method) { + // rnd cannot be null for kResetDropRandomUnsyncedData + void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) { switch (reset_method) { case kResetDropUnsyncedData: ASSERT_OK(env_->DropUnsyncedFileData()); break; + case kResetDropRandomUnsyncedData: + ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd)); + break; case kResetDeleteUnsyncedFiles: ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); break; @@ -595,11 +624,11 @@ class FaultInjectionTest { } void PartialCompactTestReopenWithFault(ResetMethod reset_method, - int num_pre_sync, - int num_post_sync) { + int num_pre_sync, int num_post_sync, + Random* rnd = nullptr) { env_->SetFilesystemActive(false); CloseDB(); - ResetDBState(reset_method); + ResetDBState(reset_method, rnd); ASSERT_OK(OpenDB()); ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); ASSERT_OK(Verify(num_pre_sync, num_post_sync, @@ -631,6 +660,12 @@ TEST(FaultInjectionTest, FaultTest) { NoWriteTestPreFault(); NoWriteTestReopenWithFault(kResetDropUnsyncedData); + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData, + num_pre_sync, num_post_sync, &rnd); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + // Setting a separate data path won't pass the test as we don't sync // it after creating new files, PartialCompactTestPreFault(num_pre_sync, num_post_sync); From be8f0b12edd000804b4fe48fe89d5cb8813fa40f Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 26 Jan 2015 15:48:59 -0800 Subject: [PATCH 747/829] Rename DBImpl::log_dir_unsynced_ to log_dir_synced_ Summary: log_dir_unsynced_ is a confusing name. Rename it to log_dir_synced_ and flip the value. Test Plan: Run ./fault_injection_test Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32235 --- db/db_impl.cc | 8 ++++---- db/db_impl.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 27c5a998b..2be3d2359 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -201,7 +201,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), - log_dir_unsynced_(true), + log_dir_synced_(false), log_empty_(true), default_cf_handle_(nullptr), total_log_size_(0), @@ -3101,13 +3101,13 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } else { status = log_->file()->Sync(); } - if (status.ok() && log_dir_unsynced_) { + if (status.ok() && !log_dir_synced_) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, // we can avoid the disk I/O in the write code path. 
status = directories_.GetWalDir()->Fsync(); } - log_dir_unsynced_ = false; + log_dir_synced_ = true; } } if (status.ok()) { @@ -3250,7 +3250,7 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, lfile->SetPreallocationBlockSize( 1.1 * mutable_cf_options.write_buffer_size); new_log = new log::Writer(std::move(lfile)); - log_dir_unsynced_ = true; + log_dir_synced_ = false; } } diff --git a/db/db_impl.h b/db/db_impl.h index 24c952b1d..70fa14727 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -422,7 +422,7 @@ class DBImpl : public DB { port::CondVar bg_cv_; uint64_t logfile_number_; unique_ptr log_; - bool log_dir_unsynced_; + bool log_dir_synced_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; From c4fb83441cf0084d4f35ddb14b56b05f1c31d6fc Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 27 Jan 2015 10:30:35 -0800 Subject: [PATCH 748/829] Update the comment for the removal of mac-install-gflags.sh Summary: Update the comment for the removal of mac-install-gflags.sh Test Plan: n/a Reviewers: igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32295 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c9ce01eab..58451ace2 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -32,7 +32,7 @@ # 2. Once install, add the include path/lib path for gflags to CPATH and # LIBRARY_PATH respectively. If installed with default mode, the # lib and include path will be /usr/local/lib and /usr/local/include -# Mac user can do this by running build_tools/mac-install-gflags.sh +# Mac user can do this by having brew installed and running brew install gflags OUTPUT=$1 if test -z "$OUTPUT"; then From 7ffcc457ffad8abd6938296185a0fb8ea292a906 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 24 Jan 2015 23:38:43 +0100 Subject: [PATCH 749/829] [RocksJava] Cleanup portal.h Simple Java Native Objects usually are represented using the same functionality but within different classes. With this commit a template class was introduced to remove the redundant impelementation to a certain extent. --- java/rocksjni/portal.h | 562 ++++++++--------------------------------- 1 file changed, 108 insertions(+), 454 deletions(-) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 771223dba..443761c52 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -34,36 +34,45 @@ inline Status check_if_jlong_fits_size_t(const jlong& jvalue) { return s; } -// The portal class for org.rocksdb.RocksDB -class RocksDBJni { +// Native class template +template class RocksDBNativeClass { public: - // Get the java class id of org.rocksdb.RocksDB. - static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/RocksDB"); + // Get the java class id + static jclass getJClass(JNIEnv* env, const char* jclazz_name) { + jclass jclazz = env->FindClass(jclazz_name); assert(jclazz != nullptr); return jclazz; } - // Get the field id of the member variable of org.rocksdb.RocksDB - // that stores the pointer to rocksdb::DB. 
+ // Get the field id of the member variable to store + // the ptr static jfieldID getHandleFieldID(JNIEnv* env) { static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); + DERIVED::getJClass(env), "nativeHandle_", "J"); assert(fid != nullptr); return fid; } - // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB. - static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) { - return reinterpret_cast( - env->GetLongField(jdb, getHandleFieldID(env))); + // Get the pointer from Java + static PTR getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); } - // Pass the rocksdb::DB pointer to the java side. - static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) { + // Pass the pointer to the java side. + static void setHandle(JNIEnv* env, jobject jdb, PTR ptr) { env->SetLongField( jdb, getHandleFieldID(env), - reinterpret_cast(db)); + reinterpret_cast(ptr)); + } +}; + +// The portal class for org.rocksdb.RocksDB +class RocksDBJni : public RocksDBNativeClass { + public: + // Get the java class id of org.rocksdb.RocksDB. + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB"); } }; @@ -96,67 +105,21 @@ class RocksDBExceptionJni { } }; -class OptionsJni { +// The portal class for org.rocksdb.Options +class OptionsJni : public RocksDBNativeClass { public: - // Get the java class id of org.rocksdb.Options. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/Options"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Options - // that stores the pointer to rocksdb::Options - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::Options - static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::Options pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options"); } }; -class DBOptionsJni { +// The portal class for org.rocksdb.DBOptions +class DBOptionsJni : public RocksDBNativeClass { public: - // Get the java class id of org.rocksdb.DBOptions. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/DBOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.DBOptions - // that stores the pointer to rocksdb::DBOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::DBOptions - static rocksdb::DBOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::DBOptions pointer to the java side. 
- static void setHandle(JNIEnv* env, jobject jobj, rocksdb::DBOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions"); } }; @@ -188,149 +151,54 @@ class ColumnFamilyDescriptorJni { } }; -class ColumnFamilyOptionsJni { +// The portal class for org.rocksdb.ColumnFamilyOptions +class ColumnFamilyOptionsJni : public RocksDBNativeClass< + rocksdb::ColumnFamilyOptions*, ColumnFamilyOptionsJni> { public: - // Get the java class id of org.rocksdb.ColumnFamilyOptions. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.DBOptions - // that stores the pointer to rocksdb::ColumnFamilyOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::ColumnFamilyOptions - static rocksdb::ColumnFamilyOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::ColumnFamilyOptions pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, - rocksdb::ColumnFamilyOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyOptions"); } }; -class WriteOptionsJni { +// The portal class for org.rocksdb.WriteOptions +class WriteOptionsJni : public RocksDBNativeClass< + rocksdb::WriteOptions*, WriteOptionsJni> { public: - // Get the java class id of org.rocksdb.WriteOptions. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.WriteOptions - // that stores the pointer to rocksdb::WriteOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::WriteOptions - static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::WriteOptions pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteOptions"); } }; - -class ReadOptionsJni { +// The portal class for org.rocksdb.ReadOptions +class ReadOptionsJni : public RocksDBNativeClass< + rocksdb::ReadOptions*, ReadOptionsJni> { public: - // Get the java class id of org.rocksdb.ReadOptions. 
static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/ReadOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.ReadOptions - // that stores the pointer to rocksdb::ReadOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::ReadOptions - static rocksdb::ReadOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::ReadOptions pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, - rocksdb::ReadOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ReadOptions"); } }; - -class WriteBatchJni { +// The portal class for org.rocksdb.ReadOptions +class WriteBatchJni : public RocksDBNativeClass< + rocksdb::WriteBatch*, WriteBatchJni> { public: static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteBatch"); - assert(jclazz != nullptr); - return jclazz; - } - - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::WriteBatch of the specified - // org.rocksdb.WriteBatch. - static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) { - return reinterpret_cast( - env->GetLongField(jwb, getHandleFieldID(env))); - } - - // Pass the rocksdb::WriteBatch pointer to the java side. - static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) { - env->SetLongField( - jwb, getHandleFieldID(env), - reinterpret_cast(wb)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch"); } }; -class WriteBatchHandlerJni { +// The portal class for org.rocksdb.WriteBatch.Handler +class WriteBatchHandlerJni : public RocksDBNativeClass< + const rocksdb::WriteBatchHandlerJniCallback*, + WriteBatchHandlerJni> { public: static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteBatch$Handler"); - assert(jclazz != nullptr); - return jclazz; - } - - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch$Handler"); } // Get the java method `put` of org.rocksdb.WriteBatch.Handler. @@ -372,53 +240,15 @@ class WriteBatchHandlerJni { assert(mid != nullptr); return mid; } - - // Get the pointer to rocksdb::WriteBatchHandlerJniCallback of the specified - // org.rocksdb.WriteBatchHandler. - static rocksdb::WriteBatchHandlerJniCallback* getHandle( - JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::WriteBatchHandlerJniCallback pointer to the java side. 
- static void setHandle( - JNIEnv* env, jobject jobj, - const rocksdb::WriteBatchHandlerJniCallback* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); - } }; -class WriteBatchWithIndexJni { +// The portal class for org.rocksdb.WriteBatchWithIndex +class WriteBatchWithIndexJni : public RocksDBNativeClass< + rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> { public: static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteBatchWithIndex"); - assert(jclazz != nullptr); - return jclazz; - } - - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::WriteBatchWithIndex of the specified - // org.rocksdb.WriteBatchWithIndex. - static rocksdb::WriteBatchWithIndex* getHandle(JNIEnv* env, jobject jwbwi) { - return reinterpret_cast( - env->GetLongField(jwbwi, getHandleFieldID(env))); - } - - // Pass the rocksdb::WriteBatchWithIndex pointer to the java side. - static void setHandle(JNIEnv* env, jobject jwbwi, - rocksdb::WriteBatchWithIndex* wbwi) { - env->SetLongField( - jwbwi, getHandleFieldID(env), - reinterpret_cast(wbwi)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch"); } }; @@ -431,212 +261,74 @@ class HistogramDataJni { } }; -class BackupableDBOptionsJni { +// The portal class for org.rocksdb.WriteBatchWithIndex +class BackupableDBOptionsJni : public RocksDBNativeClass< + rocksdb::BackupableDBOptions*, BackupableDBOptionsJni> { public: - // Get the java class id of org.rocksdb.BackupableDBOptions. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/BackupableDBOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.BackupableDBOptions - // that stores the pointer to rocksdb::BackupableDBOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::BackupableDBOptions - static rocksdb::BackupableDBOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::BackupableDBOptions pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, rocksdb::BackupableDBOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/BackupableDBOptions"); } }; -class IteratorJni { +// The portal class for org.rocksdb.RocksIterator +class IteratorJni : public RocksDBNativeClass< + rocksdb::Iterator*, IteratorJni> { public: - // Get the java class id of org.rocksdb.Iteartor. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/RocksIterator"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Iterator - // that stores the pointer to rocksdb::Iterator. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::Iterator. 
- static rocksdb::Iterator* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::Iterator pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, rocksdb::Iterator* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/RocksIterator"); } }; -class FilterJni { +// The portal class for org.rocksdb.Filter +class FilterJni : public RocksDBNativeClass< + std::shared_ptr*, FilterJni> { public: - // Get the java class id of org.rocksdb.FilterPolicy. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/Filter"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Filter - // that stores the pointer to rocksdb::FilterPolicy. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::FilterPolicy. - static std::shared_ptr* getHandle( - JNIEnv* env, jobject jobj) { - return reinterpret_cast - *>( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::FilterPolicy pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, std::shared_ptr* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/Filter"); } }; -class ColumnFamilyHandleJni { +// The portal class for org.rocksdb.ColumnFamilyHandle +class ColumnFamilyHandleJni : public RocksDBNativeClass< + rocksdb::ColumnFamilyHandle*, ColumnFamilyHandleJni> { public: - // Get the java class id of org.rocksdb.ColumnFamilyHandle. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyHandle"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.ColumnFamilyHandle. - // that stores the pointer to rocksdb::ColumnFamilyHandle. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::ColumnFamilyHandle. - static rocksdb::ColumnFamilyHandle* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the rocksdb::ColumnFamilyHandle pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::ColumnFamilyHandle* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyHandle"); } }; -class FlushOptionsJni { +// The portal class for org.rocksdb.FlushOptions +class FlushOptionsJni : public RocksDBNativeClass< + rocksdb::FlushOptions*, FlushOptionsJni> { public: - // Get the java class id of org.rocksdb.FlushOptions. - static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/FlushOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.FlushOptions - // that stores the pointer to rocksdb::FlushOptions. 
- static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Pass the FlushOptions pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, - const rocksdb::FlushOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); - } + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/FlushOptions"); + } }; -class ComparatorOptionsJni { +// The portal class for org.rocksdb.ComparatorOptions +class ComparatorOptionsJni : public RocksDBNativeClass< + rocksdb::ComparatorJniCallbackOptions*, ComparatorOptionsJni> { public: - // Get the java class id of org.rocksdb.ComparatorOptions. - static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/ComparatorOptions"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.ComparatorOptions - // that stores the pointer to rocksdb::ComparatorJniCallbackOptions. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Pass the ComparatorJniCallbackOptions pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, - const rocksdb::ComparatorJniCallbackOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); - } + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ComparatorOptions"); + } }; -class AbstractComparatorJni { +// The portal class for org.rocksdb.AbstractComparator +class AbstractComparatorJni : public RocksDBNativeClass< + const rocksdb::BaseComparatorJniCallback*, + AbstractComparatorJni> { public: - // Get the java class id of org.rocksdb.Comparator. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/AbstractComparator"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Comparator - // that stores the pointer to rocksdb::Comparator. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractComparator"); } // Get the java method `name` of org.rocksdb.Comparator. @@ -673,53 +365,15 @@ class AbstractComparatorJni { assert(mid != nullptr); return mid; } - - // Get the pointer to ComparatorJniCallback. - static rocksdb::BaseComparatorJniCallback* getHandle( - JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the ComparatorJniCallback pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::BaseComparatorJniCallback* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); - } }; -class AbstractSliceJni { +// The portal class for org.rocksdb.AbstractSlice +class AbstractSliceJni : public RocksDBNativeClass< + const rocksdb::Slice*, AbstractSliceJni> { public: - // Get the java class id of org.rocksdb.Slice. 
static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/AbstractSlice"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Slice - // that stores the pointer to rocksdb::Slice. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to Slice. - static rocksdb::Slice* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); - } - - // Pass the Slice pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::Slice* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractSlice"); } }; From b3c1331488b573e741ae9fb53913af9ccb27c713 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 24 Jan 2015 23:43:46 +0100 Subject: [PATCH 750/829] [RocksJava] Removed todo comment in portal.h As jclass instances shall not be cached, both todos are obsolete and can be removed. --- java/rocksjni/portal.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 443761c52..234d338ff 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -567,9 +567,6 @@ class WriteTypeJni { private: // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteType. static jclass getJClass(JNIEnv* env) { - // TODO(AR) setting the jclazz var to static causes getEnum to fail - // occasionally (e.g. in WriteBatchWithIndex#iterator() test) with - // SIGSEGV but I have no idea why... jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator$WriteType"); assert(jclazz != nullptr); return jclazz; @@ -577,9 +574,6 @@ class WriteTypeJni { // Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType static jobject getEnum(JNIEnv* env, const char name[]) { - // TODO(AR) setting the jclazz var to static causes getEnum to fail - // occasionally (e.g. in WriteBatchWithIndex#iterator() test) with - // SIGSEGV but I have no idea why... jclass jclazz = getJClass(env); jfieldID jfid = env->GetStaticFieldID(jclazz, name, From f8dc5c459f19e94988c8ca1bdb9b7befb71eefa8 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 24 Jan 2015 23:58:04 +0100 Subject: [PATCH 751/829] [RocksJava] Add missing test to Makefile --- java/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/java/Makefile b/java/Makefile index 32717ddd8..42f465e10 100644 --- a/java/Makefile +++ b/java/Makefile @@ -59,6 +59,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.ColumnFamilyTest\ org.rocksdb.test.ComparatorOptionsTest\ org.rocksdb.test.ComparatorTest\ + org.rocksdb.test.CompressionOptionsTest\ org.rocksdb.test.DBOptionsTest\ org.rocksdb.test.DirectComparatorTest\ org.rocksdb.test.DirectSliceTest\ From ca2b00277e1eb3bfd74f213ed425f04eabf891de Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 25 Jan 2015 00:24:44 +0100 Subject: [PATCH 752/829] [RocksJava] Cleanup portal.h & tests Summary: Simple Java Native Objects usually are represented using the same functionality but within different classes. With this commit a template class was introduced to remove the redundant impelementation to a certain extent. [RocksJava] Removed todo comment in portal.h As jclass instances shall not be cached, both todos are obsolete and can be removed. 
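
The new RocksDBNativeClass follows the curiously recurring template pattern: the base supplies the nativeHandle_ plumbing once and reaches back into the concrete portal class for anything class specific. Below is a compilable toy analogue with the JNI types removed so it stands alone; NativeClassBase, FakeOptions, FakeOptionsPortal and their methods are invented for illustration and are not part of this change:

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // The base implements the shared handle plumbing once and reaches back
  // into the concrete portal (CRTP) for anything class specific.
  template <class PTR, class DERIVED>
  class NativeClassBase {
   public:
    // jlong is a 64-bit integer, so pointers round-trip through int64_t.
    static int64_t ToHandleField(PTR ptr) {
      return reinterpret_cast<int64_t>(ptr);
    }
    static PTR FromHandleField(int64_t field) {
      return reinterpret_cast<PTR>(field);
    }
    static const char* Describe() { return DERIVED::JavaClassName(); }
  };

  struct FakeOptions {  // stand-in for rocksdb::Options, illustration only
    int block_size = 4096;
  };

  class FakeOptionsPortal
      : public NativeClassBase<FakeOptions*, FakeOptionsPortal> {
   public:
    static const char* JavaClassName() { return "org/rocksdb/Options"; }
  };

  int main() {
    FakeOptions opt;
    int64_t field = FakeOptionsPortal::ToHandleField(&opt);
    assert(FakeOptionsPortal::FromHandleField(field) == &opt);
    printf("portal for %s\n", FakeOptionsPortal::Describe());
    return 0;
  }

In the real header the same shape is filled in by RocksDBNativeClass, whose handle accessors look up nativeHandle_ on the class returned by the derived portal's getJClass().
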
[RocksJava] Add missing test to Makefile [RocksJava] Added tests for uncovered methods Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom package Reviewers: adamretter, yhchiang, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32145 --- java/org/rocksdb/test/ColumnFamilyTest.java | 10 +++++++++- java/org/rocksdb/test/DirectSliceTest.java | 4 +++- java/org/rocksdb/test/FlushTest.java | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/org/rocksdb/test/ColumnFamilyTest.java index fb95e8010..bf568b5e8 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/org/rocksdb/test/ColumnFamilyTest.java @@ -259,7 +259,8 @@ public class ColumnFamilyTest { new ArrayList<>(); List columnFamilyHandleList = new ArrayList<>(); - cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions().setMergeOperator(new StringAppendOperator()))); cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), @@ -268,6 +269,10 @@ public class ColumnFamilyTest { WriteBatch writeBatch = new WriteBatch(); WriteOptions writeOpt = new WriteOptions(); writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), + "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), + "merge".getBytes()); writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), @@ -283,6 +288,9 @@ public class ColumnFamilyTest { assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))).isEqualTo("value2"); assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), + "mergeKey".getBytes()))).isEqualTo("merge,merge"); } finally { if (db != null) { db.close(); diff --git a/java/org/rocksdb/test/DirectSliceTest.java b/java/org/rocksdb/test/DirectSliceTest.java index a50664867..20a44a904 100644 --- a/java/org/rocksdb/test/DirectSliceTest.java +++ b/java/org/rocksdb/test/DirectSliceTest.java @@ -48,8 +48,10 @@ public class DirectSliceTest { DirectSlice directSlice = null; try { byte[] data = "Some text".getBytes(); - ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); + ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1); buffer.put(data); + buffer.put(data.length, (byte)0); + directSlice = new DirectSlice(buffer); assertThat(directSlice.toString()).isEqualTo("Some text"); } finally { diff --git a/java/org/rocksdb/test/FlushTest.java b/java/org/rocksdb/test/FlushTest.java index 9dea7e753..3bfdb3114 100644 --- a/java/org/rocksdb/test/FlushTest.java +++ b/java/org/rocksdb/test/FlushTest.java @@ -36,6 +36,7 @@ public class FlushTest { wOpt = new WriteOptions(); flushOptions = new FlushOptions(); flushOptions.setWaitForFlush(true); + assertThat(flushOptions.waitForFlush()).isTrue(); wOpt.setDisableWAL(true); db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); db.put(wOpt, "key1".getBytes(), "value1".getBytes()); From 1b43ab58d96a44af2dc43cdac2216658755dbcb4 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 27 Jan 2015 13:44:04 -0800 Subject: [PATCH 753/829] fault_injection_test: add more logging and makes synchronization 
slightly stronger Summary: We see failure of the test in travis but I can't repro it. Add more logging in failure cases to help us figure out which failure it is. Also makes synchronization slightly stronger, though there isn't seem to be a problem without it Test Plan: Run the test Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32319 --- db/fault_injection_test.cc | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index e2c4629d2..b6a63c36d 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -72,8 +72,11 @@ Status Truncate(const std::string& filename, uint64_t length) { unique_ptr orig_file; const EnvOptions options; Status s = env->NewSequentialFile(filename, &orig_file, options); - if (!s.ok()) + if (!s.ok()) { + fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), + s.ToString().c_str()); return s; + } char* scratch = new char[length]; rocksdb::Slice result; @@ -87,10 +90,16 @@ Status Truncate(const std::string& filename, uint64_t length) { if (s.ok()) { s = env->RenameFile(tmp_name, filename); } else { + fprintf(stderr, "Cannot renmae file %s to %s: %s\n", tmp_name.c_str(), + filename.c_str(), s.ToString().c_str()); env->DeleteFile(tmp_name); } } } + if (!s.ok()) { + fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), + s.ToString().c_str()); + } delete[] scratch; @@ -194,6 +203,10 @@ class FaultInjectionTestEnv : public EnvWrapper { virtual Status DeleteFile(const std::string& f) { Status s = EnvWrapper::DeleteFile(f); + if (!s.ok()) { + fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), + s.ToString().c_str()); + } ASSERT_OK(s); if (s.ok()) { UntrackFile(f); @@ -275,7 +288,7 @@ class FaultInjectionTestEnv : public EnvWrapper { MutexLock l(&mutex_); db_file_state_.clear(); dir_to_new_files_since_last_sync_.clear(); - SetFilesystemActive(true); + SetFilesystemActiveNoLock(true); } void UntrackFile(const std::string& f) { @@ -295,8 +308,15 @@ class FaultInjectionTestEnv : public EnvWrapper { // system reset. Setting to inactive will freeze our saved filesystem state so // that it will stop being recorded. It can then be reset back to the state at // the time of the reset. - bool IsFilesystemActive() const { return filesystem_active_; } - void SetFilesystemActive(bool active) { filesystem_active_ = active; } + bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; } + void SetFilesystemActive(bool active) { + MutexLock l(&mutex_); + SetFilesystemActiveNoLock(active); + } private: port::Mutex mutex_; From 4c49fedaf15bca9f8cb82d92a8dc934e7d1a3309 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 26 Jan 2015 20:23:15 -0800 Subject: [PATCH 754/829] Use ustricter consistency in thread local operations Summary: ThreadSanitizer complains data race of super version and version's destructor with Get(). This patch will fix those warning. The warning is likely from ColumnFamilyData::ReturnThreadLocalSuperVersion(). With relaxed consistency of CAS, reading the data of the super version can technically happen after swapping it in, enabling the background thread to clean it up. 
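
For readers unfamiliar with the orderings involved, here is a small self-contained illustration of the acquire/release pairing the thread-local code moves to. It is not RocksDB code; SuperVersionLike, slot, Publisher and Consumer are invented stand-ins for the pointer kept in a thread-local entry and the threads that exchange it. With relaxed ordering on both sides, the assert below could in principle fire on weakly ordered hardware, which is the class of race the change guards against:

  #include <atomic>
  #include <cassert>
  #include <thread>

  struct SuperVersionLike {  // invented stand-in for the published object
    int data = 0;
  };

  std::atomic<SuperVersionLike*> slot{nullptr};  // plays the thread-local entry

  void Publisher() {
    auto* sv = new SuperVersionLike();
    sv->data = 42;                              // write the payload first
    slot.store(sv, std::memory_order_release);  // then publish the pointer
  }

  void Consumer() {
    SuperVersionLike* sv = nullptr;
    while ((sv = slot.load(std::memory_order_acquire)) == nullptr) {
    }
    // The acquire load pairs with the release store above, so the payload
    // write is guaranteed to be visible here.
    assert(sv->data == 42);
    delete sv;
  }

  int main() {
    std::thread t1(Publisher);
    std::thread t2(Consumer);
    t1.join();
    t2.join();
    return 0;
  }
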
Test Plan: make all check Reviewers: rven, igor, yhchiang Reviewed By: yhchiang Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32265 --- util/thread_local.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/util/thread_local.cc b/util/thread_local.cc index 60e418dff..af0c8e12b 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -36,7 +36,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { // Unref stored pointers of current thread from all instances uint32_t id = 0; for (auto& e : tls->entries) { - void* raw = e.ptr.load(std::memory_order_relaxed); + void* raw = e.ptr.load(); if (raw != nullptr) { auto unref = inst->GetHandler(id); if (unref != nullptr) { @@ -109,7 +109,7 @@ void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { if (UNLIKELY(id >= tls->entries.size())) { return nullptr; } - return tls->entries[id].ptr.load(std::memory_order_relaxed); + return tls->entries[id].ptr.load(std::memory_order_acquire); } void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { @@ -119,7 +119,7 @@ void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { MutexLock l(&mutex_); tls->entries.resize(id + 1); } - tls->entries[id].ptr.store(ptr, std::memory_order_relaxed); + tls->entries[id].ptr.store(ptr, std::memory_order_release); } void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { @@ -129,7 +129,7 @@ void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { MutexLock l(&mutex_); tls->entries.resize(id + 1); } - return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed); + return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire); } bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, @@ -140,8 +140,8 @@ bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, MutexLock l(&mutex_); tls->entries.resize(id + 1); } - return tls->entries[id].ptr.compare_exchange_strong(expected, ptr, - std::memory_order_relaxed, std::memory_order_relaxed); + return tls->entries[id].ptr.compare_exchange_strong( + expected, ptr, std::memory_order_release, std::memory_order_relaxed); } void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, @@ -150,7 +150,7 @@ void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, for (ThreadData* t = head_.next; t != &head_; t = t->next) { if (id < t->entries.size()) { void* ptr = - t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed); + t->entries[id].ptr.exchange(replacement, std::memory_order_acquire); if (ptr != nullptr) { ptrs->push_back(ptr); } @@ -198,8 +198,7 @@ void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { auto unref = GetHandler(id); for (ThreadData* t = head_.next; t != &head_; t = t->next) { if (id < t->entries.size()) { - void* ptr = - t->entries[id].ptr.exchange(nullptr, std::memory_order_relaxed); + void* ptr = t->entries[id].ptr.exchange(nullptr); if (ptr != nullptr && unref != nullptr) { unref(ptr); } From e919ecedfc18d1e2007711ddf2dc0f641f689528 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 27 Jan 2015 13:57:44 -0800 Subject: [PATCH 755/829] SuperVersion::Unref() to use sequential consistency to decrease ref counting Summary: I'm not sure the expected results of std::atomic::fetch_sub() when using memory_order_relaxed, and I suspect TSAN complains. 
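
For context, the textbook recipe for an atomic reference count is sketched below. This is illustrative only and not the SuperVersion implementation: it uses memory_order_acq_rel on the decrement, a common sufficient choice, whereas the patch simply drops the explicit relaxed argument so fetch_sub(1) falls back to the sequentially consistent default, the most conservative option:

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  // Increments may be relaxed, but the decrement needs release semantics and
  // the thread that reaches zero needs acquire semantics, so that all prior
  // writes to the object are visible before it is torn down.
  struct RefCounted {
    std::atomic<uint32_t> refs{1};

    void Ref() { refs.fetch_add(1, std::memory_order_relaxed); }

    // Returns true iff the caller just dropped the last reference.
    bool Unref() {
      uint32_t previous = refs.fetch_sub(1, std::memory_order_acq_rel);
      assert(previous > 0);
      return previous == 1;
    }
  };

  int main() {
    RefCounted rc;
    rc.Ref();
    assert(!rc.Unref());
    assert(rc.Unref());  // last reference gone; the owner may now delete
    return 0;
  }
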
Test Plan: make all check Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32259 --- db/column_family.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/column_family.cc b/db/column_family.cc index e6e75aad9..be01a2993 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -181,7 +181,7 @@ SuperVersion* SuperVersion::Ref() { bool SuperVersion::Unref() { // fetch_sub returns the previous value of ref - uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed); + uint32_t previous_refs = refs.fetch_sub(1); assert(previous_refs > 0); return previous_refs == 1; } From f9758e01297c6c5211397515d4befd89f89b35de Mon Sep 17 00:00:00 2001 From: Ori Bernstein Date: Tue, 27 Jan 2015 14:44:02 -0800 Subject: [PATCH 756/829] Add compaction listener. Summary: This adds a listener for compactions, and gives some useful statistics on each compaction pass. Test Plan: Unit tests. Reviewers: sdong, igor, rven, yhchiang Reviewed By: yhchiang Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D31641 --- db/column_family.cc | 26 +++++++++++++++++ db/column_family.h | 2 ++ db/db_impl.cc | 29 ++++++++++++++++-- db/db_impl.h | 3 ++ db/listener_test.cc | 60 ++++++++++++++++++++++++++++++++++++-- include/rocksdb/listener.h | 29 ++++++++++++++++++ 6 files changed, 145 insertions(+), 4 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index be01a2993..d3ff9b3f5 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -556,6 +556,32 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } +void ColumnFamilyData::NotifyOnCompactionCompleted( + DB* db, Compaction* c, const Status& status) { +#ifndef ROCKSDB_LITE + auto listeners = ioptions()->listeners; + CompactionJobInfo info; + info.cf_name = c->column_family_data()->GetName(); + info.status = status; + info.output_level = c->output_level(); + for (const auto fmd : *c->inputs(c->level())) { + info.input_files.push_back( + TableFileName(options_.db_paths, + fmd->fd.GetNumber(), + fmd->fd.GetPathId())); + } + for (const auto newf : c->edit()->GetNewFiles()) { + info.input_files.push_back( + TableFileName(options_.db_paths, + newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + for (auto listener : listeners) { + listener->OnCompactionCompleted(db, info); + } +#endif // ROCKSDB_LITE +} + void ColumnFamilyData::NotifyOnFlushCompleted( DB* db, const std::string& file_path, bool triggered_flush_slowdown, diff --git a/db/column_family.h b/db/column_family.h index a1a9e8034..8101e7032 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -261,6 +261,8 @@ class ColumnFamilyData { void ResetThreadLocalSuperVersions(); + void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status); + void NotifyOnFlushCompleted( DB* db, const std::string& file_path, bool triggered_flush_slowdown, diff --git a/db/db_impl.cc b/db/db_impl.cc index 2be3d2359..8e8f3b733 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1433,6 +1433,28 @@ Status DBImpl::CompactFilesImpl( } #endif // ROCKSDB_LITE +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction *c, const Status &st) { +#ifndef ROCKSDB_LITE + if (cfd->ioptions()->listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + notifying_events_++; + // release lock while notifying events + mutex_.Unlock(); + 
cfd->NotifyOnCompactionCompleted(this, c, st); + mutex_.Lock(); + notifying_events_--; + assert(notifying_events_ >= 0); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. +#endif // ROCKSDB_LITE +} + Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { #ifdef ROCKSDB_LITE @@ -2186,7 +2208,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); - c->ReleaseCompactionFiles(status); *madeProgress = true; } else if (!is_manual && c->IsTrivialMove()) { // Instrument for event update @@ -2221,7 +2242,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); - c->ReleaseCompactionFiles(status); *madeProgress = true; // Clear Instrument @@ -2246,6 +2266,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, InstallSuperVersionBackground(c->column_family_data(), job_context, *c->mutable_cf_options()); } + *madeProgress = true; + } + // FIXME(orib): should I check if column family data is null? + if (c != nullptr) { + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status); c->ReleaseCompactionFiles(status); *madeProgress = true; } diff --git a/db/db_impl.h b/db/db_impl.h index 70fa14727..3b3376665 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -268,6 +268,9 @@ class DBImpl : public DB { void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number, const MutableCFOptions& mutable_cf_options); + void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, + Compaction *c, const Status &st); + void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const; diff --git a/db/listener_test.cc b/db/listener_test.cc index dfc075803..a1577fba2 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -2,7 +2,6 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
- #include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" @@ -144,12 +143,69 @@ class EventListenerTest { } } - DB* db_; std::string dbname_; std::vector handles_; }; +class TestCompactionListener : public EventListener { + public: + void OnCompactionCompleted(DB *db, + int input_level, + int output_level, + const std::vector& input_files) { + compacted_dbs_.push_back(db); + } + + std::vector compacted_dbs_; +}; + +TEST(EventListenerTest, OnSingleDBCompactionTest) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + const int kNumL0Files = 4; + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + + TestCompactionListener* listener = new TestCompactionListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (size_t i = 1; i < 8; ++i) { + ASSERT_OK(Flush(static_cast(i))); + const Slice kStart = "a"; + const Slice kEnd = "z"; + ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd)); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + + ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); + for (size_t i = 0; i < cf_names.size(); ++i) { + ASSERT_EQ(listener->compacted_dbs_[i], db_); + } +} + class TestFlushListener : public EventListener { public: void OnFlushCompleted( diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 4ad1ae04b..be5b96032 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -7,6 +7,7 @@ #ifndef ROCKSDB_LITE #include +#include #include "rocksdb/status.h" namespace rocksdb { @@ -14,6 +15,19 @@ namespace rocksdb { class DB; class Status; +struct CompactionJobInfo { + // the name of the column family where the compaction happened. + std::string cf_name; + // the status indicating whether the compaction was successful or not. + Status status; + // the output level of the compaction. + int output_level; + // the names of the compaction input files. + std::vector input_files; + // the names of the compaction output files. + std::vector output_files; +}; + // EventListener class contains a set of call-back functions that will // be called when specific RocksDB event happens such as flush. 
It can // be used as a building block for developing custom features such as @@ -58,6 +72,21 @@ class EventListener { const std::string& file_path, bool triggered_writes_slowdown, bool triggered_writes_stop) {} + + // A call-back function for RocksDB which will be called whenever + // a registered RocksDB compacts a file. The default implementation + // is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just compacted + // a file. + // @param ci a reference to a CompactionJobInfo struct. 'ci' is released + // after this function is returned, and must be copied if it is needed + // outside of this function. + virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {} virtual ~EventListener() {} }; From d6c7300ccfef3554da0ffbc874fd261cf2b1de98 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 27 Jan 2015 15:01:04 -0800 Subject: [PATCH 757/829] Fixed a compile warning in clang in db/listener_test.cc Summary: Fixed a compile warning in clang in db/listener_test.cc Test Plan: make listener_test Reviewers: oridb Reviewed By: oridb Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32337 --- db/listener_test.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/db/listener_test.cc b/db/listener_test.cc index a1577fba2..80d4d4cd1 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -150,10 +150,7 @@ class EventListenerTest { class TestCompactionListener : public EventListener { public: - void OnCompactionCompleted(DB *db, - int input_level, - int output_level, - const std::vector& input_files) { + void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override { compacted_dbs_.push_back(db); } From ea7d0b943a07ee54bba826e30d022927376c6f77 Mon Sep 17 00:00:00 2001 From: alabid Date: Tue, 27 Jan 2015 19:37:36 -0500 Subject: [PATCH 758/829] Added WriteBatch block to simple_example.cc --- examples/simple_example.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 20e7faa4b..28a7c9e8b 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -27,14 +27,28 @@ int main() { assert(s.ok()); // Put key-value - s = db->Put(WriteOptions(), "key", "value"); + s = db->Put(WriteOptions(), "key1", "value"); assert(s.ok()); std::string value; // get value - s = db->Get(ReadOptions(), "key", &value); + s = db->Get(ReadOptions(), "key1", &value); assert(s.ok()); assert(value == "value"); + // atomically apply a set of updates + { + WriteBatch batch; + batch.Delete("key1"); + batch.Put("key2", value); + s = db->Write(WriteOptions(), &batch); + } + + s = db->Get(ReadOptions(), "key1", &value); + assert(s.IsNotFound()); + + db->Get(ReadOptions(), "key2", &value); + assert(value == "value"); + delete db; return 0; From e8bf2310a0b2db7fb824256c1febbb080c533b91 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 27 Jan 2015 16:55:33 -0800 Subject: [PATCH 759/829] Remove blob store from the codebase Summary: We don't have plans to work on this in the short term. If we ever resurrect the project, we can find the code in the history. 
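For illustration only (this block is not part of the patches above): the listener patches add CompactionJobInfo, carrying cf_name, status, output_level and the input/output file lists, and give EventListener an OnCompactionCompleted(DB*, const CompactionJobInfo&) hook that is registered through Options::listeners, as the new listener_test does. Below is a minimal sketch of an application-side listener; the class name CompactionCounter is made up, and it assumes the headers rocksdb/db.h, rocksdb/listener.h and rocksdb/options.h plus an Options::listeners vector of shared_ptr<EventListener>.

#include <atomic>
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/options.h"

// Hypothetical listener: counts compactions and copies the fields it needs
// out of CompactionJobInfo, because 'ci' is only valid while the callback
// runs. The callback should return quickly or it may block RocksDB.
class CompactionCounter : public rocksdb::EventListener {
 public:
  void OnCompactionCompleted(rocksdb::DB* db,
                             const rocksdb::CompactionJobInfo& ci) override {
    last_cf_name_ = ci.cf_name;          // copied, not referenced
    last_output_level_ = ci.output_level;
    num_compactions_.fetch_add(1, std::memory_order_relaxed);
  }

  // Reads of the last_* fields from other threads would need external
  // synchronization; only the counter is safe to poll concurrently.
  std::atomic<int> num_compactions_{0};
  std::string last_cf_name_;
  int last_output_level_ = 0;
};

// Registration sketch, mirroring the new listener_test:
//   rocksdb::Options options;
//   options.listeners.emplace_back(std::make_shared<CompactionCounter>());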
No need for it to linger around Test Plan: no test Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32349 --- Makefile | 8 -- tools/blob_store_bench.cc | 292 -------------------------------------- util/blob_store.cc | 272 ----------------------------------- util/blob_store.h | 163 --------------------- util/blob_store_test.cc | 200 -------------------------- 5 files changed, 935 deletions(-) delete mode 100644 tools/blob_store_bench.cc delete mode 100644 util/blob_store.cc delete mode 100644 util/blob_store.h delete mode 100644 util/blob_store_test.cc diff --git a/Makefile b/Makefile index 237eebdf5..7d0e7275a 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,6 @@ TESTS = \ dbformat_test \ env_test \ fault_injection_test \ - blob_store_test \ filelock_test \ filename_test \ block_based_filter_block_test \ @@ -188,7 +187,6 @@ TOOLS = \ ldb \ db_repl_stress \ options_test \ - blob_store_bench PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test memtablerep_bench $(TOOLS) @@ -356,9 +354,6 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - signal_test: util/signal_test.o $(LIBOBJECTS) $(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -389,9 +384,6 @@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL) - $(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) - stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc deleted file mode 100644 index 0daae1a11..000000000 --- a/tools/blob_store_bench.cc +++ /dev/null @@ -1,292 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#ifndef ROCKSDB_LITE -#include -#include -#include - -#include "rocksdb/env.h" -#include "util/blob_store.h" -#include "util/testutil.h" - -#define KB 1024LL -#define MB 1024*1024LL -// BlobStore does costly asserts to make sure it's running correctly, which -// significantly impacts benchmark runtime. -// NDEBUG will compile out those asserts. 
-#ifndef NDEBUG -#define NDEBUG -#endif - -using namespace rocksdb; -using namespace std; - -// used by all threads -uint64_t timeout_sec; -Env *env; -BlobStore* bs; - -namespace { -std::string RandomString(Random* rnd, uint64_t len) { - std::string r; - test::RandomString(rnd, static_cast(len), &r); - return r; -} -} // namespace - -struct Result { - uint32_t writes; - uint32_t reads; - uint32_t deletes; - uint64_t data_written; - uint64_t data_read; - - void print() { - printf("Total writes = %u\n", writes); - printf("Total reads = %u\n", reads); - printf("Total deletes = %u\n", deletes); - printf("Write throughput = %lf MB/s\n", - (double)data_written / (1024*1024.0) / timeout_sec); - printf("Read throughput = %lf MB/s\n", - (double)data_read / (1024*1024.0) / timeout_sec); - printf("Total throughput = %lf MB/s\n", - (double)(data_read + data_written) / (1024*1024.0) / timeout_sec); - } - - Result() { - writes = reads = deletes = data_read = data_written = 0; - } - - Result(uint32_t _writes, uint32_t _reads, uint32_t _deletes, - uint64_t _data_written, uint64_t _data_read) - : writes(_writes), - reads(_reads), - deletes(_deletes), - data_written(_data_written), - data_read(_data_read) {} -}; - -namespace { -Result operator + (const Result &a, const Result &b) { - return Result(a.writes + b.writes, a.reads + b.reads, - a.deletes + b.deletes, a.data_written + b.data_written, - a.data_read + b.data_read); -} -} // namespace - -struct WorkerThread { - uint64_t data_size_from, data_size_to; - double read_ratio; - uint64_t working_set_size; // start deleting once you reach this - Result result; - atomic stopped; - - WorkerThread(uint64_t _data_size_from, uint64_t _data_size_to, - double _read_ratio, uint64_t _working_set_size) - : data_size_from(_data_size_from), - data_size_to(_data_size_to), - read_ratio(_read_ratio), - working_set_size(_working_set_size), - stopped(false) {} - - WorkerThread(const WorkerThread& wt) : - data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), - read_ratio(wt.read_ratio), working_set_size(wt.working_set_size), - stopped(false) {} -}; - -static void WorkerThreadBody(void* arg) { - WorkerThread* t = reinterpret_cast(arg); - Random rnd(5); - string buf; - vector> blobs; - vector random_strings; - - for (int i = 0; i < 10; ++i) { - random_strings.push_back(RandomString(&rnd, t->data_size_to)); - } - - uint64_t total_size = 0; - - uint64_t start_micros = env->NowMicros(); - while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) { - if (blobs.size() && rand() < RAND_MAX * t->read_ratio) { - // read - int bi = rand() % blobs.size(); - Status s = bs->Get(blobs[bi].first, &buf); - assert(s.ok()); - t->result.data_read += buf.size(); - t->result.reads++; - } else { - // write - uint64_t size = rand() % (t->data_size_to - t->data_size_from) + - t->data_size_from; - total_size += size; - string put_str = random_strings[rand() % random_strings.size()]; - blobs.push_back(make_pair(Blob(), size)); - Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first); - assert(s.ok()); - t->result.data_written += size; - t->result.writes++; - } - - while (total_size >= t->working_set_size) { - // delete random - int bi = rand() % blobs.size(); - total_size -= blobs[bi].second; - bs->Delete(blobs[bi].first); - blobs.erase(blobs.begin() + bi); - t->result.deletes++; - } - } - t->stopped.store(true); -} - -namespace { -Result StartBenchmark(vector& config) { - for (auto w : config) { - env->StartThread(WorkerThreadBody, w); - } - - Result result; - - for 
(auto w : config) { - while (!w->stopped.load()); - result = result + w->result; - } - - for (auto w : config) { - delete w; - } - - delete bs; - - return result; -} - -vector SetupBenchmarkBalanced() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.5; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} - -vector SetupBenchmarkWriteHeavy() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.1; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} - -vector SetupBenchmarkReadHeavy() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.9; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} -} // namespace - -int main(int argc, const char** argv) { - srand(33); - env = Env::Default(); - - { - printf("--- Balanced read/write benchmark ---\n"); - vector config = SetupBenchmarkBalanced(); - Result r = StartBenchmark(config); - r.print(); - } - { - printf("--- Write heavy benchmark ---\n"); - vector config = SetupBenchmarkWriteHeavy(); - Result r = StartBenchmark(config); - r.print(); - } - { - printf("--- Read heavy benchmark ---\n"); - vector config = SetupBenchmarkReadHeavy(); - Result r = StartBenchmark(config); - r.print(); - } - - return 0; -} -#else // ROCKSDB_LITE -#include -int main(int argc, char** argv) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/util/blob_store.cc b/util/blob_store.cc deleted file mode 100644 index 80dfba512..000000000 --- a/util/blob_store.cc +++ /dev/null @@ -1,272 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef ROCKSDB_LITE -#include "util/blob_store.h" - -namespace rocksdb { - -using namespace std; - -// BlobChunk -bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const { - // overlapping!? - assert(!Overlap(chunk)); - // size == 0 is a marker, not a block - return size != 0 && - bucket_id == chunk.bucket_id && - offset + size == chunk.offset; -} - -bool BlobChunk::Overlap(const BlobChunk &chunk) const { - return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id && - ((offset >= chunk.offset && offset < chunk.offset + chunk.size) || - (chunk.offset >= offset && chunk.offset < offset + size)); -} - -// Blob -string Blob::ToString() const { - string ret; - for (auto chunk : chunks) { - PutFixed32(&ret, chunk.bucket_id); - PutFixed32(&ret, chunk.offset); - PutFixed32(&ret, chunk.size); - } - return ret; -} - -Blob::Blob(const std::string& blob) { - for (uint32_t i = 0; i < blob.size(); ) { - uint32_t t[3] = {0}; - for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size(); - ++j, i += sizeof(uint32_t)) { - t[j] = DecodeFixed32(blob.data() + i); - } - chunks.push_back(BlobChunk(t[0], t[1], t[2])); - } -} - -// FreeList -Status FreeList::Free(const Blob& blob) { - // add it back to the free list - for (auto chunk : blob.chunks) { - free_blocks_ += chunk.size; - if (fifo_free_chunks_.size() && - fifo_free_chunks_.back().ImmediatelyBefore(chunk)) { - fifo_free_chunks_.back().size += chunk.size; - } else { - fifo_free_chunks_.push_back(chunk); - } - } - - return Status::OK(); -} - -Status FreeList::Allocate(uint32_t blocks, Blob* blob) { - if (free_blocks_ < blocks) { - return Status::Incomplete(""); - } - - blob->chunks.clear(); - free_blocks_ -= blocks; - - while (blocks > 0) { - assert(fifo_free_chunks_.size() > 0); - auto& front = fifo_free_chunks_.front(); - if (front.size > blocks) { - blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks)); - front.offset += blocks; - front.size -= blocks; - blocks = 0; - } else { - blob->chunks.push_back(front); - blocks -= front.size; - fifo_free_chunks_.pop_front(); - } - } - assert(blocks == 0); - - return Status::OK(); -} - -bool FreeList::Overlap(const Blob &blob) const { - for (auto chunk : blob.chunks) { - for (auto itr = fifo_free_chunks_.begin(); - itr != fifo_free_chunks_.end(); - ++itr) { - if (itr->Overlap(chunk)) { - return true; - } - } - } - return false; -} - -// BlobStore -BlobStore::BlobStore(const string& directory, - uint64_t block_size, - uint32_t blocks_per_bucket, - uint32_t max_buckets, - Env* env) : - directory_(directory), - block_size_(block_size), - blocks_per_bucket_(blocks_per_bucket), - env_(env), - max_buckets_(max_buckets) { - env_->CreateDirIfMissing(directory_); - - storage_options_.use_mmap_writes = false; - storage_options_.use_mmap_reads = false; - - buckets_size_ = 0; - buckets_ = new unique_ptr[max_buckets_]; - - CreateNewBucket(); -} - -BlobStore::~BlobStore() { - // TODO we don't care about recovery for now - delete [] buckets_; -} - -Status BlobStore::Put(const Slice& value, Blob* blob) { - // convert size to number of blocks - Status s = Allocate( - static_cast((value.size() + block_size_ - 1) / block_size_), - blob); - if (!s.ok()) { - return s; - } - auto size_left = (uint64_t) value.size(); - - uint64_t offset = 0; // in bytes, not blocks - for (auto chunk : blob->chunks) { - uint64_t write_size = min(chunk.size * block_size_, size_left); - assert(chunk.bucket_id < buckets_size_); - s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_, - 
Slice(value.data() + offset, - write_size)); - if (!s.ok()) { - Delete(*blob); - return s; - } - offset += write_size; - size_left -= write_size; - if (write_size < chunk.size * block_size_) { - // if we have any space left in the block, fill it up with zeros - string zero_string(chunk.size * block_size_ - write_size, 0); - s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ + - write_size, - Slice(zero_string)); - } - } - - if (size_left > 0) { - Delete(*blob); - return Status::Corruption("Tried to write more data than fits in the blob"); - } - - return Status::OK(); -} - -Status BlobStore::Get(const Blob& blob, - string* value) const { - { - // assert that it doesn't overlap with free list - // it will get compiled out for release - MutexLock l(&free_list_mutex_); - assert(!free_list_.Overlap(blob)); - } - - value->resize(blob.Size() * block_size_); - - uint64_t offset = 0; // in bytes, not blocks - for (auto chunk : blob.chunks) { - Slice result; - assert(chunk.bucket_id < buckets_size_); - Status s; - s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_, - chunk.size * block_size_, - &result, - &value->at(offset)); - if (!s.ok()) { - value->clear(); - return s; - } - if (result.size() < chunk.size * block_size_) { - value->clear(); - return Status::Corruption("Could not read in from file"); - } - offset += chunk.size * block_size_; - } - - // remove the '\0's at the end of the string - value->erase(find(value->begin(), value->end(), '\0'), value->end()); - - return Status::OK(); -} - -Status BlobStore::Delete(const Blob& blob) { - MutexLock l(&free_list_mutex_); - return free_list_.Free(blob); -} - -Status BlobStore::Sync() { - for (size_t i = 0; i < buckets_size_; ++i) { - Status s = buckets_[i].get()->Sync(); - if (!s.ok()) { - return s; - } - } - return Status::OK(); -} - -Status BlobStore::Allocate(uint32_t blocks, Blob* blob) { - MutexLock l(&free_list_mutex_); - Status s; - - s = free_list_.Allocate(blocks, blob); - if (!s.ok()) { - s = CreateNewBucket(); - if (!s.ok()) { - return s; - } - s = free_list_.Allocate(blocks, blob); - } - - return s; -} - -// called with free_list_mutex_ held -Status BlobStore::CreateNewBucket() { - MutexLock l(&buckets_mutex_); - - if (buckets_size_ >= max_buckets_) { - return Status::NotSupported("Max size exceeded\n"); - } - - int new_bucket_id = buckets_size_; - - char fname[200]; - sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id); - - Status s = env_->NewRandomRWFile(string(fname), - &buckets_[new_bucket_id], - storage_options_); - if (!s.ok()) { - return s; - } - - // whether Allocate succeeds or not, does not affect the overall correctness - // of this function - calling Allocate is really optional - // (also, tmpfs does not support allocate) - buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_); - - buckets_size_ = new_bucket_id + 1; - - return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_)); -} - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/util/blob_store.h b/util/blob_store.h deleted file mode 100644 index 917fb947e..000000000 --- a/util/blob_store.h +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef ROCKSDB_LITE -#pragma once -#include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" -#include "util/coding.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace rocksdb { - -struct BlobChunk { - uint32_t bucket_id; - uint32_t offset; // in blocks - uint32_t size; // in blocks - BlobChunk() {} - BlobChunk(uint32_t _bucket_id, uint32_t _offset, uint32_t _size) - : bucket_id(_bucket_id), offset(_offset), size(_size) {} - - // returns true if it's immediately before chunk - bool ImmediatelyBefore(const BlobChunk& chunk) const; - // returns true if chunks overlap - bool Overlap(const BlobChunk &chunk) const; -}; - -// We represent each Blob as a string in format: -// bucket_id offset size|bucket_id offset size... -// The string can be used to reference the Blob stored on external -// device/file -// Not thread-safe! -struct Blob { - // Generates the string - std::string ToString() const; - // Parses the previously generated string - explicit Blob(const std::string& blob); - // Creates unfragmented Blob - Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) { - SetOneChunk(bucket_id, offset, size); - } - Blob() {} - - void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) { - chunks.clear(); - chunks.push_back(BlobChunk(bucket_id, offset, size)); - } - - uint32_t Size() const { // in blocks - uint32_t ret = 0; - for (auto chunk : chunks) { - ret += chunk.size; - } - assert(ret > 0); - return ret; - } - - // bucket_id, offset, size - std::vector chunks; -}; - -// Keeps a list of free chunks -// NOT thread-safe. Externally synchronized -class FreeList { - public: - FreeList() : - free_blocks_(0) {} - ~FreeList() {} - - // Allocates a a blob. Stores the allocated blob in - // 'blob'. Returns non-OK status if it failed to allocate. - // Thread-safe - Status Allocate(uint32_t blocks, Blob* blob); - // Frees the blob for reuse. Thread-safe - Status Free(const Blob& blob); - - // returns true if blob is overlapping with any of the - // chunks stored in free list - bool Overlap(const Blob &blob) const; - - private: - std::deque fifo_free_chunks_; - uint32_t free_blocks_; - mutable port::Mutex mutex_; -}; - -// thread-safe -class BlobStore { - public: - // directory - wherever the blobs should be stored. It will be created - // if missing - // block_size - self explanatory - // blocks_per_bucket - how many blocks we want to keep in one bucket. - // Bucket is a device or a file that we use to store the blobs. - // If we don't have enough blocks to allocate a new blob, we will - // try to create a new file or device. - // max_buckets - maximum number of buckets BlobStore will create - // BlobStore max size in bytes is - // max_buckets * blocks_per_bucket * block_size - // env - env for creating new files - BlobStore(const std::string& directory, - uint64_t block_size, - uint32_t blocks_per_bucket, - uint32_t max_buckets, - Env* env); - ~BlobStore(); - - // Allocates space for value.size bytes (rounded up to be multiple of - // block size) and writes value.size bytes from value.data to a backing store. - // Sets Blob blob that can than be used for addressing the - // stored value. Returns non-OK status on error. - Status Put(const Slice& value, Blob* blob); - // Value needs to have enough space to store all the loaded stuff. - // This function is thread safe! 
- Status Get(const Blob& blob, std::string* value) const; - // Frees the blob for reuse, but does not delete the data - // on the backing store. - Status Delete(const Blob& blob); - // Sync all opened files that are modified - Status Sync(); - - private: - const std::string directory_; - // block_size_ is uint64_t because when we multiply with - // blocks_size_ we want the result to be uint64_t or - // we risk overflowing - const uint64_t block_size_; - const uint32_t blocks_per_bucket_; - Env* env_; - EnvOptions storage_options_; - // protected by free_list_mutex_ - FreeList free_list_; - // free_list_mutex_ is locked BEFORE buckets_mutex_ - mutable port::Mutex free_list_mutex_; - // protected by buckets_mutex_ - // array of buckets - unique_ptr* buckets_; - // number of buckets in the array - uint32_t buckets_size_; - uint32_t max_buckets_; - mutable port::Mutex buckets_mutex_; - - // Calls FreeList allocate. If free list can't allocate - // new blob, creates new bucket and tries again - // Thread-safe - Status Allocate(uint32_t blocks, Blob* blob); - - // Creates a new backing store and adds all the blocks - // from the new backing store to the free list - Status CreateNewBucket(); -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc deleted file mode 100644 index f199f5ddd..000000000 --- a/util/blob_store_test.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "util/blob_store.h" - -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/random.h" - -#include -#include - -namespace rocksdb { - -using namespace std; - -class BlobStoreTest { }; - -TEST(BlobStoreTest, RangeParseTest) { - Blob e; - for (int i = 0; i < 5; ++i) { - e.chunks.push_back(BlobChunk(rand(), rand(), rand())); - } - string x = e.ToString(); - Blob nx(x); - - ASSERT_EQ(nx.ToString(), x); -} - -// make sure we're reusing the freed space -TEST(BlobStoreTest, SanityTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 20; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 1000, - Env::Default()); - - string buf; - - // put string of size 170 - test::RandomString(&random, 170, &buf); - Blob r1; - ASSERT_OK(blob_store.Put(Slice(buf), &r1)); - // use the first file - for (size_t i = 0; i < r1.chunks.size(); ++i) { - ASSERT_EQ(r1.chunks[0].bucket_id, 0u); - } - - // put string of size 30 - test::RandomString(&random, 30, &buf); - Blob r2; - ASSERT_OK(blob_store.Put(Slice(buf), &r2)); - // use the first file - for (size_t i = 0; i < r2.chunks.size(); ++i) { - ASSERT_EQ(r2.chunks[0].bucket_id, 0u); - } - - // delete blob of size 170 - ASSERT_OK(blob_store.Delete(r1)); - - // put a string of size 100 - test::RandomString(&random, 100, &buf); - Blob r3; - ASSERT_OK(blob_store.Put(Slice(buf), &r3)); - // use the first file - for (size_t i = 0; i < r3.chunks.size(); ++i) { - ASSERT_EQ(r3.chunks[0].bucket_id, 0u); - } - - // put a string of size 70 - test::RandomString(&random, 70, &buf); - Blob r4; - ASSERT_OK(blob_store.Put(Slice(buf), &r4)); - // use the first file - for (size_t i = 0; i < r4.chunks.size(); ++i) { - ASSERT_EQ(r4.chunks[0].bucket_id, 0u); - } - - // put a string of size 5 - test::RandomString(&random, 5, &buf); - Blob r5; - 
ASSERT_OK(blob_store.Put(Slice(buf), &r5)); - // now you get to use the second file - for (size_t i = 0; i < r5.chunks.size(); ++i) { - ASSERT_EQ(r5.chunks[0].bucket_id, 1u); - } -} - -TEST(BlobStoreTest, FragmentedChunksTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 20; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 1000, - Env::Default()); - - string buf; - - vector r(4); - - // put 4 strings of size 50 - for (int k = 0; k < 4; ++k) { - test::RandomString(&random, 50, &buf); - ASSERT_OK(blob_store.Put(Slice(buf), &r[k])); - // use the first file - for (size_t i = 0; i < r[k].chunks.size(); ++i) { - ASSERT_EQ(r[k].chunks[0].bucket_id, 0u); - } - } - - // delete the first and third - ASSERT_OK(blob_store.Delete(r[0])); - ASSERT_OK(blob_store.Delete(r[2])); - - // put string of size 100. it should reuse space that we deleting - // by deleting first and third strings of size 50 - test::RandomString(&random, 100, &buf); - Blob r2; - ASSERT_OK(blob_store.Put(Slice(buf), &r2)); - // use the first file - for (size_t i = 0; i < r2.chunks.size(); ++i) { - ASSERT_EQ(r2.chunks[0].bucket_id, 0u); - } -} - -TEST(BlobStoreTest, CreateAndStoreTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 1000; - const int max_blurb_size = 300; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 10000, - Env::Default()); - vector> ranges; - - for (int i = 0; i < 2000; ++i) { - int decision = rand() % 5; - if (decision <= 2 || ranges.size() == 0) { - string buf; - int size_blocks = (rand() % max_blurb_size + 1); - int string_size = size_blocks * block_size - (rand() % block_size); - test::RandomString(&random, string_size, &buf); - Blob r; - ASSERT_OK(blob_store.Put(Slice(buf), &r)); - ranges.push_back(make_pair(r, buf)); - } else if (decision == 3) { - int ti = rand() % ranges.size(); - string out_buf; - ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf)); - ASSERT_EQ(ranges[ti].second, out_buf); - } else { - int ti = rand() % ranges.size(); - ASSERT_OK(blob_store.Delete(ranges[ti].first)); - ranges.erase(ranges.begin() + ti); - } - } - ASSERT_OK(blob_store.Sync()); -} - -TEST(BlobStoreTest, MaxSizeTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 100; - const int max_buckets = 10; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - max_buckets, - Env::Default()); - string buf; - for (int i = 0; i < max_buckets; ++i) { - test::RandomString(&random, 1000, &buf); - Blob r; - ASSERT_OK(blob_store.Put(Slice(buf), &r)); - } - - test::RandomString(&random, 1000, &buf); - Blob r; - // should fail because max size - Status s = blob_store.Put(Slice(buf), &r); - ASSERT_EQ(s.ok(), false); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} From f8f040ccc7baed78425dd55e996653e9e2ded9e4 Mon Sep 17 00:00:00 2001 From: alabid Date: Tue, 27 Jan 2015 19:39:39 -0500 Subject: [PATCH 760/829] Updated .gitignore to ignore *~ files and example object files --- .gitignore | 1 + examples/.gitignore | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 70316aebc..dfd3f4924 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ make_config.mk *.d-e *.o-* *.swp +*~ ldb manifest_dump diff --git a/examples/.gitignore b/examples/.gitignore index d3c22099a..5cb04d4b6 100644 --- 
a/examples/.gitignore +++ b/examples/.gitignore @@ -1,2 +1,4 @@ column_families_example simple_example +c_simple_example +compact_files_example From d2a2b058f089d9b0b9e70aa3b365a04008cdd504 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 27 Jan 2015 16:34:16 -0800 Subject: [PATCH 761/829] fault_injection_test: to support file closed after being deleted Summary: fault_injection_test occasionally fails because file closing can happen after deletion. Improve the test to support it. Test Plan: I have a new test case I'm working on, where the issue appears almost every time. With the patch, the problem goes away. Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32373 --- db/fault_injection_test.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index b6a63c36d..65d7444a9 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -90,7 +90,7 @@ Status Truncate(const std::string& filename, uint64_t length) { if (s.ok()) { s = env->RenameFile(tmp_name, filename); } else { - fprintf(stderr, "Cannot renmae file %s to %s: %s\n", tmp_name.c_str(), + fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), filename.c_str(), s.ToString().c_str()); env->DeleteFile(tmp_name); } @@ -193,7 +193,7 @@ class FaultInjectionTestEnv : public EnvWrapper { // again then it will be truncated - so forget our saved state. UntrackFile(fname); MutexLock l(&mutex_); - + open_files_.insert(fname); auto dir_and_name = GetDirAndName(fname); auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; list.insert(dir_and_name.second); @@ -238,7 +238,10 @@ class FaultInjectionTestEnv : public EnvWrapper { void WritableFileClosed(const FileState& state) { MutexLock l(&mutex_); - db_file_state_[state.filename_] = state; + if (open_files_.find(state.filename_) != open_files_.end()) { + db_file_state_[state.filename_] = state; + open_files_.erase(state.filename_); + } } // For every file that is not fully synced, make a call to `func` with @@ -280,6 +283,9 @@ class FaultInjectionTestEnv : public EnvWrapper { for (auto& pair : map_copy) { for (std::string name : pair.second) { Status s = DeleteFile(pair.first + "/" + name); + if (!s.ok()) { + return s; + } } } return Status::OK(); @@ -297,6 +303,7 @@ class FaultInjectionTestEnv : public EnvWrapper { dir_to_new_files_since_last_sync_[dir_and_name.first].erase( dir_and_name.second); db_file_state_.erase(f); + open_files_.erase(f); } void SyncDir(const std::string& dirname) { @@ -317,10 +324,12 @@ class FaultInjectionTestEnv : public EnvWrapper { MutexLock l(&mutex_); SetFilesystemActiveNoLock(active); } + void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); } private: port::Mutex mutex_; std::map db_file_state_; + std::set open_files_; std::unordered_map> dir_to_new_files_since_last_sync_; bool filesystem_active_; // Record flushes, syncs, writes @@ -610,6 +619,7 @@ class FaultInjectionTest { // rnd cannot be null for kResetDropRandomUnsyncedData void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) { + env_->AssertNoOpenFile(); switch (reset_method) { case kResetDropUnsyncedData: ASSERT_OK(env_->DropUnsyncedFileData()); From 560ed402bdd74497470948e9f570ed4fd60c83cb Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 27 Jan 2015 21:00:33 -0800 Subject: [PATCH 762/829] [minor] fprintf to stderr instead of stdout in test --- db/db_test.cc | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index c734351b5..535b800a9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -8139,7 +8139,7 @@ TEST(DBTest, Randomized) { } if ((step % 2000) == 0) { - fprintf(stdout, + fprintf(stderr, "DBTest.Randomized, option ID: %d, step: %d out of %d\n", option_config_, step, N); } From cc0d8be011e3dc89e0f88431bb1068d58060ca8b Mon Sep 17 00:00:00 2001 From: fyrz Date: Wed, 28 Jan 2015 21:54:01 +0100 Subject: [PATCH 763/829] [RocksJava] Integrated review comments (D32145) --- java/rocksjni/portal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 234d338ff..0c35eef4e 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -106,8 +106,8 @@ class RocksDBExceptionJni { }; // The portal class for org.rocksdb.Options -class OptionsJni : public RocksDBNativeClass { +class OptionsJni : public RocksDBNativeClass< + rocksdb::Options*, OptionsJni> { public: static jclass getJClass(JNIEnv* env) { return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options"); @@ -115,8 +115,8 @@ class OptionsJni : public RocksDBNativeClass { +class DBOptionsJni : public RocksDBNativeClass< + rocksdb::DBOptions*, DBOptionsJni> { public: static jclass getJClass(JNIEnv* env) { return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions"); From 0c4d1053dfe7e9ff2b2e936b9a4a09c35969504c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 28 Jan 2015 13:42:40 -0800 Subject: [PATCH 764/829] Fix data race #5 Summary: TSAN complained that these are non-atomic reads and writes from different threads. Test Plan: TSAN no longer complains Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32409 --- util/thread_status_util_debug.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/util/thread_status_util_debug.cc b/util/thread_status_util_debug.cc index 5378acaf8..5a86af26a 100644 --- a/util/thread_status_util_debug.cc +++ b/util/thread_status_util_debug.cc @@ -3,6 +3,8 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include + #include "rocksdb/env.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -11,30 +13,29 @@ namespace rocksdb { #ifndef NDEBUG // the delay for debugging purpose. 
-static int operations_delay[ThreadStatus::NUM_OP_TYPES] ={0}; -static int states_delay[ThreadStatus::NUM_STATE_TYPES] = {0}; +static std::atomic operations_delay[ThreadStatus::NUM_OP_TYPES]; +static std::atomic states_delay[ThreadStatus::NUM_STATE_TYPES]; void ThreadStatusUtil::TEST_SetStateDelay( const ThreadStatus::StateType state, int micro) { - states_delay[state] = micro; + states_delay[state].store(micro, std::memory_order_relaxed); } void ThreadStatusUtil::TEST_StateDelay( const ThreadStatus::StateType state) { Env::Default()->SleepForMicroseconds( - states_delay[state]); + states_delay[state].load(std::memory_order_relaxed)); } void ThreadStatusUtil::TEST_SetOperationDelay( const ThreadStatus::OperationType operation, int micro) { - operations_delay[operation] = micro; + operations_delay[operation].store(micro, std::memory_order_relaxed); } - void ThreadStatusUtil::TEST_OperationDelay( const ThreadStatus::OperationType operation) { Env::Default()->SleepForMicroseconds( - operations_delay[operation]); + operations_delay[operation].load(std::memory_order_relaxed)); } #endif // !NDEBUG From 10af17f3d7d9a79648fa95b674e3ebbc319dd7b1 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 27 Jan 2015 14:44:19 -0800 Subject: [PATCH 765/829] fault_injection_test: add a unit test to allow parallel compactions and multiple levels Summary: Add a new test case in fault_injection_test, which covers parallel compactions and multiple levels. Use MockEnv to run the new test case to speed it up. Improve MockEnv to avoid DestoryDB(), previously failed when deleting lock files. Test Plan: Run ./fault_injection_test, including valgrind Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32415 --- db/fault_injection_test.cc | 52 ++++++++++++++++++++---------- util/mock_env.cc | 65 ++++++++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 65d7444a9..8291f7287 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -23,6 +23,7 @@ #include "rocksdb/table.h" #include "rocksdb/write_batch.h" #include "util/logging.h" +#include "util/mock_env.h" #include "util/mutexlock.h" #include "util/testharness.h" #include "util/testutil.h" @@ -66,9 +67,7 @@ static std::pair GetDirAndName( } // A basic file truncation function suitable for this test. -Status Truncate(const std::string& filename, uint64_t length) { - rocksdb::Env* env = rocksdb::Env::Default(); - +Status Truncate(Env* env, const std::string& filename, uint64_t length) { unique_ptr orig_file; const EnvOptions options; Status s = env->NewSequentialFile(filename, &orig_file, options); @@ -122,9 +121,9 @@ struct FileState { bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } - Status DropUnsyncedData() const; + Status DropUnsyncedData(Env* env) const; - Status DropRandomUnsyncedData(Random* rand) const; + Status DropRandomUnsyncedData(Env* env, Random* rand) const; }; } // anonymous namespace @@ -246,7 +245,7 @@ class FaultInjectionTestEnv : public EnvWrapper { // For every file that is not fully synced, make a call to `func` with // FileState of the file as the parameter. 
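A side note on the data race fix just above (D32409): the debug-only delay arrays become std::atomic<int> and are read and written with std::memory_order_relaxed, which is enough when only the integer value itself must be race-free and nothing else is published through it. The following self-contained sketch shows that pattern; it is independent of the RocksDB code and all names in it are made up.

#include <atomic>
#include <cstdio>
#include <thread>

// Made-up debug knob: one thread tunes a delay while workers read it.
// Relaxed ordering suffices because no other memory is ordered by it.
static std::atomic<int> debug_delay_micros{0};

void SetDebugDelay(int micros) {
  debug_delay_micros.store(micros, std::memory_order_relaxed);
}

int GetDebugDelay() {
  return debug_delay_micros.load(std::memory_order_relaxed);
}

int main() {
  std::thread writer([] { SetDebugDelay(100); });
  std::thread reader([] { std::printf("delay=%d\n", GetDebugDelay()); });
  writer.join();
  reader.join();
  return 0;
}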
- Status DropFileData(std::function func) { + Status DropFileData(std::function func) { Status s; MutexLock l(&mutex_); for (std::map::const_iterator it = @@ -254,20 +253,21 @@ class FaultInjectionTestEnv : public EnvWrapper { s.ok() && it != db_file_state_.end(); ++it) { const FileState& state = it->second; if (!state.IsFullySynced()) { - s = func(state); + s = func(target(), state); } } return s; } Status DropUnsyncedFileData() { - return DropFileData( - [&](const FileState& state) { return state.DropUnsyncedData(); }); + return DropFileData([&](Env* env, const FileState& state) { + return state.DropUnsyncedData(env); + }); } Status DropRandomUnsyncedFileData(Random* rnd) { - return DropFileData([&](const FileState& state) { - return state.DropRandomUnsyncedData(rnd); + return DropFileData([&](Env* env, const FileState& state) { + return state.DropRandomUnsyncedData(env, rnd); }); } @@ -335,18 +335,18 @@ class FaultInjectionTestEnv : public EnvWrapper { bool filesystem_active_; // Record flushes, syncs, writes }; -Status FileState::DropUnsyncedData() const { +Status FileState::DropUnsyncedData(Env* env) const { ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; - return Truncate(filename_, sync_pos); + return Truncate(env, filename_, sync_pos); } -Status FileState::DropRandomUnsyncedData(Random* rand) const { +Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; assert(pos_ >= sync_pos); int range = static_cast(pos_ - sync_pos); uint64_t truncated_size = static_cast(sync_pos) + rand->Uniform(range); - return Truncate(filename_, truncated_size); + return Truncate(env, filename_, truncated_size); } Status TestDirectory::Fsync() { @@ -413,6 +413,7 @@ class FaultInjectionTest { kWalDir, kSyncWal, kWalDirSyncWal, + kMultiLevels, kEnd, }; int option_config_; @@ -431,6 +432,7 @@ class FaultInjectionTest { kResetDropAndDeleteUnsynced }; + std::unique_ptr base_env_; FaultInjectionTestEnv* env_; std::string dbname_; shared_ptr tiny_cache_; @@ -441,6 +443,7 @@ class FaultInjectionTest { : option_config_(kDefault), sync_use_wal_(false), sync_use_compact_(true), + base_env_(nullptr), env_(NULL), db_(NULL) { NewDB(); @@ -453,6 +456,9 @@ class FaultInjectionTest { if (option_config_ >= kEnd) { return false; } else { + if (option_config_ == kMultiLevels) { + base_env_.reset(new MockEnv(Env::Default())); + } return true; } } @@ -479,6 +485,19 @@ class FaultInjectionTest { sync_use_wal_ = true; sync_use_compact_ = false; break; + case kMultiLevels: + options.write_buffer_size = 64 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 128 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + sync_use_wal_ = true; + sync_use_compact_ = false; + break; default: break; } @@ -490,7 +509,8 @@ class FaultInjectionTest { assert(tiny_cache_ == nullptr); assert(env_ == NULL); - env_ = new FaultInjectionTestEnv(Env::Default()); + env_ = + new FaultInjectionTestEnv(base_env_ ? 
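The point of threading Env* through Truncate() and DropFileData() in this patch is dependency injection: helpers that previously hard-coded Env::Default() now operate on whatever Env the test supplies, so the same code also works against MockEnv. A small sketch of the same idea follows; FileExistsAndNonEmpty is a made-up helper, not RocksDB code, and only uses the public Env and Status interfaces.

#include <cstdint>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/status.h"

// Made-up helper: takes the Env explicitly instead of calling
// Env::Default(), so a test can pass a MockEnv (or any other Env)
// and the check runs against the same file system as the DB under test.
rocksdb::Status FileExistsAndNonEmpty(rocksdb::Env* env,
                                      const std::string& fname) {
  uint64_t size = 0;
  rocksdb::Status s = env->GetFileSize(fname, &size);
  if (!s.ok()) {
    return s;
  }
  return size > 0 ? rocksdb::Status::OK()
                  : rocksdb::Status::Corruption(fname + " is empty");
}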
base_env_.get() : Env::Default()); options_ = CurrentOptions(); options_.env = env_; diff --git a/util/mock_env.cc b/util/mock_env.cc index a88db18d5..c1b74a3d3 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -19,9 +19,11 @@ namespace rocksdb { class MemFile { public: - explicit MemFile(const std::string& fn) + explicit MemFile(const std::string& fn, bool _is_lock_file = false) : fn_(fn), refs_(0), + is_lock_file_(_is_lock_file), + locked_(false), size_(0), modified_time_(Now()), rnd_(static_cast( @@ -33,6 +35,25 @@ class MemFile { ++refs_; } + bool is_lock_file() const { return is_lock_file_; } + + bool Lock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + if (locked_) { + return false; + } else { + refs_ = true; + return true; + } + } + + void Unlock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + locked_ = false; + } + void Unref() { bool do_delete = false; { @@ -132,6 +153,8 @@ class MemFile { const std::string fn_; mutable port::Mutex mutex_; int refs_; + bool is_lock_file_; + bool locked_; // Data written into this file, all bytes before fsynced_bytes are // persistent. @@ -398,6 +421,9 @@ Status MockEnv::NewSequentialFile(const std::string& fname, return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return Status::InvalidArgument(fn, "Cannot open a lock file."); + } result->reset(new SequentialFileImpl(f)); return Status::OK(); } @@ -412,6 +438,9 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname, return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return Status::InvalidArgument(fn, "Cannot open a lock file."); + } result->reset(new RandomAccessFileImpl(f)); return Status::OK(); } @@ -424,7 +453,7 @@ Status MockEnv::NewWritableFile(const std::string& fname, if (file_map_.find(fn) != file_map_.end()) { DeleteFileInternal(fn); } - MemFile* file = new MemFile(fn); + MemFile* file = new MemFile(fn, false); file->Ref(); file_map_[fn] = file; @@ -490,12 +519,11 @@ Status MockEnv::GetChildren(const std::string& dir, void MockEnv::DeleteFileInternal(const std::string& fname) { assert(fname == NormalizePath(fname)); - if (file_map_.find(fname) == file_map_.end()) { - return; + const auto& pair = file_map_.find(fname); + if (pair != file_map_.end()) { + pair->second->Unref(); + file_map_.erase(fname); } - - file_map_[fname]->Unref(); - file_map_.erase(fname); } Status MockEnv::DeleteFile(const std::string& fname) { @@ -579,7 +607,7 @@ Status MockEnv::NewLogger(const std::string& fname, auto iter = file_map_.find(fn); MemFile* file = nullptr; if (iter == file_map_.end()) { - file = new MemFile(fn); + file = new MemFile(fn, false); file->Ref(); file_map_[fn] = file; } else { @@ -595,9 +623,18 @@ Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { - return Status::IOError(fn, "Lock file exists"); + if (!file_map_[fn]->is_lock_file()) { + return Status::InvalidArgument(fname, "Not a lock file."); + } + if (!file_map_[fn]->Lock()) { + return Status::IOError(fn, "Lock is already held."); + } + } else { + auto* file = new MemFile(fname, true); + file->Ref(); + file->Lock(); + file_map_[fname] = file; } - file_map_[fn] = nullptr; } *flock = new MockEnvFileLock(fn); return Status::OK(); @@ -607,9 +644,11 @@ Status MockEnv::UnlockFile(FileLock* flock) { std::string fn = dynamic_cast(flock)->FileName(); { MutexLock lock(&mutex_); - auto iter = file_map_.find(fn); - if (iter != 
file_map_.end()) { - file_map_.erase(fn); + if (file_map_.find(fn) != file_map_.end()) { + if (!file_map_[fn]->is_lock_file()) { + return Status::InvalidArgument(fn, "Not a lock file."); + } + file_map_[fn]->Unlock(); } } delete flock; From e5aab4c2b230b8e3171fb571c614a6f81121608d Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 28 Jan 2015 15:16:25 -0800 Subject: [PATCH 766/829] Fix data race in HashLinkList Summary: 1) need to do acquire load when read the first entry in the bucket. 2) Make num_entries atomic Test Plan: Ran DBTest.MultiThreaded with TSAN Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32361 --- util/hash_linklist_rep.cc | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index d8e6da6aa..3e98f3d00 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -29,7 +29,7 @@ typedef std::atomic Pointer; // A data structure used as the header of a link list of a hash bucket. struct BucketHeader { Pointer next; - uint32_t num_entries; + std::atomic num_entries; explicit BucketHeader(void* n, uint32_t count) : next(n), num_entries(count) {} @@ -37,6 +37,17 @@ struct BucketHeader { bool IsSkipListBucket() { return next.load(std::memory_order_relaxed) == this; } + + uint32_t GetNumEntries() const { + return num_entries.load(std::memory_order_relaxed); + } + + // REQUIRES: called from single-threaded Insert() + void IncNumEntries() { + // Only one thread can do write at one time. No need to do atomic + // incremental. Update it with relaxed load and store. + num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed); + } }; // A data structure used as the header of a skip list of a hash bucket. @@ -503,14 +514,14 @@ SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); if (header->IsSkipListBucket()) { - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); auto* skip_list_bucket_header = reinterpret_cast(header); assert(skip_list_bucket_header->Counting_header.next.load( std::memory_order_relaxed) == header); return skip_list_bucket_header; } - assert(header->num_entries <= threshold_use_skiplist_); + assert(header->GetNumEntries() <= threshold_use_skiplist_); return nullptr; } @@ -525,11 +536,11 @@ Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); if (!header->IsSkipListBucket()) { - assert(header->num_entries <= threshold_use_skiplist_); + assert(header->GetNumEntries() <= threshold_use_skiplist_); return reinterpret_cast( - header->next.load(std::memory_order_relaxed)); + header->next.load(std::memory_order_acquire)); } - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); return nullptr; } @@ -568,26 +579,28 @@ void HashLinkListRep::Insert(KeyHandle handle) { header = reinterpret_cast(first_next_pointer); if (header->IsSkipListBucket()) { // Case 4. 
Bucket is already a skip list - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); auto* skip_list_bucket_header = reinterpret_cast(header); - skip_list_bucket_header->Counting_header.num_entries++; + // Only one thread can execute Insert() at one time. No need to do atomic + // incremental. + skip_list_bucket_header->Counting_header.IncNumEntries(); skip_list_bucket_header->skip_list.Insert(x->key); return; } } if (bucket_entries_logging_threshold_ > 0 && - header->num_entries == + header->GetNumEntries() == static_cast(bucket_entries_logging_threshold_)) { Info(logger_, "HashLinkedList bucket %zu has more than %d " "entries. Key to insert: %s", - GetHash(transformed), header->num_entries, + GetHash(transformed), header->GetNumEntries(), GetLengthPrefixedSlice(x->key).ToString(true).c_str()); } - if (header->num_entries == threshold_use_skiplist_) { + if (header->GetNumEntries() == threshold_use_skiplist_) { // Case 3. number of entries reaches the threshold so need to convert to // skip list. LinkListIterator bucket_iter( @@ -595,7 +608,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { first_next_pointer->load(std::memory_order_relaxed))); auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader)); SkipListBucketHeader* new_skip_list_header = new (mem) - SkipListBucketHeader(compare_, allocator_, header->num_entries + 1); + SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1); auto& skip_list = new_skip_list_header->skip_list; // Add all current entries to the skip list @@ -616,7 +629,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { // Advance counter unless the bucket needs to be advanced to skip list. // In that case, we need to make sure the previous count never exceeds // threshold_use_skiplist_ to avoid readers to cast to wrong format. - header->num_entries++; + header->IncNumEntries(); Node* cur = first; Node* prev = nullptr; From e84299c769f57c62aa6c9bf353802f31347b3ac8 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 28 Jan 2015 14:28:49 -0800 Subject: [PATCH 767/829] Fix bug recently introduced in MemFile::Lock() Summary: This bug fails DBTest.CheckLock Test Plan: DBTest.CheckLock now passes with MEM_ENV=1. Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32451 --- util/mock_env.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/mock_env.cc b/util/mock_env.cc index c1b74a3d3..856d73d92 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -43,7 +43,7 @@ class MemFile { if (locked_) { return false; } else { - refs_ = true; + locked_ = true; return true; } } From 4bdf38b16e7d4cc044740ad3ff2ce7093b7262a8 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 28 Jan 2015 15:31:48 -0800 Subject: [PATCH 768/829] Disable FlushSchedule when running TSAN Summary: There's a bug in TSAN (or libstdc++?) with std::shared_ptr<> for some reason. In db_test, only FlushSchedule is affected. See more: https://groups.google.com/forum/#!topic/thread-sanitizer/vz_s-t226Vg With this change and all other @sdong's and mine diffs, our db_test should be TSAN-clean. I'll move to other tests. 
Test Plan: no more flush schedule when running TSAN Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: sdong, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32469 --- Makefile | 4 ++-- db/db_test.cc | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7d0e7275a..bcd8c77d7 100644 --- a/Makefile +++ b/Makefile @@ -61,8 +61,8 @@ endif ifdef COMPILE_WITH_TSAN DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=thread -pie - PLATFORM_CCFLAGS += -fsanitize=thread -fPIC - PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC + PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN + PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN endif ifndef DISABLE_JEMALLOC diff --git a/db/db_test.cc b/db/db_test.cc index 535b800a9..067f2869d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3036,6 +3036,9 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } +// false positive TSAN report on shared_ptr -- +// https://groups.google.com/forum/#!topic/thread-sanitizer/vz_s-t226Vg +#ifndef ROCKSDB_TSAN_RUN TEST(DBTest, FlushSchedule) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -3073,6 +3076,7 @@ TEST(DBTest, FlushSchedule) { ASSERT_LE(pikachu_tables, static_cast(10)); ASSERT_GT(pikachu_tables, static_cast(0)); } +#endif // enabled only if not TSAN run TEST(DBTest, MinorCompactionsHappen) { do { From d07fec3bdcda95e1c0957fc200c78af694a77082 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 28 Jan 2015 15:30:02 -0800 Subject: [PATCH 769/829] make DBTest.SharedWriteBuffer to pass MockEnv Summary: DBTest.SharedWriteBuffer uses an Options that doesn't pass CurrentOptions(), so that it doesn't use MockEnv. However, DBTest's constructor uses MockEnv to call DestoryDB() to clean up, causing uncleaned state before it runs. Test Plan: Run the test modified to make sure they pass default Env and SharedWriteBuffer now passes MockEnv. 
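Stepping back to the HashLinkList fix above (D32361), the underlying pattern is pointer publication: the single writer fully constructs a node and then stores the head pointer, and readers must load that pointer with acquire semantics before dereferencing it, while a writer-only entry counter can stay relaxed. The sketch below shows the pattern in isolation, with made-up names and no RocksDB code; it assumes a single writer thread, as the memtable insert path does.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

struct Node {
  int value;
  Node* next;
};

static std::atomic<Node*> head{nullptr};
static std::atomic<uint32_t> num_entries{0};

// Single writer: publish the fully constructed node with a release store.
void Insert(int v) {
  Node* n = new Node{v, head.load(std::memory_order_relaxed)};
  head.store(n, std::memory_order_release);
  // Only the writer updates the counter, so a relaxed read-modify-write is
  // enough; readers only need a race-free value, not ordering with the list.
  num_entries.store(num_entries.load(std::memory_order_relaxed) + 1,
                    std::memory_order_relaxed);
}

// Readers: the acquire load pairs with the release store above, making each
// published node's fields visible before they are read.
int SumAll() {
  int sum = 0;
  for (Node* n = head.load(std::memory_order_acquire); n != nullptr;
       n = n->next) {
    sum += n->value;
  }
  return sum;
}

int main() {
  std::thread w([] { for (int i = 0; i < 100; ++i) Insert(i); });
  std::thread r([] { std::printf("partial sum=%d\n", SumAll()); });
  w.join();
  r.join();
  std::printf("entries=%u sum=%d\n",
              static_cast<unsigned>(num_entries.load(std::memory_order_relaxed)),
              SumAll());
  // The sketch leaks the nodes; a real structure would manage their lifetime.
  return 0;
}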
Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32475 --- db/db_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 067f2869d..a8973a102 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3995,7 +3995,7 @@ TEST(DBTest, UniversalCompactionCompressRatio2) { } TEST(DBTest, FailMoreDbPaths) { - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 10000000); options.db_paths.emplace_back(dbname_ + "_2", 1000000); options.db_paths.emplace_back(dbname_ + "_3", 1000000); @@ -4099,7 +4099,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { } TEST(DBTest, LevelCompactionThirdPath) { - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 500 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); @@ -4212,7 +4212,7 @@ TEST(DBTest, LevelCompactionThirdPath) { } TEST(DBTest, LevelCompactionPathUse) { - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 500 * 1024); options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); @@ -7154,7 +7154,7 @@ TEST(DBTest, RecoverCheckFileAmount) { } TEST(DBTest, SharedWriteBuffer) { - Options options; + Options options = CurrentOptions(); options.db_write_buffer_size = 100000; // this is the real limit options.write_buffer_size = 500000; // this is never hit CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); From 173c52a97f464fb3d0101612c6eef8a70c9a4c3e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 29 Jan 2015 13:43:09 -0800 Subject: [PATCH 770/829] Fix build on older compilers -- emplace() is not available --- utilities/backupable/backupable_db.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index f1920f110..0cac257e3 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -391,10 +391,10 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, continue; } assert(backups_.find(backup_id) == backups_.end()); - backups_.emplace(backup_id, - unique_ptr(new BackupMeta( - GetBackupMetaFile(backup_id), - &backuped_file_infos_, backup_env_))); + backups_.insert(std::move( + std::make_pair(backup_id, unique_ptr(new BackupMeta( + GetBackupMetaFile(backup_id), + &backuped_file_infos_, backup_env_))))); } if (options_.destroy_old_data) { // Destory old data @@ -475,10 +475,10 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { BackupID new_backup_id = latest_backup_id_ + 1; assert(backups_.find(new_backup_id) == backups_.end()); - auto ret = backups_.emplace(new_backup_id, - unique_ptr(new BackupMeta( - GetBackupMetaFile(new_backup_id), - &backuped_file_infos_, backup_env_))); + auto ret = backups_.insert(std::move( + std::make_pair(new_backup_id, unique_ptr(new BackupMeta( + GetBackupMetaFile(new_backup_id), + &backuped_file_infos_, backup_env_))))); assert(ret.second == true); auto& new_backup = ret.first->second; new_backup->RecordTimestamp(); @@ -1123,7 +1123,7 @@ Status BackupEngineImpl::BackupMeta::AddFile( std::shared_ptr file_info) { auto itr = file_infos_->find(file_info->filename); if (itr == file_infos_->end()) { - auto ret = 
file_infos_->emplace(file_info->filename, file_info); + auto ret = file_infos_->insert({file_info->filename, file_info}); if (ret.second) { itr = ret.first; itr->second->refs = 1; From 2fd8f750ab05bd100b627f1e043603d1069246ed Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 29 Jan 2015 16:33:11 -0800 Subject: [PATCH 771/829] Compile MemEnv with standard RocksDB library Summary: This was a feature request by osquery. See task t5617758 Test Plan: compiles and memenv_test runs Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32115 --- HISTORY.md | 1 + Makefile | 12 +++--------- build_tools/build_detect_platform | 3 +-- build_tools/unity | 2 +- {helpers/memenv => util}/memenv.cc | 8 ++++++++ {helpers/memenv => util}/memenv_test.cc | 0 6 files changed, 14 insertions(+), 12 deletions(-) rename {helpers/memenv => util}/memenv.cc (98%) rename {helpers/memenv => util}/memenv_test.cc (100%) diff --git a/HISTORY.md b/HISTORY.md index bce856f4f..22b0c05ab 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB. * Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. * GetThreadStatus() is now able to report compaction activity. +* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). ### Public API changes * Deprecated skip_log_error_on_recovery option diff --git a/Makefile b/Makefile index bcd8c77d7..36814780b 100644 --- a/Makefile +++ b/Makefile @@ -99,7 +99,6 @@ CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverl LDFLAGS += $(PLATFORM_LDFLAGS) LIBOBJECTS = $(SOURCES:.cc=.o) -MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o) TESTUTIL = ./util/testutil.o @@ -196,7 +195,6 @@ ifeq ($(LIBNAME),) LIBNAME=librocksdb endif LIBRARY = ${LIBNAME}.a -MEMENVLIBRARY = libmemenv.a ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) @@ -311,7 +309,7 @@ unity: unity.cc unity.o $(CXX) unity.o $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) clean: - -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) make_config.mk unity.cc + -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) make_config.mk unity.cc -rm -rf ios-x86/* ios-arm/* -find . -name "*.[oda]" -exec rm {} \; -find . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; @@ -552,12 +550,8 @@ options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARN sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -$(MEMENVLIBRARY) : $(MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(MEMENVOBJECTS) - -memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 58451ace2..b314b3acb 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -161,7 +161,7 @@ if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then "$PWD/build_tools/build_detect_version" fi -# We want to make a list of all cc files within util, db, table, and helpers +# We want to make a list of all cc files within util, db and table # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. @@ -179,7 +179,6 @@ set +f # re-enable globbing # file. echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT" echo "MOCK_SOURCES=$MOCK_SOURCES" >> "$OUTPUT" -echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Cross-compiling; do not try any compilation tests. diff --git a/build_tools/unity b/build_tools/unity index 477b8f7fb..8138de542 100755 --- a/build_tools/unity +++ b/build_tools/unity @@ -54,7 +54,7 @@ case "$TARGET_OS" in exit 1 esac -# We want to make a list of all cc files within util, db, table, and helpers +# We want to make a list of all cc files within util, db and table # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. diff --git a/helpers/memenv/memenv.cc b/util/memenv.cc similarity index 98% rename from helpers/memenv/memenv.cc rename to util/memenv.cc index d13fa55eb..e2db2e140 100644 --- a/helpers/memenv/memenv.cc +++ b/util/memenv.cc @@ -13,6 +13,8 @@ namespace rocksdb { +#ifndef ROCKSDB_LITE + namespace { std::string NormalizeFileName(const std::string fname) { @@ -420,4 +422,10 @@ Env* NewMemEnv(Env* base_env) { return new InMemoryEnv(base_env); } +#else // ROCKSDB_LITE + +Env* NewMemEnv(Env* base_env) { return nullptr; } + +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/helpers/memenv/memenv_test.cc b/util/memenv_test.cc similarity index 100% rename from helpers/memenv/memenv_test.cc rename to util/memenv_test.cc From 6c6037f60cfc1b4c2bc59391eddefaf3c766b8b7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 29 Jan 2015 18:22:43 -0800 Subject: [PATCH 772/829] Expose Snapshot's SequenceNumber Summary: Requested here: https://www.facebook.com/groups/rocksdb.dev/permalink/705524519546065/ It might also help with mongo. 
I don't see a reason why we shouldn't expose this info. Test Plan: make check Reviewers: sdong, yhchiang, rven Reviewed By: rven Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32547 --- db/db_test.cc | 6 ++++++ db/snapshot.h | 2 ++ include/rocksdb/db.h | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index a8973a102..5941dc2d2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -7751,6 +7751,12 @@ class ModelDB: public DB { class ModelSnapshot : public Snapshot { public: KVMap map_; + + virtual SequenceNumber GetSequenceNumber() const { + // no need to call this + assert(false); + return 0; + } }; explicit ModelDB(const Options& options) : options_(options) {} diff --git a/db/snapshot.h b/db/snapshot.h index 45c66eabc..de9897f24 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -20,6 +20,8 @@ class SnapshotImpl : public Snapshot { public: SequenceNumber number_; // const after creation + virtual SequenceNumber GetSequenceNumber() const { return number_; } + private: friend class SnapshotList; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index a519db7f6..a4141f38b 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -64,6 +64,10 @@ static const int kMinorVersion = __ROCKSDB_MINOR__; // A Snapshot is an immutable object and can therefore be safely // accessed from multiple threads without any external synchronization. class Snapshot { + public: + // returns Snapshot's sequence number + virtual SequenceNumber GetSequenceNumber() const = 0; + protected: virtual ~Snapshot(); }; From 5917de0bae64c104cd3311eb8fe8dd1bc812dd4f Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 21 Jan 2015 11:09:56 -0800 Subject: [PATCH 773/829] CappedFixTransform: return fixed length prefix, or full key if key is shorter than the fixed length Summary: Add CappedFixTransform, which is the same as fixed length prefix extractor, except that when slice is shorter than the fixed length, it will use the full key. Test Plan: Add a test case for db_test options_test and a new test Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: MarkCallaghan, leveldb, dhruba, yoshinorim Differential Revision: https://reviews.facebook.net/D31887 --- HISTORY.md | 1 + Makefile | 5 + db/db_test.cc | 59 +++++++----- include/rocksdb/slice_transform.h | 27 ++++++ util/options_helper.cc | 27 ++++-- util/options_test.cc | 11 ++- util/slice.cc | 40 ++++++++ util/slice_transform_test.cc | 150 ++++++++++++++++++++++++++++++ 8 files changed, 286 insertions(+), 34 deletions(-) create mode 100644 util/slice_transform_test.cc diff --git a/HISTORY.md b/HISTORY.md index 22b0c05ab..c688585e5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ * Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. * GetThreadStatus() is now able to report compaction activity. * MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). +* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash. 
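A minimal usage sketch for the MemEnv entry listed above (not part of the patch; it assumes NewMemEnv() is declared in rocksdb/env.h, as in later releases):

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/env.h"

    int main() {
      // Every file the DB creates is kept in memory; nothing touches disk.
      std::unique_ptr<rocksdb::Env> mem_env(
          rocksdb::NewMemEnv(rocksdb::Env::Default()));

      rocksdb::Options options;
      options.create_if_missing = true;
      options.env = mem_env.get();

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/fake/path", &db);
      if (s.ok()) {
        s = db->Put(rocksdb::WriteOptions(), "key", "value");
        delete db;  // close the DB before the Env is destroyed
      }
      return s.ok() ? 0 : 1;
    }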
### Public API changes * Deprecated skip_log_error_on_recovery option diff --git a/Makefile b/Makefile index 36814780b..1c0dea975 100644 --- a/Makefile +++ b/Makefile @@ -128,6 +128,7 @@ TESTS = \ coding_test \ corruption_test \ crc32c_test \ + slice_transform_test \ dbformat_test \ env_test \ fault_injection_test \ @@ -403,6 +404,10 @@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + + db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/db_test.cc b/db/db_test.cc index 5941dc2d2..9014c9c86 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -389,28 +389,29 @@ class DBTest { kBlockBasedTableWithPrefixHashIndex = 1, kBlockBasedTableWithWholeKeyHashIndex = 2, kPlainTableFirstBytePrefix = 3, - kPlainTableAllBytesPrefix = 4, - kVectorRep = 5, - kHashLinkList = 6, - kHashCuckoo = 7, - kMergePut = 8, - kFilter = 9, - kFullFilter = 10, - kUncompressed = 11, - kNumLevel_3 = 12, - kDBLogDir = 13, - kWalDirAndMmapReads = 14, - kManifestFileSize = 15, - kCompactOnFlush = 16, - kPerfOptions = 17, - kDeletesFilterFirst = 18, - kHashSkipList = 19, - kUniversalCompaction = 20, - kCompressedBlockCache = 21, - kInfiniteMaxOpenFiles = 22, - kxxHashChecksum = 23, - kFIFOCompaction = 24, - kEnd = 25 + kPlainTableCappedPrefix = 4, + kPlainTableAllBytesPrefix = 5, + kVectorRep = 6, + kHashLinkList = 7, + kHashCuckoo = 8, + kMergePut = 9, + kFilter = 10, + kFullFilter = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kCompactOnFlush = 17, + kPerfOptions = 18, + kDeletesFilterFirst = 19, + kHashSkipList = 20, + kUniversalCompaction = 21, + kCompressedBlockCache = 22, + kInfiniteMaxOpenFiles = 23, + kxxHashChecksum = 24, + kFIFOCompaction = 25, + kEnd = 26 }; int option_config_; @@ -483,9 +484,10 @@ class DBTest { option_config_ == kHashSkipList)) {; continue; } - if ((skip_mask & kSkipPlainTable) - && (option_config_ == kPlainTableAllBytesPrefix - || option_config_ == kPlainTableFirstBytePrefix)) { + if ((skip_mask & kSkipPlainTable) && + (option_config_ == kPlainTableAllBytesPrefix || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kPlainTableCappedPrefix)) { continue; } if ((skip_mask & kSkipHashIndex) && @@ -577,6 +579,13 @@ class DBTest { options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; + case kPlainTableCappedPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; case kPlainTableAllBytesPrefix: options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index a78455001..c51dd8cb8 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -36,10 +36,37 @@ class SliceTransform { // determine whether dst=Transform(src) for 
some src virtual bool InRange(const Slice& dst) const = 0; + + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. + // + // This function is not used by RocksDB, but for users. If users pass + // Options by string to RocksDB, they might not know what prefix extractor + // they are using. This function is to help users can determine: + // if they want to iterate all keys prefixing `prefix`, whetherit is + // safe to use prefix bloom filter and seek to key `prefix`. + // Only returning false indicates it is correct to do that. + // + // Here is an example: Suppose we implement a slice transform that returns + // the first part of the string after spliting it using deimiter ",": + // 1. SameResultWhenAppended("abc,") should return true. If aplying prefix + // bloom filter using it, all slices matching "abc:.*" will be extracted + // to "abc,", so any SST file or memtable containing any of those key + // will not be filtered out. + // 2. SameResultWhenAppended("abc") should return false. A user will not + // guaranteed to see all the keys matching "abc.*" if a user seek to "abc" + // against a DB with the same setting. If one SST file only contains + // "abcd,e", the file can be filtered out and the key will be invisible. + // + // i.e., an implementation always returning false is safe. + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); + extern const SliceTransform* NewNoopTransform(); } diff --git a/util/options_helper.cc b/util/options_helper.cc index efc028497..d720a91e6 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -510,14 +510,27 @@ Status GetColumnFamilyOptionsFromMap( } else if (o.first == "inplace_update_support") { new_options->inplace_update_support = ParseBoolean(o.first, o.second); } else if (o.first == "prefix_extractor") { - const std::string kName = "fixed:"; - if (o.second.compare(0, kName.size(), kName) != 0) { - return Status::InvalidArgument("Invalid Prefix Extractor type: " - + o.second); + const std::string kFixedPrefixName = "fixed:"; + const std::string kCappedPrefixName = "capped:"; + auto& pe_value = o.second; + if (pe_value.size() > kFixedPrefixName.size() && + pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == + 0) { + int prefix_length = + ParseInt(trim(o.second.substr(kFixedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewFixedPrefixTransform(prefix_length)); + } else if (pe_value.size() > kCappedPrefixName.size() && + pe_value.compare(0, kCappedPrefixName.size(), + kCappedPrefixName) == 0) { + int prefix_length = + ParseInt(trim(pe_value.substr(kCappedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewCappedPrefixTransform(prefix_length)); + } else { + return Status::InvalidArgument("Invalid Prefix Extractor type: " + + pe_value); } - int prefix_length = ParseInt(trim(o.second.substr(kName.size()))); - new_options->prefix_extractor.reset( - NewFixedPrefixTransform(prefix_length)); } else { return Status::InvalidArgument("Unrecognized option: " + o.first); } diff --git a/util/options_test.cc b/util/options_test.cc index cd26b0211..5ddfac27c 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -330,10 +330,17 @@ TEST(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL); // 
Units (g) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=18g;arena_block_size=19G", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18g;prefix_extractor=capped:8;" + "arena_block_size=19G", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL); ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + std::string prefix_name(new_cf_opt.prefix_extractor->Name()); + ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8"); + // Units (t) ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); diff --git a/util/slice.cc b/util/slice.cc index cd197ced5..734ea974b 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -39,6 +39,38 @@ class FixedPrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return (dst.size() == prefix_len_); } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return InDomain(prefix); + } +}; + +class CappedPrefixTransform : public SliceTransform { + private: + size_t cap_len_; + std::string name_; + + public: + explicit CappedPrefixTransform(size_t cap_len) + : cap_len_(cap_len), + name_("rocksdb.CappedPrefix." + ToString(cap_len_)) {} + + virtual const char* Name() const { return name_.c_str(); } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + virtual bool InDomain(const Slice& src) const { return true; } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= cap_len_); + } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return prefix.size() >= cap_len_; + } }; class NoopTransform : public SliceTransform { @@ -60,6 +92,10 @@ class NoopTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return true; } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; } @@ -68,6 +104,10 @@ const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { return new FixedPrefixTransform(prefix_len); } +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + const SliceTransform* NewNoopTransform() { return new NoopTransform; } diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc new file mode 100644 index 000000000..9f0e34b15 --- /dev/null +++ b/util/slice_transform_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
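A compact illustration (not part of the patch) of the capped prefix extractor and the SameResultWhenAppended() contract documented in slice_transform.h above; the test file that follows exercises the same behaviour in more detail:

    #include <cassert>
    #include <memory>
    #include "rocksdb/slice_transform.h"

    int main() {
      std::unique_ptr<const rocksdb::SliceTransform> capped(
          rocksdb::NewCappedPrefixTransform(8));

      // Keys of at least 8 bytes are cut down to their first 8 bytes...
      assert(capped->Transform("abcdefgh-rest").ToString() == "abcdefgh");
      // ...while shorter keys are used in full.
      assert(capped->Transform("abc").ToString() == "abc");

      // Seeking to a prefix via the prefix bloom filter is only safe when
      // appending more bytes cannot change the extracted prefix.
      assert(capped->SameResultWhenAppended("abcdefgh"));  // length >= cap
      assert(!capped->SameResultWhenAppended("abc"));      // shorter: unsafe
      return 0;
    }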
+ +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { + +class SliceTransformTest {}; + +TEST(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + unique_ptr transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::TmpDir() + "/slice_transform_db_test"; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + unique_ptr iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } From db9ed5fdb44a78e955c56b0483c50caa0c4dd22e Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 30 Jan 2015 16:07:35 -0800 Subject: [PATCH 774/829] Unaddressed comment in previous diff. Change only in code comments. --- include/rocksdb/slice_transform.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index c51dd8cb8..3694c5802 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -44,7 +44,9 @@ class SliceTransform { // they are using. This function is to help users can determine: // if they want to iterate all keys prefixing `prefix`, whetherit is // safe to use prefix bloom filter and seek to key `prefix`. - // Only returning false indicates it is correct to do that. + // If this function returns true, this means a user can Seek() to a prefix + // using the bloom filter. Otherwise, user needs to skip the bloom filter + // by setting ReadOptions.total_order_seek = true. // // Here is an example: Suppose we implement a slice transform that returns // the first part of the string after spliting it using deimiter ",": From 86e2a1eeea07917923318c5d7ed95b6aafe652bb Mon Sep 17 00:00:00 2001 From: Marko Kevac Date: Sat, 31 Jan 2015 15:47:49 +0300 Subject: [PATCH 775/829] Allow creating backups from C --- db/c.cc | 30 ++++++++++++++++++++++++++++++ examples/c_simple_example.c | 9 +++++++++ include/rocksdb/c.h | 13 +++++++++++++ 3 files changed, 52 insertions(+) diff --git a/db/c.cc b/db/c.cc index 76a949cd1..c952b2e8b 100644 --- a/db/c.cc +++ b/db/c.cc @@ -29,6 +29,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/backupable_db.h" using rocksdb::Cache; using rocksdb::ColumnFamilyDescriptor; @@ -69,12 +70,15 @@ using rocksdb::WritableFile; using rocksdb::WriteBatch; using rocksdb::WriteOptions; using rocksdb::LiveFileMetaData; +using rocksdb::BackupEngine; +using rocksdb::BackupableDBOptions; using std::shared_ptr; extern "C" { struct rocksdb_t { DB* rep; }; +struct rocksdb_backup_engine_t { BackupEngine* rep; }; struct rocksdb_iterator_t { Iterator* rep; }; struct rocksdb_writebatch_t { WriteBatch rep; }; struct rocksdb_snapshot_t { const Snapshot* rep; }; @@ -527,6 +531,32 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const char* path, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(Env::Default(), BackupableDBOptions(path), &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + +void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t *be, + rocksdb_t *db, + char** errptr) { + SaveError(errptr, be->rep->CreateNewBackup(db->rep)); +} + +void rocksdb_backup_engine_close( + rocksdb_backup_engine_t *be) { + delete be->rep; + delete be; +} + + void rocksdb_close(rocksdb_t* db) { delete db->rep; delete db; diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index 1dd380721..8340026db 100644 --- 
a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -8,9 +8,11 @@ #include // sysconf() - get CPU count const char DBPath[] = "/tmp/rocksdb_simple_example"; +const char DBBackupPath[] = "/tmp/rocksdb_simple_example_backup"; int main(int argc, char **argv) { rocksdb_t *db; + rocksdb_backup_engine_t *be; rocksdb_options_t *options = rocksdb_options_create(); // Optimize RocksDB. This is the easiest way to // get RocksDB to perform well @@ -25,6 +27,9 @@ int main(int argc, char **argv) { db = rocksdb_open(options, DBPath, &err); assert(!err); + be = rocksdb_backup_engine_open(DBBackupPath, &err); + assert(!err); + // Put key-value rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); const char key[] = "key"; @@ -41,10 +46,14 @@ int main(int argc, char **argv) { assert(strcmp(returned_value, "value") == 0); free(returned_value); + rocksdb_backup_engine_create_new_backup(be, db, &err); + assert(!err); + // cleanup rocksdb_writeoptions_destroy(writeoptions); rocksdb_readoptions_destroy(readoptions); rocksdb_options_destroy(options); + rocksdb_backup_engine_close(be); rocksdb_close(db); return 0; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c686c90c7..9d36f8277 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -55,6 +55,7 @@ extern "C" { /* Exported types */ typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t @@ -104,6 +105,18 @@ extern rocksdb_t* rocksdb_open_for_read_only( unsigned char error_if_log_file_exist, char** errptr); +extern rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const char* path, + char** errptr); + +extern void rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t *be, + rocksdb_t* db, + char** errptr); + +extern void rocksdb_backup_engine_close( + rocksdb_backup_engine_t *be); + extern rocksdb_t* rocksdb_open_column_families( const rocksdb_options_t* options, const char* name, From 939bb36597d9fc812888c3dae33fe32fd5f1c428 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 31 Jan 2015 14:43:21 +0100 Subject: [PATCH 776/829] [RocksJava] Fix ColumnFamily name alloc in TTL DB While fixing the RocksDB ColumnFamily name the TTL DB wasn`t touched. This commit resolves this. --- java/rocksjni/ttl.cc | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 4164a0c4b..2992e930d 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -57,6 +57,8 @@ jobject const char* db_path = env->GetStringUTFChars(jdb_path, 0); std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. 
+ std::vector c_cfnames_to_free; std::vector jcfnames_for_free; std::vector column_families; @@ -85,12 +87,17 @@ jobject rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + // free allocated cfnames after call to open cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); jcfnames_for_free.push_back(byteArray); column_families.push_back(rocksdb::ColumnFamilyDescriptor( - reinterpret_cast(cfname), - *cfOptions)); + c_cfname, *cfOptions)); } // get iterator for TTL values iteratorObj = env->CallObjectMethod( @@ -115,6 +122,8 @@ jobject i != cfnames_to_free.size(); i++) { // free cfnames env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete[] c_cfnames_to_free[i]; } // check if open operation was successful @@ -167,9 +176,15 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl( - *cfOptions, reinterpret_cast(cfname), &handle, jttl); + *cfOptions, c_cfname, &handle, jttl); env->ReleaseByteArrayElements(byteArray, cfname, 0); + delete[] c_cfname; if (s.ok()) { return reinterpret_cast(handle); From 9a456fba2088f2bd66a5b319c630300ea6da2aba Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 25 Jan 2015 22:05:29 +0100 Subject: [PATCH 777/829] [RocksJava] GetUpdatesSince support --- java/Makefile | 2 + java/org/rocksdb/RocksDB.java | 25 ++++ java/org/rocksdb/TransactionLogIterator.java | 116 ++++++++++++++++++ java/org/rocksdb/WriteBatch.java | 6 + .../test/TransactionLogIteratorTest.java | 79 ++++++++++++ java/rocksjni/rocksjni.cc | 24 ++++ java/rocksjni/transaction_log.cc | 79 ++++++++++++ 7 files changed, 331 insertions(+) create mode 100644 java/org/rocksdb/TransactionLogIterator.java create mode 100644 java/org/rocksdb/test/TransactionLogIteratorTest.java create mode 100644 java/rocksjni/transaction_log.cc diff --git a/java/Makefile b/java/Makefile index 42f465e10..97f0b0244 100644 --- a/java/Makefile +++ b/java/Makefile @@ -29,6 +29,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.SkipListMemTableConfig\ org.rocksdb.Slice\ org.rocksdb.Statistics\ + org.rocksdb.TransactionLogIterator\ org.rocksdb.TtlDB\ org.rocksdb.VectorMemTableConfig\ org.rocksdb.StringAppendOperator\ @@ -81,6 +82,7 @@ JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ org.rocksdb.test.SizeUnitTest\ org.rocksdb.test.SliceTest\ org.rocksdb.test.SnapshotTest\ + org.rocksdb.test.TransactionLogIteratorTest\ org.rocksdb.test.TtlDBTest\ org.rocksdb.test.StatisticsCollectorTest\ org.rocksdb.test.WriteBatchHandlerTest\ diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index ac02860e8..96032165e 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1588,6 +1588,29 @@ public class RocksDB extends RocksObject { columnFamilyHandle.nativeHandle_); } + /** + *
<p>Returns an iterator that is positioned at a write-batch containing + * seq_number. If the sequence number is non-existent, it returns an iterator + * at the first available seq_no after the requested seq_no.</p> + * + * <p>Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + * use this api, else the WAL files will get + * cleared aggressively and the iterator might keep getting invalid before + * an update is read.</p>
                      + * + * @param sequenceNumber sequence number offset + * + * @return {@link org.rocksdb.TransactionLogIterator} instance. + * + * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved + * from native-side. + */ + public TransactionLogIterator getUpdatesSince(long sequenceNumber) + throws RocksDBException { + return new TransactionLogIterator( + getUpdatesSince(nativeHandle_, sequenceNumber)); + } + /** * Private constructor. */ @@ -1730,6 +1753,8 @@ public class RocksDB extends RocksObject { private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; + private native long getUpdatesSince(long handle, long sequenceNumber) + throws RocksDBException; protected DBOptionsInterface options_; } diff --git a/java/org/rocksdb/TransactionLogIterator.java b/java/org/rocksdb/TransactionLogIterator.java new file mode 100644 index 000000000..8de61aa00 --- /dev/null +++ b/java/org/rocksdb/TransactionLogIterator.java @@ -0,0 +1,116 @@ +package org.rocksdb; + +/** + *
<p>A TransactionLogIterator is used to iterate over the transactions in a db. + * One run of the iterator is continuous, i.e. the iterator will stop at the + * beginning of any gap in sequences.</p> + */ +public class TransactionLogIterator extends RocksObject { + + /** + *
<p>An iterator is either positioned at a WriteBatch + * or not valid. This method returns true if the iterator + * is valid. Can read data from a valid iterator.</p> + * + * @return true if iterator position is valid. + */ + public boolean isValid() { + return isValid(nativeHandle_); + } + + /** + *
<p>Moves the iterator to the next WriteBatch. + * REQUIRES: Valid() to be true.</p> + */ + public void next() { + assert(isValid()); + next(nativeHandle_); + } + + /** + *
<p>Throws RocksDBException if something went wrong.</p> + * + * @throws org.rocksdb.RocksDBException if something went + * wrong in the underlying C++ code. + */ + public void status() throws RocksDBException { + status(nativeHandle_); + } + + /** + *
<p>If iterator position is valid, return the current + * write_batch and the sequence number of the earliest + * transaction contained in the batch.</p> + * + *
<p>ONLY use if Valid() is true and status() is OK.</p> + * + * @return {@link org.rocksdb.TransactionLogIterator.BatchResult} + * instance. + */ + public BatchResult getBatch() { + assert(isValid()); + return getBatch(nativeHandle_); + } + + /** + *
<p>TransactionLogIterator constructor.</p> + * + * @param nativeHandle address of the native instance. + */ + TransactionLogIterator(long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + } + + @Override protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + /** + *
<p>BatchResult represents a data structure returned + * by a TransactionLogIterator containing a sequence + * number and a {@link WriteBatch} instance.</p> + */ + public class BatchResult { + /** + *
<p>Constructor of BatchResult class.</p> + * + * @param sequenceNumber related to this BatchResult instance. + * @param nativeHandle to {@link org.rocksdb.WriteBatch} + * native instance. + */ + public BatchResult(long sequenceNumber, long nativeHandle) { + sequenceNumber_ = sequenceNumber; + writeBatch_ = new WriteBatch(nativeHandle); + } + + /** + *
<p>Return sequence number related to this BatchResult.</p> + * + * @return Sequence number. + */ + public long sequenceNumber() { + return sequenceNumber_; + } + + /** + *
<p>Return contained {@link org.rocksdb.WriteBatch} + * instance.</p>
                      + * + * @return {@link org.rocksdb.WriteBatch} instance. + */ + public WriteBatch writeBatch() { + return writeBatch_; + } + + private final long sequenceNumber_; + private final WriteBatch writeBatch_; + } + + private native void disposeInternal(long handle); + private native boolean isValid(long handle); + private native void next(long handle); + private native void status(long handle) + throws RocksDBException; + private native BatchResult getBatch(long handle); +} diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index 24133ec39..fd8b894cb 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -53,6 +53,12 @@ public class WriteBatch extends AbstractWriteBatch { iterate(handler.nativeHandle_); } + WriteBatch(long nativeHandle) { + super(); + disOwnNativeHandle(); + nativeHandle_ = nativeHandle; + } + @Override final native void disposeInternal(long handle); @Override final native int count0(); @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); diff --git a/java/org/rocksdb/test/TransactionLogIteratorTest.java b/java/org/rocksdb/test/TransactionLogIteratorTest.java new file mode 100644 index 000000000..2069e1200 --- /dev/null +++ b/java/org/rocksdb/test/TransactionLogIteratorTest.java @@ -0,0 +1,79 @@ +package org.rocksdb.test; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.*; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionLogIteratorTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void transactionLogIterator() throws RocksDBException { + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + transactionLogIterator = db.getUpdatesSince(0); + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void getBatch() throws RocksDBException { + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). 
+ setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + for (int i = 0; i < 250; i++){ + db.put(String.valueOf(i).getBytes(), + String.valueOf(i).getBytes()); + } + db.flush(new FlushOptions().setWaitForFlush(true)); + transactionLogIterator = db.getUpdatesSince(0); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.status(); + + TransactionLogIterator.BatchResult batchResult = + transactionLogIterator.getBatch(); + assertThat(batchResult.sequenceNumber()).isEqualTo(1); + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 1055f87fe..9f5b9446e 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,7 @@ #include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/cache.h" +#include "rocksdb/types.h" ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Open @@ -1598,3 +1600,25 @@ void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); } + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::GetUpdatesSince + +/* + * Class: org_rocksdb_RocksDB + * Method: getUpdatesSince + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, + jobject jdb, jlong jdb_handle, jlong jsequence_number) { + auto db = reinterpret_cast(jdb_handle); + rocksdb::SequenceNumber sequence_number = + static_cast(jsequence_number); + std::unique_ptr iter; + rocksdb::Status s = db->GetUpdatesSince(sequence_number, &iter); + if (s.ok()) { + return reinterpret_cast(iter.release()); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; +} diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc new file mode 100644 index 000000000..28e387fe1 --- /dev/null +++ b/java/rocksjni/transaction_log.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. 
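For orientation, a sketch of the C++ API this new JNI bridge wraps (not part of the patch); it assumes an already-open rocksdb::DB* whose WAL retention (WAL_ttl_seconds / WAL_size_limit_MB) keeps the log files around long enough to scan:

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    void DumpUpdatesSince(rocksdb::DB* db, rocksdb::SequenceNumber since) {
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      rocksdb::Status s = db->GetUpdatesSince(since, &iter);
      if (!s.ok()) {
        return;  // e.g. the requested sequence number was already purged
      }
      while (iter->Valid()) {
        rocksdb::BatchResult batch = iter->GetBatch();
        // batch.sequence is the first sequence number in this write batch;
        // batch.writeBatchPtr owns the replayed WriteBatch.
        (void)batch.sequence;
        iter->Next();
      }
    }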
+ +#include +#include +#include + +#include "include/org_rocksdb_TransactionLogIterator.h" +#include "rocksdb/transaction_log.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto* it = reinterpret_cast(handle); + delete it; +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: isValid + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_TransactionLogIterator_isValid( + JNIEnv* env, jobject jobj, jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: next + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_next( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: status + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_status( + JNIEnv* env, jobject jobj, jlong handle) { + rocksdb::Status s = reinterpret_cast< + rocksdb::TransactionLogIterator*>(handle)->status(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: getBatch + * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult + */ +jobject Java_org_rocksdb_TransactionLogIterator_getBatch( + JNIEnv* env, jobject jobj, jlong handle) { + rocksdb::BatchResult batch_result = + reinterpret_cast(handle)->GetBatch(); + jclass jclazz = env->FindClass( + "org/rocksdb/TransactionLogIterator$BatchResult"); + assert(jclazz != nullptr); + jmethodID mid = env->GetMethodID( + jclazz, "", "(Lorg/rocksdb/TransactionLogIterator;JJ)V"); + assert(mid != nullptr); + return env->NewObject(jclazz, mid, jobj, + batch_result.sequence, batch_result.writeBatchPtr.release()); +} From b39006e3db598cd6f95e34c05bb19e6fc04bb24c Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 25 Jan 2015 22:46:11 +0100 Subject: [PATCH 778/829] [RocksJava] enable/disable File deletions --- java/org/rocksdb/RocksDB.java | 52 +++++++++++++++++++ java/org/rocksdb/test/RocksDBTest.java | 21 ++++++++ .../test/TransactionLogIteratorTest.java | 3 ++ java/rocksjni/rocksjni.cc | 45 ++++++++++++++++ 4 files changed, 121 insertions(+) diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java index 96032165e..ea3824196 100644 --- a/java/org/rocksdb/RocksDB.java +++ b/java/org/rocksdb/RocksDB.java @@ -1588,6 +1588,53 @@ public class RocksDB extends RocksObject { columnFamilyHandle.nativeHandle_); } + /** + *
<p>The sequence number of the most recent transaction.</p> + * + * @return sequence number of the most + * recent transaction. + */ + public long getLatestSequenceNumber() { + return getLatestSequenceNumber(nativeHandle_); + } + + /** + *
<p>Prevent file deletions. Compactions will continue to occur, + * but no obsolete files will be deleted. Calling this multiple + * times has the same effect as calling it once.</p> + * + * @throws RocksDBException thrown if operation was not performed + * successfully. + */ + public void disableFileDeletions() throws RocksDBException { + disableFileDeletions(nativeHandle_); + } + + /** + *
<p>Allow compactions to delete obsolete files. + * If force == true, the call to EnableFileDeletions() + * will guarantee that file deletions are enabled after + * the call, even if DisableFileDeletions() was called + * multiple times before.</p> + * + *
<p>If force == false, EnableFileDeletions will only + * enable file deletion after it's been called at least + * as many times as DisableFileDeletions(), enabling + * the two methods to be called by two threads + * concurrently without synchronization + * -- i.e., file deletions will be enabled only after both + * threads call EnableFileDeletions()</p> + * + * @param force boolean value described above. + * + * @throws RocksDBException thrown if operation was not performed + * successfully. + */ + public void enableFileDeletions(boolean force) + throws RocksDBException { + enableFileDeletions(nativeHandle_, force); + } + /** *
                      Returns an iterator that is positioned at a write-batch containing * seq_number. If the sequence number is non existent, it returns an iterator @@ -1753,6 +1800,11 @@ public class RocksDB extends RocksObject { private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; + private native long getLatestSequenceNumber(long handle); + private native void disableFileDeletions(long handle) + throws RocksDBException; + private native void enableFileDeletions(long handle, + boolean force) throws RocksDBException; private native long getUpdatesSince(long handle, long sequenceNumber) throws RocksDBException; diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/org/rocksdb/test/RocksDBTest.java index a6934b310..15dde9856 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/org/rocksdb/test/RocksDBTest.java @@ -738,4 +738,25 @@ public class RocksDBTest { } } } + + @Test + public void enableDisableFileDeletions() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options().setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.disableFileDeletions(); + db.enableFileDeletions(false); + db.disableFileDeletions(); + db.enableFileDeletions(true); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } } diff --git a/java/org/rocksdb/test/TransactionLogIteratorTest.java b/java/org/rocksdb/test/TransactionLogIteratorTest.java index 2069e1200..4e1ee4dfd 100644 --- a/java/org/rocksdb/test/TransactionLogIteratorTest.java +++ b/java/org/rocksdb/test/TransactionLogIteratorTest.java @@ -57,6 +57,9 @@ public class TransactionLogIteratorTest { String.valueOf(i).getBytes()); } db.flush(new FlushOptions().setWaitForFlush(true)); + + assertThat(db.getLatestSequenceNumber()).isEqualTo(250); + transactionLogIterator = db.getUpdatesSince(0); assertThat(transactionLogIterator.isValid()).isTrue(); transactionLogIterator.status(); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 9f5b9446e..148c6c7dc 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1601,6 +1601,51 @@ void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); } +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::GetLatestSequenceNumber + +/* + * Class: org_rocksdb_RocksDB + * Method: getLatestSequenceNumber + * Signature: (J)V + */ +jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env, + jobject jdb, jlong jdb_handle) { + auto db = reinterpret_cast(jdb_handle); + return db->GetLatestSequenceNumber(); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB enable/disable file deletions + +/* + * Class: org_rocksdb_RocksDB + * Method: enableFileDeletions + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, + jobject jdb, jlong jdb_handle) { + auto db = reinterpret_cast(jdb_handle); + rocksdb::Status s = db->DisableFileDeletions(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: enableFileDeletions + * Signature: (JZ)V + */ +void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, + jobject jdb, jlong jdb_handle, 
jboolean jforce) { + auto db = reinterpret_cast(jdb_handle); + rocksdb::Status s = db->EnableFileDeletions(jforce); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::GetUpdatesSince From caedd40ddd189eadc2b2974c77b129117c6a8560 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 25 Jan 2015 22:47:29 +0100 Subject: [PATCH 779/829] [RocksJava] Adjusted auto pointer --- java/rocksjni/rocksjni.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 148c6c7dc..eaa5603ea 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1611,7 +1611,7 @@ void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( */ jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env, jobject jdb, jlong jdb_handle) { - auto db = reinterpret_cast(jdb_handle); + auto* db = reinterpret_cast(jdb_handle); return db->GetLatestSequenceNumber(); } @@ -1625,7 +1625,7 @@ jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env, */ void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jobject jdb, jlong jdb_handle) { - auto db = reinterpret_cast(jdb_handle); + auto* db = reinterpret_cast(jdb_handle); rocksdb::Status s = db->DisableFileDeletions(); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); @@ -1639,7 +1639,7 @@ void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, */ void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jobject jdb, jlong jdb_handle, jboolean jforce) { - auto db = reinterpret_cast(jdb_handle); + auto* db = reinterpret_cast(jdb_handle); rocksdb::Status s = db->EnableFileDeletions(jforce); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); @@ -1656,7 +1656,7 @@ void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, */ jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jsequence_number) { - auto db = reinterpret_cast(jdb_handle); + auto* db = reinterpret_cast(jdb_handle); rocksdb::SequenceNumber sequence_number = static_cast(jsequence_number); std::unique_ptr iter; From 68cd93b8739161a8cc7148256d49039fee9eaf4f Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 25 Jan 2015 22:59:48 +0100 Subject: [PATCH 780/829] [RocksJava] GetUpdatesSince support Summary: This differential describes further changes to the Java-API New methods: * GetUpdatesSince * GetLatestSequenceNumber * EnableFileDeletions * DisableFileDeletions This pull requests depends on: https://github.com/facebook/rocksdb/pull/472 Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom package Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32151 --- java/rocksjni/transaction_log.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc index 28e387fe1..028062879 100644 --- a/java/rocksjni/transaction_log.cc +++ b/java/rocksjni/transaction_log.cc @@ -6,9 +6,9 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::Iterator methods from Java side. 
+#include #include #include -#include #include "include/org_rocksdb_TransactionLogIterator.h" #include "rocksdb/transaction_log.h" From 391f85fc8252339800cd51716ae7eacf42b8b971 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 30 Jan 2015 21:57:40 +0100 Subject: [PATCH 781/829] [RocksJava] Incorporated changes for D32151 --- java/org/rocksdb/TransactionLogIterator.java | 1 - java/org/rocksdb/WriteBatch.java | 7 ++ .../test/TransactionLogIteratorTest.java | 105 +++++++++++++++++- java/rocksjni/transaction_log.cc | 3 +- 4 files changed, 111 insertions(+), 5 deletions(-) diff --git a/java/org/rocksdb/TransactionLogIterator.java b/java/org/rocksdb/TransactionLogIterator.java index 8de61aa00..d82cde3ea 100644 --- a/java/org/rocksdb/TransactionLogIterator.java +++ b/java/org/rocksdb/TransactionLogIterator.java @@ -23,7 +23,6 @@ public class TransactionLogIterator extends RocksObject { * REQUIRES: Valid() to be true.
*/ public void next() { - assert(isValid()); next(nativeHandle_); } diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java index fd8b894cb..fd6d9386c 100644 --- a/java/org/rocksdb/WriteBatch.java +++ b/java/org/rocksdb/WriteBatch.java @@ -53,6 +53,13 @@ public class WriteBatch extends AbstractWriteBatch { iterate(handler.nativeHandle_); } + /** + * <p>Private WriteBatch constructor which is used to construct + * WriteBatch instances from C++ side. As the reference to this + * object is also managed from C++ side the handle will be disowned.</p>
                      + * + * @param nativeHandle address of native instance. + */ WriteBatch(long nativeHandle) { super(); disOwnNativeHandle(); diff --git a/java/org/rocksdb/test/TransactionLogIteratorTest.java b/java/org/rocksdb/test/TransactionLogIteratorTest.java index 4e1ee4dfd..6d700dac9 100644 --- a/java/org/rocksdb/test/TransactionLogIteratorTest.java +++ b/java/org/rocksdb/test/TransactionLogIteratorTest.java @@ -41,8 +41,10 @@ public class TransactionLogIteratorTest { @Test public void getBatch() throws RocksDBException { + final int numberOfPuts = 5; RocksDB db = null; Options options = null; + ColumnFamilyHandle cfHandle = null; TransactionLogIterator transactionLogIterator = null; try { options = new Options(). @@ -52,21 +54,120 @@ public class TransactionLogIteratorTest { db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); - for (int i = 0; i < 250; i++){ + for (int i = 0; i < numberOfPuts; i++){ db.put(String.valueOf(i).getBytes(), String.valueOf(i).getBytes()); } db.flush(new FlushOptions().setWaitForFlush(true)); - assertThat(db.getLatestSequenceNumber()).isEqualTo(250); + // the latest sequence number is 5 because 5 puts + // were written beforehand + assertThat(db.getLatestSequenceNumber()). + isEqualTo(numberOfPuts); + // insert 5 writes into a cf + cfHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes())); + + for (int i = 0; i < numberOfPuts; i++){ + db.put(cfHandle, String.valueOf(i).getBytes(), + String.valueOf(i).getBytes()); + } + // the latest sequence number is 10 because + // (5 + 5) puts were written beforehand + assertThat(db.getLatestSequenceNumber()). + isEqualTo(numberOfPuts + numberOfPuts); + + // Get updates since the beginning transactionLogIterator = db.getUpdatesSince(0); assertThat(transactionLogIterator.isValid()).isTrue(); transactionLogIterator.status(); + // The first sequence number is 1 TransactionLogIterator.BatchResult batchResult = transactionLogIterator.getBatch(); assertThat(batchResult.sequenceNumber()).isEqualTo(1); + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (cfHandle != null) { + cfHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void transactionLogIteratorStallAtLastRecord() throws RocksDBException { + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). 
+ setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + // Get updates since the beginning + transactionLogIterator = db.getUpdatesSince(0); + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + assertThat(transactionLogIterator.isValid()).isFalse(); + transactionLogIterator.status(); + db.put("key2".getBytes(), "value2".getBytes()); + transactionLogIterator.next(); + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void transactionLogIteratorCheckAfterRestart() throws RocksDBException { + final int numberOfKeys = 2; + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). + setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + db.flush(new FlushOptions().setWaitForFlush(true)); + // reopen + db.close(); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys); + + transactionLogIterator = db.getUpdatesSince(0); + for (int i = 0; i < numberOfKeys; i++) { + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + } } finally { if (transactionLogIterator != null) { transactionLogIterator.dispose(); diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc index 028062879..1d3d7c100 100644 --- a/java/rocksjni/transaction_log.cc +++ b/java/rocksjni/transaction_log.cc @@ -21,8 +21,7 @@ */ void Java_org_rocksdb_TransactionLogIterator_disposeInternal( JNIEnv* env, jobject jobj, jlong handle) { - auto* it = reinterpret_cast(handle); - delete it; + delete reinterpret_cast(handle); } /* From cb5c3159f0d60556b54e43a19d3fd3f54fbd9941 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 30 Jan 2015 21:05:45 +0100 Subject: [PATCH 782/829] [RocksJava] Snapshot - GetSequenceNumber Summary: As the C++ part exposes now SequenceNumber retrieval for Snapshots we want this obviously also in the Java API. 
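For readers following along, here is a small usage sketch (not part of this patch) of how the new Snapshot#getSequenceNumber pairs with the getUpdatesSince / TransactionLogIterator API exercised by the tests in this and the previous patch. The class name and database path are made up for illustration; the individual calls are the ones that appear in these patches and their tests, so treat this as a sketch rather than canonical usage.

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.Snapshot;
    import org.rocksdb.TransactionLogIterator;

    public class SequenceNumberExample {
      public static void main(String[] args) throws RocksDBException {
        Options options = null;
        RocksDB db = null;
        TransactionLogIterator iterator = null;
        try {
          // WAL TTL settings as used in the tests above, so the log
          // files stay around long enough to be iterated.
          options = new Options().
              setCreateIfMissing(true).
              setWalTtlSeconds(1000).
              setWalSizeLimitMB(10);
          db = RocksDB.open(options, "/tmp/rocksdb-seqno-example");

          db.put("key1".getBytes(), "value1".getBytes());

          // The snapshot records the sequence number that was current
          // when it was taken (1 after a single put).
          Snapshot snapshot = db.getSnapshot();
          long snapshotSeq = snapshot.getSequenceNumber();

          db.put("key2".getBytes(), "value2".getBytes());
          // The database-wide sequence number has moved on in the meantime.
          System.out.println("snapshot seq: " + snapshotSeq
              + ", latest seq: " + db.getLatestSequenceNumber());

          // Replay write batches from the WAL, starting at the snapshot.
          iterator = db.getUpdatesSince(snapshotSeq);
          while (iterator.isValid()) {
            iterator.status();
            TransactionLogIterator.BatchResult batch = iterator.getBatch();
            System.out.println("batch at sequence " + batch.sequenceNumber());
            iterator.next();
          }
        } finally {
          if (iterator != null) {
            iterator.dispose();
          }
          if (db != null) {
            db.close();
          }
          if (options != null) {
            options.dispose();
          }
        }
      }
    }

With a single put before the snapshot, the snapshot's sequence number would be 1, which matches the assertion added to SnapshotTest below.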
Test Plan: make rocksdbjava make jtest mvn -f rocksjni.pom test Reviewers: yhchiang, adamretter, ankgup87 Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32571 --- java/Makefile | 1 + java/org/rocksdb/Snapshot.java | 13 +++++++++++++ java/org/rocksdb/test/SnapshotTest.java | 2 ++ java/rocksjni/rocksjni.cc | 4 ++-- java/rocksjni/snapshot.cc | 26 +++++++++++++++++++++++++ 5 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 java/rocksjni/snapshot.cc diff --git a/java/Makefile b/java/Makefile index 97f0b0244..8d75ee5e5 100644 --- a/java/Makefile +++ b/java/Makefile @@ -32,6 +32,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.TransactionLogIterator\ org.rocksdb.TtlDB\ org.rocksdb.VectorMemTableConfig\ + org.rocksdb.Snapshot\ org.rocksdb.StringAppendOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ diff --git a/java/org/rocksdb/Snapshot.java b/java/org/rocksdb/Snapshot.java index 5817a8b44..1842dddd3 100644 --- a/java/org/rocksdb/Snapshot.java +++ b/java/org/rocksdb/Snapshot.java @@ -14,6 +14,17 @@ public class Snapshot extends RocksObject { nativeHandle_ = nativeHandle; } + /** + * Return the associated sequence number; + * + * @return the associated sequence number of + * this snapshot. + */ + public long getSequenceNumber() { + assert(isInitialized()); + return getSequenceNumber(nativeHandle_); + } + /** * Dont release C++ Snapshot pointer. The pointer * to the snapshot is released by the database @@ -21,4 +32,6 @@ public class Snapshot extends RocksObject { */ @Override protected void disposeInternal() { } + + private native long getSequenceNumber(long handle); } diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/org/rocksdb/test/SnapshotTest.java index b6dd2a360..4aeef44ef 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/org/rocksdb/test/SnapshotTest.java @@ -35,6 +35,8 @@ public class SnapshotTest { db.put("key".getBytes(), "value".getBytes()); // Get new Snapshot of database Snapshot snapshot = db.getSnapshot(); + assertThat(snapshot.getSequenceNumber()).isGreaterThan(0); + assertThat(snapshot.getSequenceNumber()).isEqualTo(1); readOptions = new ReadOptions(); // set snapshot in ReadOptions readOptions.setSnapshot(snapshot); diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index eaa5603ea..54e449f53 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -6,18 +6,18 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::DB methods from Java side. +#include #include #include -#include #include #include #include #include "include/org_rocksdb_RocksDB.h" -#include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/cache.h" #include "rocksdb/types.h" +#include "rocksjni/portal.h" ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Open diff --git a/java/rocksjni/snapshot.cc b/java/rocksjni/snapshot.cc new file mode 100644 index 000000000..cd10c97c8 --- /dev/null +++ b/java/rocksjni/snapshot.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++. 
+ +#include +#include +#include + +#include "include/org_rocksdb_Snapshot.h" +#include "rocksdb/db.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_Snapshot + * Method: getSequenceNumber + * Signature: (J)J + */ +jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* env, + jobject jobj, jlong jsnapshot_handle) { + auto* snapshot = reinterpret_cast( + jsnapshot_handle); + return snapshot->GetSequenceNumber(); +} From ca52a67cfb403072bec5a17e2737660690278467 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 31 Jan 2015 00:21:45 +0100 Subject: [PATCH 783/829] [RocksJava] Deprecate setSkipLogErrorOnRecovery - see: 62ad0a9b19f0be4cefa70b6b32876e764b7f3c11 --- java/org/rocksdb/DBOptions.java | 2 ++ java/org/rocksdb/DBOptionsInterface.java | 6 ++++++ java/org/rocksdb/Options.java | 2 ++ 3 files changed, 10 insertions(+) diff --git a/java/org/rocksdb/DBOptions.java b/java/org/rocksdb/DBOptions.java index e3614f463..fb8f27bc4 100644 --- a/java/org/rocksdb/DBOptions.java +++ b/java/org/rocksdb/DBOptions.java @@ -464,6 +464,7 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { } @Override + @Deprecated public DBOptions setSkipLogErrorOnRecovery(boolean skip) { assert(isInitialized()); setSkipLogErrorOnRecovery(nativeHandle_, skip); @@ -471,6 +472,7 @@ public class DBOptions extends RocksObject implements DBOptionsInterface { } @Override + @Deprecated public boolean skipLogErrorOnRecovery() { assert(isInitialized()); return skipLogErrorOnRecovery(nativeHandle_); diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/org/rocksdb/DBOptionsInterface.java index 83d7ba1e1..38c0338e4 100644 --- a/java/org/rocksdb/DBOptionsInterface.java +++ b/java/org/rocksdb/DBOptionsInterface.java @@ -707,7 +707,10 @@ public interface DBOptionsInterface { * * @param skip true if log corruption errors are skipped during recovery. * @return the instance of the current Object. + * + * @deprecated will be removed in RocksDB 3.11.0. Not used anymore. */ + @Deprecated Object setSkipLogErrorOnRecovery(boolean skip); /** @@ -716,7 +719,10 @@ public interface DBOptionsInterface { * Default: false * * @return true if log corruption errors are skipped during recovery. + * + * @deprecated will be removed in RocksDB 3.11.0. Not used anymore. */ + @Deprecated boolean skipLogErrorOnRecovery(); /** diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index ac4037508..56385154d 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -551,12 +551,14 @@ public class Options extends RocksObject } @Override + @Deprecated public boolean skipLogErrorOnRecovery() { assert(isInitialized()); return skipLogErrorOnRecovery(nativeHandle_); } @Override + @Deprecated public Options setSkipLogErrorOnRecovery(boolean skip) { assert(isInitialized()); setSkipLogErrorOnRecovery(nativeHandle_, skip); From 4a4e4279f0b17f960c9e504f018ec758cd5b9d0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Maier?= Date: Sat, 31 Jan 2015 16:13:06 +0100 Subject: [PATCH 784/829] Update HISTORY-JAVA.md --- java/HISTORY-JAVA.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/java/HISTORY-JAVA.md b/java/HISTORY-JAVA.md index 7a293fd3f..9bced168e 100644 --- a/java/HISTORY-JAVA.md +++ b/java/HISTORY-JAVA.md @@ -1,5 +1,13 @@ # RocksJava Change Log +## By 01/31/2015 +### New Features +* WriteBatchWithIndex support. +* Iterator support for WriteBatch and WriteBatchWithIndex +* GetUpdatesSince support. +* Snapshots carry now information about the related sequence number. 
+* TTL DB support. + ## By 11/14/2014 ### New Features * Full support for Column Family. From 7479a62a7aaa0565feaff0fe5e9626fae6f2bfc0 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 1 Feb 2015 14:30:41 +0100 Subject: [PATCH 785/829] Release.md - Remove version change instrcution The version change instruction is obsolete with the change that maven pulls versioning information from version.h. --- java/RELEASE.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/java/RELEASE.md b/java/RELEASE.md index 16feae6ee..084460c88 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -41,8 +41,6 @@ Set ~/.m2/settings.xml to contain: -Then update rocksjni.pom's version tag to reflect the release version. - From RocksDB's root directory, first build the Java static JARs: make jclean clean rocksdbjavastaticpublish From 98cb501bc0a5611405d60e79c6b31d61967098a2 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 19:43:09 +0000 Subject: [PATCH 786/829] Moved Java test classes into src/test/java --- .gitignore | 2 + java/Makefile | 107 +++++++------ java/rocksjni/write_batch.cc | 124 --------------- java/rocksjni/write_batch_test.cc | 148 ++++++++++++++++++ .../org/rocksdb}/AbstractComparatorTest.java | 8 +- .../org/rocksdb}/BackupableDBOptionsTest.java | 3 +- .../java/org/rocksdb}/BackupableDBTest.java | 3 +- .../rocksdb}/BlockBasedTableConfigTest.java | 3 +- .../java/org/rocksdb}/CheckPointTest.java | 6 +- .../org/rocksdb}/ColumnFamilyOptionsTest.java | 3 +- .../java/org/rocksdb}/ColumnFamilyTest.java | 3 +- .../org/rocksdb}/ComparatorOptionsTest.java | 3 +- .../java/org/rocksdb}/ComparatorTest.java | 3 +- .../org/rocksdb}/CompressionOptionsTest.java | 3 +- .../test/java/org/rocksdb}/DBOptionsTest.java | 3 +- .../org/rocksdb}/DirectComparatorTest.java | 3 +- .../java/org/rocksdb}/DirectSliceTest.java | 3 +- .../test/java/org/rocksdb}/FilterTest.java | 3 +- .../test/java/org/rocksdb}/FlushTest.java | 3 +- .../java/org/rocksdb}/InfoLogLevelTest.java | 3 +- .../java/org/rocksdb}/KeyMayExistTest.java | 3 +- .../test/java/org/rocksdb}/MemTableTest.java | 3 +- .../test/java/org/rocksdb}/MergeTest.java | 3 +- .../java/org/rocksdb}/MixedOptionsTest.java | 3 +- .../test/java/org/rocksdb}/OptionsTest.java | 3 +- .../org/rocksdb}/PlainTableConfigTest.java | 5 +- .../org/rocksdb}/PlatformRandomHelper.java | 2 +- .../test/java/org/rocksdb}/ReadOnlyTest.java | 3 +- .../java/org/rocksdb}/ReadOptionsTest.java | 3 +- .../test/java/org/rocksdb}/RocksDBTest.java | 3 +- .../test/java/org/rocksdb}/RocksEnvTest.java | 3 +- .../java/org/rocksdb}/RocksIteratorTest.java | 6 +- .../org/rocksdb}/RocksMemoryResource.java | 3 +- .../test/java/org/rocksdb}/SliceTest.java | 3 +- .../test/java/org/rocksdb}/SnapshotTest.java | 3 +- .../org/rocksdb}/StatisticsCollectorTest.java | 3 +- .../java/org/rocksdb}/StatsCallbackMock.java | 4 +- .../rocksdb}/TransactionLogIteratorTest.java | 3 +- .../test/java/org/rocksdb}/TtlDBTest.java | 3 +- .../test/java/org/rocksdb}/Types.java | 2 +- .../org/rocksdb}/WriteBatchHandlerTest.java | 5 +- .../java/org/rocksdb}/WriteBatchTest.java | 21 ++- .../org/rocksdb}/WriteBatchWithIndexTest.java | 10 +- .../java/org/rocksdb}/WriteOptionsTest.java | 3 +- .../org/rocksdb/test/RocksJunitRunner.java | 0 .../org/rocksdb/util}/EnvironmentTest.java | 3 +- .../java/org/rocksdb/util}/SizeUnitTest.java | 3 +- 47 files changed, 263 insertions(+), 283 deletions(-) create mode 100644 java/rocksjni/write_batch_test.cc rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/AbstractComparatorTest.java 
(97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/BackupableDBOptionsTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/BackupableDBTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/BlockBasedTableConfigTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/CheckPointTest.java (94%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ColumnFamilyOptionsTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ColumnFamilyTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ComparatorOptionsTest.java (93%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ComparatorTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/CompressionOptionsTest.java (91%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/DBOptionsTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/DirectComparatorTest.java (97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/DirectSliceTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/FilterTest.java (96%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/FlushTest.java (97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/InfoLogLevelTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/KeyMayExistTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/MemTableTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/MergeTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/MixedOptionsTest.java (97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/OptionsTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/PlainTableConfigTest.java (95%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/PlatformRandomHelper.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ReadOnlyTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/ReadOptionsTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/RocksDBTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/RocksEnvTest.java (96%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/RocksIteratorTest.java (95%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/RocksMemoryResource.java (85%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/SliceTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/SnapshotTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/StatisticsCollectorTest.java (97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/StatsCallbackMock.java (93%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/TransactionLogIteratorTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/TtlDBTest.java (99%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/Types.java (97%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/WriteBatchHandlerTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/WriteBatchTest.java (89%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/WriteBatchWithIndexTest.java (95%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb}/WriteOptionsTest.java (94%) rename java/{ => src/test/java}/org/rocksdb/test/RocksJunitRunner.java (100%) rename java/{org/rocksdb/test => 
src/test/java/org/rocksdb/util}/EnvironmentTest.java (98%) rename java/{org/rocksdb/test => src/test/java/org/rocksdb/util}/SizeUnitTest.java (93%) diff --git a/.gitignore b/.gitignore index dfd3f4924..79e5c994c 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,8 @@ package/ tags java/out +java/target +java/test-libs java/*.log java/include/org_rocksdb_*.h diff --git a/java/Makefile b/java/Makefile index 8d75ee5e5..a43275c2c 100644 --- a/java/Makefile +++ b/java/Makefile @@ -36,12 +36,13 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ org.rocksdb.StringAppendOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ - org.rocksdb.test.WriteBatchInternal\ - org.rocksdb.test.WriteBatchTest\ - org.rocksdb.WriteOptions\ + org.rocksdb.WriteOptions\ org.rocksdb.WriteBatchWithIndex\ org.rocksdb.WBWIRocksIterator +NATIVE_JAVA_TEST_CLASSES = org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteBatchTestInternalHelper + ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) @@ -53,43 +54,49 @@ ifeq ($(PLATFORM), OS_MACOSX) ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar endif -JAVA_TESTS = org.rocksdb.test.BackupableDBOptionsTest\ - org.rocksdb.test.BackupableDBTest\ - org.rocksdb.test.BlockBasedTableConfigTest\ - org.rocksdb.test.CheckPointTest\ - org.rocksdb.test.ColumnFamilyOptionsTest\ - org.rocksdb.test.ColumnFamilyTest\ - org.rocksdb.test.ComparatorOptionsTest\ - org.rocksdb.test.ComparatorTest\ - org.rocksdb.test.CompressionOptionsTest\ - org.rocksdb.test.DBOptionsTest\ - org.rocksdb.test.DirectComparatorTest\ - org.rocksdb.test.DirectSliceTest\ - org.rocksdb.test.EnvironmentTest\ - org.rocksdb.test.FilterTest\ - org.rocksdb.test.FlushTest\ - org.rocksdb.test.InfoLogLevelTest\ - org.rocksdb.test.KeyMayExistTest\ - org.rocksdb.test.MemTableTest\ - org.rocksdb.test.MergeTest\ - org.rocksdb.test.MixedOptionsTest\ - org.rocksdb.test.OptionsTest\ - org.rocksdb.test.PlainTableConfigTest\ - org.rocksdb.test.ReadOnlyTest\ - org.rocksdb.test.ReadOptionsTest\ - org.rocksdb.test.RocksDBTest\ - org.rocksdb.test.RocksEnvTest\ - org.rocksdb.test.RocksIteratorTest\ - org.rocksdb.test.SizeUnitTest\ - org.rocksdb.test.SliceTest\ - org.rocksdb.test.SnapshotTest\ - org.rocksdb.test.TransactionLogIteratorTest\ - org.rocksdb.test.TtlDBTest\ - org.rocksdb.test.StatisticsCollectorTest\ - org.rocksdb.test.WriteBatchHandlerTest\ - org.rocksdb.test.WriteBatchTest\ - org.rocksdb.test.WriteOptionsTest\ - org.rocksdb.test.WriteBatchWithIndexTest +JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ + org.rocksdb.BackupableDBTest\ + org.rocksdb.BlockBasedTableConfigTest\ + org.rocksdb.CheckPointTest\ + org.rocksdb.ColumnFamilyOptionsTest\ + org.rocksdb.ColumnFamilyTest\ + org.rocksdb.ComparatorOptionsTest\ + org.rocksdb.ComparatorTest\ + org.rocksdb.CompressionOptionsTest\ + org.rocksdb.DBOptionsTest\ + org.rocksdb.DirectComparatorTest\ + org.rocksdb.DirectSliceTest\ + org.rocksdb.util.EnvironmentTest\ + org.rocksdb.FilterTest\ + org.rocksdb.FlushTest\ + org.rocksdb.InfoLogLevelTest\ + org.rocksdb.KeyMayExistTest\ + org.rocksdb.MemTableTest\ + org.rocksdb.MergeTest\ + org.rocksdb.MixedOptionsTest\ + org.rocksdb.OptionsTest\ + org.rocksdb.PlainTableConfigTest\ + org.rocksdb.ReadOnlyTest\ + org.rocksdb.ReadOptionsTest\ + 
org.rocksdb.RocksDBTest\ + org.rocksdb.RocksEnvTest\ + org.rocksdb.RocksIteratorTest\ + org.rocksdb.util.SizeUnitTest\ + org.rocksdb.SliceTest\ + org.rocksdb.SnapshotTest\ + org.rocksdb.TransactionLogIteratorTest\ + org.rocksdb.TtlDBTest\ + org.rocksdb.StatisticsCollectorTest\ + org.rocksdb.WriteBatchHandlerTest\ + org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteOptionsTest\ + org.rocksdb.WriteBatchWithIndexTest + +TEST_SRC = src/test/java +OUTPUT = target +# TODO update after moving main classes +MAIN_CLASSES = . +TEST_CLASSES = $(OUTPUT)/test-classes JAVA_TEST_LIBDIR = ./test-libs/ JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar @@ -97,14 +104,14 @@ JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)hamcrest-core-1.3.jar JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)mockito-all-1.9.5.jar JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)cglib-2.2.2.jar JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)assertj-core-1.7.0.jar -JAVA_TESTCLASSPATH = $(ROCKSDB_JAR):$(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR):.:./* +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR):.:./* clean: -find . -name "*.class" -exec rm {} \; rm -rf include/* rm -rf javadoc/* rm -rf test-libs/ - rm -rf target + rm -rf $(OUTPUT) rm -rf librocksdbjni* rm -f rocksdbjni* @@ -112,11 +119,10 @@ clean: javadocs: mkdir -p javadoc; javadoc -d javadoc -sourcepath . -subpackages org -exclude org.rocksdb.test -javalib: java javadocs +javalib: java java_test javadocs -java: resolve_test_deps +java: javac org/rocksdb/util/*.java org/rocksdb/*.java - javac -cp $(JAVA_TESTCLASSPATH) org/rocksdb/test/*.java @cp ../HISTORY.md ./HISTORY-CPP.md @rm -f ./HISTORY-CPP.md javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) @@ -143,8 +149,15 @@ resolve_test_deps: test -s "$(JAVA_CGLIB_JAR)" || curl -k -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar -test: java resolve_test_deps - java -ea -Xcheck:jni -Djava.library.path=.:../ -cp "$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) +java_test: resolve_test_deps + mkdir -p $(TEST_CLASSES) + javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/*.java + javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/test/*.java + javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/util/*.java + javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) + +test: java resolve_test_deps java_test + java -ea -Xcheck:jni -Djava.library.path=.:../ -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index 20eb55407..aa0c2309a 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -9,8 +9,6 @@ #include "include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" -#include "include/org_rocksdb_test_WriteBatchInternal.h" -#include "include/org_rocksdb_test_WriteBatchTest.h" #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "rocksdb/db.h" @@ -217,48 +215,6 @@ void 
Java_org_rocksdb_WriteBatch_disposeInternal( delete reinterpret_cast(handle); } -/* - * Class: org_rocksdb_test_WriteBatchInternal - * Method: setSequence - * Signature: (Lorg/rocksdb/WriteBatch;J)V - */ -void Java_org_rocksdb_test_WriteBatchInternal_setSequence( - JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(wb != nullptr); - - rocksdb::WriteBatchInternal::SetSequence( - wb, static_cast(jsn)); -} - -/* - * Class: org_rocksdb_test_WriteBatchInternal - * Method: sequence - * Signature: (Lorg/rocksdb/WriteBatch;)J - */ -jlong Java_org_rocksdb_test_WriteBatchInternal_sequence( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(wb != nullptr); - - return static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); -} - -/* - * Class: org_rocksdb_test_WriteBatchInternal - * Method: append - * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V - */ -void Java_org_rocksdb_test_WriteBatchInternal_append( - JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { - rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); - assert(wb1 != nullptr); - rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); - assert(wb2 != nullptr); - - rocksdb::WriteBatchInternal::Append(wb1, wb2); -} - /* * Class: org_rocksdb_WriteBatch_Handler * Method: createNewHandler0 @@ -280,83 +236,3 @@ void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( JNIEnv* env, jobject jobj, jlong handle) { delete reinterpret_cast(handle); } - -/* - * Class: org_rocksdb_test_WriteBatchTest - * Method: getContents - * Signature: (Lorg/rocksdb/WriteBatch;)[B - */ -jbyteArray Java_org_rocksdb_test_WriteBatchTest_getContents( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(b != nullptr); - - // todo: Currently the following code is directly copied from - // db/write_bench_test.cc. It could be implemented in java once - // all the necessary components can be accessed via jni api. 
- - rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); - auto factory = std::make_shared(); - rocksdb::Options options; - rocksdb::WriteBuffer wb(options.db_write_buffer_size); - options.memtable_factory = factory; - rocksdb::MemTable* mem = new rocksdb::MemTable( - cmp, rocksdb::ImmutableCFOptions(options), - rocksdb::MutableCFOptions(options, rocksdb::ImmutableCFOptions(options)), - &wb); - mem->Ref(); - std::string state; - rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); - rocksdb::Status s = - rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); - int count = 0; - rocksdb::Arena arena; - rocksdb::ScopedArenaIterator iter(mem->NewIterator( - rocksdb::ReadOptions(), &arena)); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - rocksdb::ParsedInternalKey ikey; - memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); - ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case rocksdb::kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - count++; - break; - case rocksdb::kTypeMerge: - state.append("Merge("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - count++; - break; - case rocksdb::kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - count++; - break; - default: - assert(false); - break; - } - state.append("@"); - state.append(rocksdb::NumberToString(ikey.sequence)); - } - if (!s.ok()) { - state.append(s.ToString()); - } else if (count != rocksdb::WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); - } - delete mem->Unref(); - - jbyteArray jstate = env->NewByteArray(static_cast(state.size())); - env->SetByteArrayRegion(jstate, 0, static_cast(state.size()), - reinterpret_cast(state.c_str())); - - return jstate; -} diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc new file mode 100644 index 000000000..d78178211 --- /dev/null +++ b/java/rocksjni/write_batch_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::WriteBatch methods testing from Java side. 
+#include + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "db/writebuffer.h" +#include "include/org_rocksdb_WriteBatch.h" +#include "include/org_rocksdb_WriteBatch_Handler.h" +#include "include/org_rocksdb_WriteBatchTest.h" +#include "include/org_rocksdb_WriteBatchTestInternalHelper.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksjni/portal.h" +#include "util/logging.h" +#include "util/scoped_arena_iterator.h" +#include "util/testharness.h" + +/* + * Class: org_rocksdb_WriteBatchTest + * Method: getContents + * Signature: (Lorg/rocksdb/WriteBatch;)[B + */ +jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(b != nullptr); + + // todo: Currently the following code is directly copied from + // db/write_bench_test.cc. It could be implemented in java once + // all the necessary components can be accessed via jni api. + + rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); + auto factory = std::make_shared(); + rocksdb::Options options; + rocksdb::WriteBuffer wb(options.db_write_buffer_size); + options.memtable_factory = factory; + rocksdb::MemTable* mem = new rocksdb::MemTable( + cmp, rocksdb::ImmutableCFOptions(options), + rocksdb::MutableCFOptions(options, rocksdb::ImmutableCFOptions(options)), + &wb); + mem->Ref(); + std::string state; + rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); + rocksdb::Status s = + rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); + int count = 0; + rocksdb::Arena arena; + rocksdb::ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), &arena)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + rocksdb::ParsedInternalKey ikey; + memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); + ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case rocksdb::kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + default: + assert(false); + break; + } + state.append("@"); + state.append(rocksdb::NumberToString(ikey.sequence)); + } + if (!s.ok()) { + state.append(s.ToString()); + } else if (count != rocksdb::WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + + jbyteArray jstate = env->NewByteArray(static_cast(state.size())); + env->SetByteArrayRegion(jstate, 0, static_cast(state.size()), + reinterpret_cast(state.c_str())); + + return jstate; +} + +/* + * Class: org_rocksdb_WriteBatchTestInternalHelper + * Method: setSequence + * Signature: (Lorg/rocksdb/WriteBatch;J)V + */ +void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( + JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + rocksdb::WriteBatchInternal::SetSequence( + wb, static_cast(jsn)); +} + 
+/* + * Class: org_rocksdb_WriteBatchTestInternalHelper + * Method: sequence + * Signature: (Lorg/rocksdb/WriteBatch;)J + */ +jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + return static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); +} + +/* + * Class: org_rocksdb_WriteBatchTestInternalHelper + * Method: append + * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V + */ +void Java_org_rocksdb_WriteBatchTestInternalHelper_append( + JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { + rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); + assert(wb1 != nullptr); + rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); + assert(wb2 != nullptr); + + rocksdb::WriteBatchInternal::Append(wb1, wb2); +} diff --git a/java/org/rocksdb/test/AbstractComparatorTest.java b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java similarity index 97% rename from java/org/rocksdb/test/AbstractComparatorTest.java rename to java/src/test/java/org/rocksdb/AbstractComparatorTest.java index f0281a521..97afb48d1 100644 --- a/java/org/rocksdb/test/AbstractComparatorTest.java +++ b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java @@ -3,9 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; - -import org.rocksdb.*; +package org.rocksdb; import java.io.IOException; import java.nio.file.*; @@ -14,8 +12,8 @@ import java.util.List; import java.util.Random; import static org.assertj.core.api.Assertions.assertThat; -import static org.rocksdb.test.Types.byteToInt; -import static org.rocksdb.test.Types.intToByte; +import static org.rocksdb.Types.byteToInt; +import static org.rocksdb.Types.intToByte; /** * Abstract tests for both Comparator and DirectComparator diff --git a/java/org/rocksdb/test/BackupableDBOptionsTest.java b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java similarity index 99% rename from java/org/rocksdb/test/BackupableDBOptionsTest.java rename to java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java index b7bdc0011..6fe3bd2f0 100644 --- a/java/org/rocksdb/test/BackupableDBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java @@ -3,13 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; -import org.rocksdb.BackupableDBOptions; import java.util.Random; diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/src/test/java/org/rocksdb/BackupableDBTest.java similarity index 99% rename from java/org/rocksdb/test/BackupableDBTest.java rename to java/src/test/java/org/rocksdb/BackupableDBTest.java index 2ac2abfa1..3f358bdb7 100644 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ b/java/src/test/java/org/rocksdb/BackupableDBTest.java @@ -3,13 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.util.List; diff --git a/java/org/rocksdb/test/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java similarity index 99% rename from java/org/rocksdb/test/BlockBasedTableConfigTest.java rename to java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 1172effc8..aacf44054 100644 --- a/java/org/rocksdb/test/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/CheckPointTest.java b/java/src/test/java/org/rocksdb/CheckPointTest.java similarity index 94% rename from java/org/rocksdb/test/CheckPointTest.java rename to java/src/test/java/org/rocksdb/CheckPointTest.java index 3891e062e..3081e585a 100644 --- a/java/org/rocksdb/test/CheckPointTest.java +++ b/java/src/test/java/org/rocksdb/CheckPointTest.java @@ -1,14 +1,10 @@ -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.Checkpoint; -import org.rocksdb.Options; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java similarity index 99% rename from java/org/rocksdb/test/ColumnFamilyOptionsTest.java rename to java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java index aae9b5749..4082c602d 100644 --- a/java/org/rocksdb/test/ColumnFamilyOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import java.util.Properties; import java.util.Random; diff --git a/java/org/rocksdb/test/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java similarity index 99% rename from java/org/rocksdb/test/ColumnFamilyTest.java rename to java/src/test/java/org/rocksdb/ColumnFamilyTest.java index bf568b5e8..9a860ebe8 100644 --- a/java/org/rocksdb/test/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import java.util.HashMap; import java.util.List; @@ -14,7 +14,6 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/ComparatorOptionsTest.java b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java similarity index 93% rename from java/org/rocksdb/test/ComparatorOptionsTest.java rename to java/src/test/java/org/rocksdb/ComparatorOptionsTest.java index 1064910df..4f8a7d1a6 100644 --- a/java/org/rocksdb/test/ComparatorOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.ComparatorOptions; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/ComparatorTest.java b/java/src/test/java/org/rocksdb/ComparatorTest.java similarity index 99% rename from java/org/rocksdb/test/ComparatorTest.java rename to java/src/test/java/org/rocksdb/ComparatorTest.java index e1bba6a7f..e689a9cf5 100644 --- a/java/org/rocksdb/test/ComparatorTest.java +++ b/java/src/test/java/org/rocksdb/ComparatorTest.java @@ -3,13 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.io.IOException; import java.nio.file.FileSystems; diff --git a/java/org/rocksdb/test/CompressionOptionsTest.java b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java similarity index 91% rename from java/org/rocksdb/test/CompressionOptionsTest.java rename to java/src/test/java/org/rocksdb/CompressionOptionsTest.java index f8aff9268..bff4d5f6c 100644 --- a/java/org/rocksdb/test/CompressionOptionsTest.java +++ b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java @@ -3,10 +3,9 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.Test; -import org.rocksdb.CompressionType; public class CompressionOptionsTest diff --git a/java/org/rocksdb/test/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java similarity index 99% rename from java/org/rocksdb/test/DBOptionsTest.java rename to java/src/test/java/org/rocksdb/DBOptionsTest.java index 858379768..9dab55955 100644 --- a/java/org/rocksdb/test/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import java.util.Properties; import java.util.Random; diff --git a/java/org/rocksdb/test/DirectComparatorTest.java b/java/src/test/java/org/rocksdb/DirectComparatorTest.java similarity index 97% rename from java/org/rocksdb/test/DirectComparatorTest.java rename to java/src/test/java/org/rocksdb/DirectComparatorTest.java index 328ea0089..be84d6647 100644 --- a/java/org/rocksdb/test/DirectComparatorTest.java +++ b/java/src/test/java/org/rocksdb/DirectComparatorTest.java @@ -3,13 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.io.IOException; import java.nio.file.FileSystems; diff --git a/java/org/rocksdb/test/DirectSliceTest.java b/java/src/test/java/org/rocksdb/DirectSliceTest.java similarity index 98% rename from java/org/rocksdb/test/DirectSliceTest.java rename to java/src/test/java/org/rocksdb/DirectSliceTest.java index 20a44a904..123eed2e7 100644 --- a/java/org/rocksdb/test/DirectSliceTest.java +++ b/java/src/test/java/org/rocksdb/DirectSliceTest.java @@ -2,11 +2,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.DirectSlice; import java.nio.ByteBuffer; diff --git a/java/org/rocksdb/test/FilterTest.java b/java/src/test/java/org/rocksdb/FilterTest.java similarity index 96% rename from java/org/rocksdb/test/FilterTest.java rename to java/src/test/java/org/rocksdb/FilterTest.java index da4783fbf..36ce37970 100644 --- a/java/org/rocksdb/test/FilterTest.java +++ b/java/src/test/java/org/rocksdb/FilterTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; public class FilterTest { diff --git a/java/org/rocksdb/test/FlushTest.java b/java/src/test/java/org/rocksdb/FlushTest.java similarity index 97% rename from java/org/rocksdb/test/FlushTest.java rename to java/src/test/java/org/rocksdb/FlushTest.java index 3bfdb3114..94a32d383 100644 --- a/java/org/rocksdb/test/FlushTest.java +++ b/java/src/test/java/org/rocksdb/FlushTest.java @@ -2,13 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/InfoLogLevelTest.java b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java similarity index 98% rename from java/org/rocksdb/test/InfoLogLevelTest.java rename to java/src/test/java/org/rocksdb/InfoLogLevelTest.java index 82bf485de..39d1ddd1d 100644 --- a/java/org/rocksdb/test/InfoLogLevelTest.java +++ b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -1,10 +1,9 @@ -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.io.IOException; diff --git a/java/org/rocksdb/test/KeyMayExistTest.java b/java/src/test/java/org/rocksdb/KeyMayExistTest.java similarity index 98% rename from java/org/rocksdb/test/KeyMayExistTest.java rename to java/src/test/java/org/rocksdb/KeyMayExistTest.java index 921a6593c..f29c2f872 100644 --- a/java/org/rocksdb/test/KeyMayExistTest.java +++ b/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -2,13 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.util.ArrayList; import java.util.List; diff --git a/java/org/rocksdb/test/MemTableTest.java b/java/src/test/java/org/rocksdb/MemTableTest.java similarity index 98% rename from java/org/rocksdb/test/MemTableTest.java rename to java/src/test/java/org/rocksdb/MemTableTest.java index dbf6b0bef..bfc898c42 100644 --- a/java/org/rocksdb/test/MemTableTest.java +++ b/java/src/test/java/org/rocksdb/MemTableTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java similarity index 99% rename from java/org/rocksdb/test/MergeTest.java rename to java/src/test/java/org/rocksdb/MergeTest.java index 9bb882e44..55e8a20cd 100644 --- a/java/org/rocksdb/test/MergeTest.java +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import java.util.List; import java.util.ArrayList; @@ -12,7 +12,6 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/MixedOptionsTest.java b/java/src/test/java/org/rocksdb/MixedOptionsTest.java similarity index 97% rename from java/org/rocksdb/test/MixedOptionsTest.java rename to java/src/test/java/org/rocksdb/MixedOptionsTest.java index 528bea2e3..f095e99d8 100644 --- a/java/org/rocksdb/test/MixedOptionsTest.java +++ b/java/src/test/java/org/rocksdb/MixedOptionsTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java similarity index 99% rename from java/org/rocksdb/test/OptionsTest.java rename to java/src/test/java/org/rocksdb/OptionsTest.java index 0e699c406..5b84d2510 100644 --- a/java/org/rocksdb/test/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -3,12 +3,11 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import java.util.Random; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/PlainTableConfigTest.java b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java similarity index 95% rename from java/org/rocksdb/test/PlainTableConfigTest.java rename to java/src/test/java/org/rocksdb/PlainTableConfigTest.java index 72347e7d4..850b050a0 100644 --- a/java/org/rocksdb/test/PlainTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java @@ -3,13 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.EncodingType; -import org.rocksdb.Options; -import org.rocksdb.PlainTableConfig; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/PlatformRandomHelper.java b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java similarity index 98% rename from java/org/rocksdb/test/PlatformRandomHelper.java rename to java/src/test/java/org/rocksdb/PlatformRandomHelper.java index d43f4a4f0..0155ce263 100644 --- a/java/org/rocksdb/test/PlatformRandomHelper.java +++ b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import java.util.Random; diff --git a/java/org/rocksdb/test/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java similarity index 99% rename from java/org/rocksdb/test/ReadOnlyTest.java rename to java/src/test/java/org/rocksdb/ReadOnlyTest.java index fce704eb5..a254481e5 100644 --- a/java/org/rocksdb/test/ReadOnlyTest.java +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -2,13 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.util.ArrayList; import java.util.List; diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java similarity index 98% rename from java/org/rocksdb/test/ReadOptionsTest.java rename to java/src/test/java/org/rocksdb/ReadOptionsTest.java index 2cf1584a1..af88ce351 100644 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import java.util.Random; @@ -11,7 +11,6 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; -import org.rocksdb.ReadOptions; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java similarity index 99% rename from java/org/rocksdb/test/RocksDBTest.java rename to java/src/test/java/org/rocksdb/RocksDBTest.java index 15dde9856..100db529d 100644 --- a/java/org/rocksdb/test/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -2,13 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.util.ArrayList; import java.util.List; diff --git a/java/org/rocksdb/test/RocksEnvTest.java b/java/src/test/java/org/rocksdb/RocksEnvTest.java similarity index 96% rename from java/org/rocksdb/test/RocksEnvTest.java rename to java/src/test/java/org/rocksdb/RocksEnvTest.java index f55e9042e..6b0b9becc 100644 --- a/java/org/rocksdb/test/RocksEnvTest.java +++ b/java/src/test/java/org/rocksdb/RocksEnvTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.RocksEnv; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/RocksIteratorTest.java b/java/src/test/java/org/rocksdb/RocksIteratorTest.java similarity index 95% rename from java/org/rocksdb/test/RocksIteratorTest.java rename to java/src/test/java/org/rocksdb/RocksIteratorTest.java index 448e8f397..c5918d8ac 100644 --- a/java/org/rocksdb/test/RocksIteratorTest.java +++ b/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -2,16 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.Options; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; -import org.rocksdb.RocksIterator; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/RocksMemoryResource.java b/java/src/test/java/org/rocksdb/RocksMemoryResource.java similarity index 85% rename from java/org/rocksdb/test/RocksMemoryResource.java rename to java/src/test/java/org/rocksdb/RocksMemoryResource.java index 51164ad65..de9ba0d6b 100644 --- a/java/org/rocksdb/test/RocksMemoryResource.java +++ b/java/src/test/java/org/rocksdb/RocksMemoryResource.java @@ -1,7 +1,6 @@ -package org.rocksdb.test; +package org.rocksdb; import org.junit.rules.ExternalResource; -import org.rocksdb.RocksDB; /** * Resource to trigger garbage collection after each test diff --git a/java/org/rocksdb/test/SliceTest.java b/java/src/test/java/org/rocksdb/SliceTest.java similarity index 98% rename from java/org/rocksdb/test/SliceTest.java rename to java/src/test/java/org/rocksdb/SliceTest.java index 4b04172f8..16221ef65 100644 --- a/java/org/rocksdb/test/SliceTest.java +++ b/java/src/test/java/org/rocksdb/SliceTest.java @@ -2,11 +2,10 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.Slice; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/SnapshotTest.java b/java/src/test/java/org/rocksdb/SnapshotTest.java similarity index 99% rename from java/org/rocksdb/test/SnapshotTest.java rename to java/src/test/java/org/rocksdb/SnapshotTest.java index 4aeef44ef..87ccdbcb5 100644 --- a/java/org/rocksdb/test/SnapshotTest.java +++ b/java/src/test/java/org/rocksdb/SnapshotTest.java @@ -2,13 +2,12 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java similarity index 97% rename from java/org/rocksdb/test/StatisticsCollectorTest.java rename to java/src/test/java/org/rocksdb/StatisticsCollectorTest.java index ba84857ba..927826d71 100644 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; import java.util.Collections; @@ -11,7 +11,6 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/StatsCallbackMock.java b/java/src/test/java/org/rocksdb/StatsCallbackMock.java similarity index 93% rename from java/org/rocksdb/test/StatsCallbackMock.java rename to java/src/test/java/org/rocksdb/StatsCallbackMock.java index 4ad2fb7b7..3c5800e42 100644 --- a/java/org/rocksdb/test/StatsCallbackMock.java +++ b/java/src/test/java/org/rocksdb/StatsCallbackMock.java @@ -3,9 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; - -import org.rocksdb.*; +package org.rocksdb; public class StatsCallbackMock implements StatisticsCollectorCallback { public int tickerCallbackCount = 0; diff --git a/java/org/rocksdb/test/TransactionLogIteratorTest.java b/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java similarity index 99% rename from java/org/rocksdb/test/TransactionLogIteratorTest.java rename to java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java index 6d700dac9..1de2efdea 100644 --- a/java/org/rocksdb/test/TransactionLogIteratorTest.java +++ b/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java @@ -1,10 +1,9 @@ -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/TtlDBTest.java b/java/src/test/java/org/rocksdb/TtlDBTest.java similarity index 99% rename from java/org/rocksdb/test/TtlDBTest.java rename to java/src/test/java/org/rocksdb/TtlDBTest.java index 56f7ebc1a..0b816d66a 100644 --- a/java/org/rocksdb/test/TtlDBTest.java +++ b/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -3,13 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.*; import java.util.ArrayList; import java.util.List; diff --git a/java/org/rocksdb/test/Types.java b/java/src/test/java/org/rocksdb/Types.java similarity index 97% rename from java/org/rocksdb/test/Types.java rename to java/src/test/java/org/rocksdb/Types.java index 22fcd3537..5ad35f463 100644 --- a/java/org/rocksdb/test/Types.java +++ b/java/src/test/java/org/rocksdb/Types.java @@ -3,7 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb; /** * Simple type conversion methods diff --git a/java/org/rocksdb/test/WriteBatchHandlerTest.java b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java similarity index 98% rename from java/org/rocksdb/test/WriteBatchHandlerTest.java rename to java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java index ca26c9275..b09cc9259 100644 --- a/java/org/rocksdb/test/WriteBatchHandlerTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -3,10 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; - -import org.rocksdb.RocksDBException; -import org.rocksdb.WriteBatch; +package org.rocksdb; import java.io.IOException; import java.util.ArrayList; diff --git a/java/org/rocksdb/test/WriteBatchTest.java b/java/src/test/java/org/rocksdb/WriteBatchTest.java similarity index 89% rename from java/org/rocksdb/test/WriteBatchTest.java rename to java/src/test/java/org/rocksdb/WriteBatchTest.java index cf855c121..89a9d5405 100644 --- a/java/org/rocksdb/test/WriteBatchTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -6,13 +6,12 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.WriteBatch; import java.io.UnsupportedEncodingException; @@ -48,8 +47,8 @@ public class WriteBatchTest { batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); batch.remove("box".getBytes("US-ASCII")); batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - WriteBatchInternal.setSequence(batch, 100); - assertThat(WriteBatchInternal.sequence(batch)). + WriteBatchTestInternalHelper.setSequence(batch, 100); + assertThat(WriteBatchTestInternalHelper.sequence(batch)). isNotNull(). 
isEqualTo(100); assertThat(batch.count()).isEqualTo(3); @@ -64,24 +63,24 @@ public class WriteBatchTest { throws UnsupportedEncodingException { WriteBatch b1 = new WriteBatch(); WriteBatch b2 = new WriteBatch(); - WriteBatchInternal.setSequence(b1, 200); - WriteBatchInternal.setSequence(b2, 300); - WriteBatchInternal.append(b1, b2); + WriteBatchTestInternalHelper.setSequence(b1, 200); + WriteBatchTestInternalHelper.setSequence(b2, 300); + WriteBatchTestInternalHelper.append(b1, b2); assertThat(getContents(b1).length).isEqualTo(0); assertThat(b1.count()).isEqualTo(0); b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); + WriteBatchTestInternalHelper.append(b1, b2); assertThat("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); assertThat(b1.count()).isEqualTo(1); b2.clear(); b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); + WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@201") .equals(new String(getContents(b1), "US-ASCII"))); assertThat(b1.count()).isEqualTo(2); b2.remove("foo".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); + WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + @@ -117,7 +116,7 @@ public class WriteBatchTest { * Package-private class which provides java api to access * c++ WriteBatchInternal. */ -class WriteBatchInternal { +class WriteBatchTestInternalHelper { static native void setSequence(WriteBatch batch, long sn); static native long sequence(WriteBatch batch); static native void append(WriteBatch b1, WriteBatch b2); diff --git a/java/org/rocksdb/test/WriteBatchWithIndexTest.java b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java similarity index 95% rename from java/org/rocksdb/test/WriteBatchWithIndexTest.java rename to java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java index dac3f1110..f7eed556a 100644 --- a/java/org/rocksdb/test/WriteBatchWithIndexTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -7,20 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.rocksdb.WriteBatchWithIndex; -import org.rocksdb.DirectSlice; -import org.rocksdb.Options; -import org.rocksdb.RocksDB; -import org.rocksdb.RocksDBException; -import org.rocksdb.RocksIterator; -import org.rocksdb.WriteOptions; -import org.rocksdb.WBWIRocksIterator; import java.nio.ByteBuffer; import java.util.ArrayDeque; diff --git a/java/org/rocksdb/test/WriteOptionsTest.java b/java/src/test/java/org/rocksdb/WriteOptionsTest.java similarity index 94% rename from java/org/rocksdb/test/WriteOptionsTest.java rename to java/src/test/java/org/rocksdb/WriteOptionsTest.java index 70a68335d..4d8e6d97e 100644 --- a/java/org/rocksdb/test/WriteOptionsTest.java +++ b/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -3,11 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb; import org.junit.ClassRule; import org.junit.Test; -import org.rocksdb.WriteOptions; import static org.assertj.core.api.Assertions.assertThat; diff --git a/java/org/rocksdb/test/RocksJunitRunner.java b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java similarity index 100% rename from java/org/rocksdb/test/RocksJunitRunner.java rename to java/src/test/java/org/rocksdb/test/RocksJunitRunner.java diff --git a/java/org/rocksdb/test/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java similarity index 98% rename from java/org/rocksdb/test/EnvironmentTest.java rename to java/src/test/java/org/rocksdb/util/EnvironmentTest.java index b5af069da..b8e22bce1 100644 --- a/java/org/rocksdb/test/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -2,10 +2,9 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; +package org.rocksdb.util; import org.junit.Test; -import org.rocksdb.util.Environment; import java.lang.reflect.Field; import java.lang.reflect.Modifier; diff --git a/java/org/rocksdb/test/SizeUnitTest.java b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java similarity index 93% rename from java/org/rocksdb/test/SizeUnitTest.java rename to java/src/test/java/org/rocksdb/util/SizeUnitTest.java index 16f636267..517e1b2b5 100644 --- a/java/org/rocksdb/test/SizeUnitTest.java +++ b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java @@ -2,10 +2,9 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-package org.rocksdb.test; +package org.rocksdb.util; import org.junit.Test; -import org.rocksdb.util.SizeUnit; import static org.assertj.core.api.Assertions.assertThat; From 353db6daef28ebe4b07c9933652af7f13fef8489 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 22:23:59 +0000 Subject: [PATCH 787/829] Moved Java main classes into src/main/java --- Makefile | 39 +++++++++++-------- java/Makefile | 28 ++++++------- java/crossbuild/build-linux-centos.sh | 4 +- java/crossbuild/build-linux.sh | 4 +- .../java}/org/rocksdb/AbstractComparator.java | 0 .../org/rocksdb/AbstractRocksIterator.java | 0 .../main/java}/org/rocksdb/AbstractSlice.java | 0 .../java}/org/rocksdb/AbstractWriteBatch.java | 0 .../main/java}/org/rocksdb/BackupInfo.java | 0 .../main/java}/org/rocksdb/BackupableDB.java | 0 .../org/rocksdb/BackupableDBOptions.java | 0 .../org/rocksdb/BlockBasedTableConfig.java | 0 .../main/java}/org/rocksdb/BloomFilter.java | 0 .../java}/org/rocksdb/BuiltinComparator.java | 0 .../main/java}/org/rocksdb/Checkpoint.java | 0 .../main/java}/org/rocksdb/ChecksumType.java | 0 .../org/rocksdb/ColumnFamilyDescriptor.java | 0 .../java}/org/rocksdb/ColumnFamilyHandle.java | 0 .../org/rocksdb/ColumnFamilyOptions.java | 0 .../rocksdb/ColumnFamilyOptionsInterface.java | 0 .../java}/org/rocksdb/CompactionStyle.java | 0 .../main/java}/org/rocksdb/Comparator.java | 0 .../java}/org/rocksdb/ComparatorOptions.java | 0 .../java}/org/rocksdb/CompressionType.java | 0 .../main/java}/org/rocksdb/DBOptions.java | 0 .../java}/org/rocksdb/DBOptionsInterface.java | 0 .../java}/org/rocksdb/DirectComparator.java | 0 .../main/java}/org/rocksdb/DirectSlice.java | 0 .../main/java}/org/rocksdb/EncodingType.java | 0 .../main/java}/org/rocksdb/Filter.java | 0 .../main/java}/org/rocksdb/FlushOptions.java | 0 .../org/rocksdb/GenericRateLimiterConfig.java | 0 .../rocksdb/HashLinkedListMemTableConfig.java | 0 .../rocksdb/HashSkipListMemTableConfig.java | 0 .../main/java}/org/rocksdb/HistogramData.java | 0 .../main/java}/org/rocksdb/HistogramType.java | 0 .../main/java}/org/rocksdb/IndexType.java | 0 .../main/java}/org/rocksdb/InfoLogLevel.java | 0 .../java}/org/rocksdb/MemTableConfig.java | 0 .../main/java}/org/rocksdb/MergeOperator.java | 0 .../org/rocksdb/NativeLibraryLoader.java | 0 .../main/java}/org/rocksdb/Options.java | 0 .../java}/org/rocksdb/PlainTableConfig.java | 0 .../java}/org/rocksdb/RateLimiterConfig.java | 0 .../main/java}/org/rocksdb/ReadOptions.java | 0 .../org/rocksdb/RestoreBackupableDB.java | 0 .../java}/org/rocksdb/RestoreOptions.java | 0 .../main/java}/org/rocksdb/RocksDB.java | 0 .../java}/org/rocksdb/RocksDBException.java | 0 .../main/java}/org/rocksdb/RocksEnv.java | 0 .../main/java}/org/rocksdb/RocksIterator.java | 0 .../org/rocksdb/RocksIteratorInterface.java | 0 .../main/java}/org/rocksdb/RocksObject.java | 0 .../org/rocksdb/SkipListMemTableConfig.java | 0 .../main/java}/org/rocksdb/Slice.java | 0 .../main/java}/org/rocksdb/Snapshot.java | 0 .../main/java}/org/rocksdb/Statistics.java | 0 .../org/rocksdb/StatisticsCollector.java | 0 .../rocksdb/StatisticsCollectorCallback.java | 0 .../org/rocksdb/StatsCollectorInput.java | 0 .../org/rocksdb/StringAppendOperator.java | 0 .../java}/org/rocksdb/TableFormatConfig.java | 0 .../main/java}/org/rocksdb/TickerType.java | 0 .../org/rocksdb/TransactionLogIterator.java | 0 .../main/java}/org/rocksdb/TtlDB.java | 0 .../org/rocksdb/VectorMemTableConfig.java | 0 .../java}/org/rocksdb/WBWIRocksIterator.java | 0 .../main/java}/org/rocksdb/WriteBatch.java | 0 
.../org/rocksdb/WriteBatchInterface.java | 0 .../org/rocksdb/WriteBatchWithIndex.java | 0 .../main/java}/org/rocksdb/WriteOptions.java | 0 .../java}/org/rocksdb/util/Environment.java | 0 .../main/java}/org/rocksdb/util/SizeUnit.java | 0 73 files changed, 41 insertions(+), 34 deletions(-) rename java/{ => src/main/java}/org/rocksdb/AbstractComparator.java (100%) rename java/{ => src/main/java}/org/rocksdb/AbstractRocksIterator.java (100%) rename java/{ => src/main/java}/org/rocksdb/AbstractSlice.java (100%) rename java/{ => src/main/java}/org/rocksdb/AbstractWriteBatch.java (100%) rename java/{ => src/main/java}/org/rocksdb/BackupInfo.java (100%) rename java/{ => src/main/java}/org/rocksdb/BackupableDB.java (100%) rename java/{ => src/main/java}/org/rocksdb/BackupableDBOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/BlockBasedTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/BloomFilter.java (100%) rename java/{ => src/main/java}/org/rocksdb/BuiltinComparator.java (100%) rename java/{ => src/main/java}/org/rocksdb/Checkpoint.java (100%) rename java/{ => src/main/java}/org/rocksdb/ChecksumType.java (100%) rename java/{ => src/main/java}/org/rocksdb/ColumnFamilyDescriptor.java (100%) rename java/{ => src/main/java}/org/rocksdb/ColumnFamilyHandle.java (100%) rename java/{ => src/main/java}/org/rocksdb/ColumnFamilyOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/ColumnFamilyOptionsInterface.java (100%) rename java/{ => src/main/java}/org/rocksdb/CompactionStyle.java (100%) rename java/{ => src/main/java}/org/rocksdb/Comparator.java (100%) rename java/{ => src/main/java}/org/rocksdb/ComparatorOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/CompressionType.java (100%) rename java/{ => src/main/java}/org/rocksdb/DBOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/DBOptionsInterface.java (100%) rename java/{ => src/main/java}/org/rocksdb/DirectComparator.java (100%) rename java/{ => src/main/java}/org/rocksdb/DirectSlice.java (100%) rename java/{ => src/main/java}/org/rocksdb/EncodingType.java (100%) rename java/{ => src/main/java}/org/rocksdb/Filter.java (100%) rename java/{ => src/main/java}/org/rocksdb/FlushOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/GenericRateLimiterConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/HashLinkedListMemTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/HashSkipListMemTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/HistogramData.java (100%) rename java/{ => src/main/java}/org/rocksdb/HistogramType.java (100%) rename java/{ => src/main/java}/org/rocksdb/IndexType.java (100%) rename java/{ => src/main/java}/org/rocksdb/InfoLogLevel.java (100%) rename java/{ => src/main/java}/org/rocksdb/MemTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/MergeOperator.java (100%) rename java/{ => src/main/java}/org/rocksdb/NativeLibraryLoader.java (100%) rename java/{ => src/main/java}/org/rocksdb/Options.java (100%) rename java/{ => src/main/java}/org/rocksdb/PlainTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/RateLimiterConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/ReadOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/RestoreBackupableDB.java (100%) rename java/{ => src/main/java}/org/rocksdb/RestoreOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/RocksDB.java (100%) rename java/{ => src/main/java}/org/rocksdb/RocksDBException.java (100%) rename 
java/{ => src/main/java}/org/rocksdb/RocksEnv.java (100%) rename java/{ => src/main/java}/org/rocksdb/RocksIterator.java (100%) rename java/{ => src/main/java}/org/rocksdb/RocksIteratorInterface.java (100%) rename java/{ => src/main/java}/org/rocksdb/RocksObject.java (100%) rename java/{ => src/main/java}/org/rocksdb/SkipListMemTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/Slice.java (100%) rename java/{ => src/main/java}/org/rocksdb/Snapshot.java (100%) rename java/{ => src/main/java}/org/rocksdb/Statistics.java (100%) rename java/{ => src/main/java}/org/rocksdb/StatisticsCollector.java (100%) rename java/{ => src/main/java}/org/rocksdb/StatisticsCollectorCallback.java (100%) rename java/{ => src/main/java}/org/rocksdb/StatsCollectorInput.java (100%) rename java/{ => src/main/java}/org/rocksdb/StringAppendOperator.java (100%) rename java/{ => src/main/java}/org/rocksdb/TableFormatConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/TickerType.java (100%) rename java/{ => src/main/java}/org/rocksdb/TransactionLogIterator.java (100%) rename java/{ => src/main/java}/org/rocksdb/TtlDB.java (100%) rename java/{ => src/main/java}/org/rocksdb/VectorMemTableConfig.java (100%) rename java/{ => src/main/java}/org/rocksdb/WBWIRocksIterator.java (100%) rename java/{ => src/main/java}/org/rocksdb/WriteBatch.java (100%) rename java/{ => src/main/java}/org/rocksdb/WriteBatchInterface.java (100%) rename java/{ => src/main/java}/org/rocksdb/WriteBatchWithIndex.java (100%) rename java/{ => src/main/java}/org/rocksdb/WriteOptions.java (100%) rename java/{ => src/main/java}/org/rocksdb/util/Environment.java (100%) rename java/{ => src/main/java}/org/rocksdb/util/SizeUnit.java (100%) diff --git a/Makefile b/Makefile index 1c0dea975..724be1d3c 100644 --- a/Makefile +++ b/Makefile @@ -631,35 +631,40 @@ libsnappy.a: rocksdbjavastatic: libz.a libbz2.a libsnappy.a OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j cd java;$(MAKE) javalib; - rm -f ./java/$(ROCKSDBJNILIB) - $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a - cd java;strip -S -x $(ROCKSDBJNILIB) - cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - cd java/javadoc;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * - cd java;jar -cf $(ROCKSDB_SOURCES_JAR) org + rm -f ./java/target/$(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a + cd java/target;strip -S -x $(ROCKSDBJNILIB) + cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * + cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 - cd java;jar -cf $(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md librocksdbjni-*.so librocksdbjni-*.jnilib + cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class rocksdbjavastaticpublish: rocksdbjavastaticrelease - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx - mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar 
-Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 cd java;$(MAKE) javalib; - rm -f ./java/$(ROCKSDBJNILIB) - $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) - cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) + rm -f ./java/target/$(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class jclean: cd java;$(MAKE) clean; - rm -f $(ROCKSDBJNILIB) jtest: cd java;$(MAKE) sample;$(MAKE) test; diff --git a/java/Makefile b/java/Makefile index a43275c2c..b7d44fc25 100644 --- a/java/Makefile +++ b/java/Makefile @@ -92,11 +92,12 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.WriteOptionsTest\ org.rocksdb.WriteBatchWithIndexTest +MAIN_SRC = src/main/java TEST_SRC = src/test/java OUTPUT = target -# TODO update after moving main classes -MAIN_CLASSES = . +MAIN_CLASSES = $(OUTPUT)/classes TEST_CLASSES = $(OUTPUT)/test-classes +JAVADOC = $(OUTPUT)/apidocs JAVA_TEST_LIBDIR = ./test-libs/ JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar @@ -107,25 +108,25 @@ JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)assertj-core-1.7.0.jar JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR):.:./* clean: - -find . -name "*.class" -exec rm {} \; rm -rf include/* - rm -rf javadoc/* rm -rf test-libs/ rm -rf $(OUTPUT) - rm -rf librocksdbjni* - rm -f rocksdbjni* javadocs: - mkdir -p javadoc; javadoc -d javadoc -sourcepath . 
-subpackages org -exclude org.rocksdb.test + mkdir -p $(JAVADOC) + javadoc -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org javalib: java java_test javadocs java: - javac org/rocksdb/util/*.java org/rocksdb/*.java + mkdir -p $(MAIN_CLASSES) + javac -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java @cp ../HISTORY.md ./HISTORY-CPP.md @rm -f ./HISTORY-CPP.md - javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) + javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) sample: java javac -cp $(ROCKSDB_JAR) RocksDBSample.java @@ -151,13 +152,14 @@ resolve_test_deps: java_test: resolve_test_deps mkdir -p $(TEST_CLASSES) - javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/*.java - javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/test/*.java - javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES) $(TEST_SRC)/org/rocksdb/util/*.java + javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ + $(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) test: java resolve_test_deps java_test - java -ea -Xcheck:jni -Djava.library.path=.:../ -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) + java -ea -Xcheck:jni -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java javac org/rocksdb/benchmark/*.java diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index 5730b1533..158303069 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -18,6 +18,6 @@ export JAVA_HOME=/usr/lib/jvm/java-1.7.0 cd /rocksdb scl enable devtoolset-1.1 'make jclean clean' scl enable devtoolset-1.1 'make -j 4 rocksdbjavastatic' -cp /rocksdb/java/librocksdbjni-* /rocksdb-build -cp /rocksdb/java/rocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh index 75edac526..48d1c28d9 100755 --- a/java/crossbuild/build-linux.sh +++ b/java/crossbuild/build-linux.sh @@ -8,7 +8,7 @@ export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) cd /rocksdb make jclean clean make -j 4 rocksdbjavastatic -cp /rocksdb/java/librocksdbjni-* /rocksdb-build -cp /rocksdb/java/rocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build sudo shutdown -h now diff --git a/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java similarity index 100% rename from java/org/rocksdb/AbstractComparator.java rename to java/src/main/java/org/rocksdb/AbstractComparator.java diff --git a/java/org/rocksdb/AbstractRocksIterator.java b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java similarity index 100% rename from java/org/rocksdb/AbstractRocksIterator.java rename to java/src/main/java/org/rocksdb/AbstractRocksIterator.java diff --git a/java/org/rocksdb/AbstractSlice.java b/java/src/main/java/org/rocksdb/AbstractSlice.java similarity index 100% rename from java/org/rocksdb/AbstractSlice.java rename to 
java/src/main/java/org/rocksdb/AbstractSlice.java diff --git a/java/org/rocksdb/AbstractWriteBatch.java b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java similarity index 100% rename from java/org/rocksdb/AbstractWriteBatch.java rename to java/src/main/java/org/rocksdb/AbstractWriteBatch.java diff --git a/java/org/rocksdb/BackupInfo.java b/java/src/main/java/org/rocksdb/BackupInfo.java similarity index 100% rename from java/org/rocksdb/BackupInfo.java rename to java/src/main/java/org/rocksdb/BackupInfo.java diff --git a/java/org/rocksdb/BackupableDB.java b/java/src/main/java/org/rocksdb/BackupableDB.java similarity index 100% rename from java/org/rocksdb/BackupableDB.java rename to java/src/main/java/org/rocksdb/BackupableDB.java diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/src/main/java/org/rocksdb/BackupableDBOptions.java similarity index 100% rename from java/org/rocksdb/BackupableDBOptions.java rename to java/src/main/java/org/rocksdb/BackupableDBOptions.java diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java similarity index 100% rename from java/org/rocksdb/BlockBasedTableConfig.java rename to java/src/main/java/org/rocksdb/BlockBasedTableConfig.java diff --git a/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java similarity index 100% rename from java/org/rocksdb/BloomFilter.java rename to java/src/main/java/org/rocksdb/BloomFilter.java diff --git a/java/org/rocksdb/BuiltinComparator.java b/java/src/main/java/org/rocksdb/BuiltinComparator.java similarity index 100% rename from java/org/rocksdb/BuiltinComparator.java rename to java/src/main/java/org/rocksdb/BuiltinComparator.java diff --git a/java/org/rocksdb/Checkpoint.java b/java/src/main/java/org/rocksdb/Checkpoint.java similarity index 100% rename from java/org/rocksdb/Checkpoint.java rename to java/src/main/java/org/rocksdb/Checkpoint.java diff --git a/java/org/rocksdb/ChecksumType.java b/java/src/main/java/org/rocksdb/ChecksumType.java similarity index 100% rename from java/org/rocksdb/ChecksumType.java rename to java/src/main/java/org/rocksdb/ChecksumType.java diff --git a/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java similarity index 100% rename from java/org/rocksdb/ColumnFamilyDescriptor.java rename to java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java diff --git a/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java similarity index 100% rename from java/org/rocksdb/ColumnFamilyHandle.java rename to java/src/main/java/org/rocksdb/ColumnFamilyHandle.java diff --git a/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java similarity index 100% rename from java/org/rocksdb/ColumnFamilyOptions.java rename to java/src/main/java/org/rocksdb/ColumnFamilyOptions.java diff --git a/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java similarity index 100% rename from java/org/rocksdb/ColumnFamilyOptionsInterface.java rename to java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java diff --git a/java/org/rocksdb/CompactionStyle.java b/java/src/main/java/org/rocksdb/CompactionStyle.java similarity index 100% rename from java/org/rocksdb/CompactionStyle.java rename to java/src/main/java/org/rocksdb/CompactionStyle.java diff --git a/java/org/rocksdb/Comparator.java 
b/java/src/main/java/org/rocksdb/Comparator.java similarity index 100% rename from java/org/rocksdb/Comparator.java rename to java/src/main/java/org/rocksdb/Comparator.java diff --git a/java/org/rocksdb/ComparatorOptions.java b/java/src/main/java/org/rocksdb/ComparatorOptions.java similarity index 100% rename from java/org/rocksdb/ComparatorOptions.java rename to java/src/main/java/org/rocksdb/ComparatorOptions.java diff --git a/java/org/rocksdb/CompressionType.java b/java/src/main/java/org/rocksdb/CompressionType.java similarity index 100% rename from java/org/rocksdb/CompressionType.java rename to java/src/main/java/org/rocksdb/CompressionType.java diff --git a/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java similarity index 100% rename from java/org/rocksdb/DBOptions.java rename to java/src/main/java/org/rocksdb/DBOptions.java diff --git a/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java similarity index 100% rename from java/org/rocksdb/DBOptionsInterface.java rename to java/src/main/java/org/rocksdb/DBOptionsInterface.java diff --git a/java/org/rocksdb/DirectComparator.java b/java/src/main/java/org/rocksdb/DirectComparator.java similarity index 100% rename from java/org/rocksdb/DirectComparator.java rename to java/src/main/java/org/rocksdb/DirectComparator.java diff --git a/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java similarity index 100% rename from java/org/rocksdb/DirectSlice.java rename to java/src/main/java/org/rocksdb/DirectSlice.java diff --git a/java/org/rocksdb/EncodingType.java b/java/src/main/java/org/rocksdb/EncodingType.java similarity index 100% rename from java/org/rocksdb/EncodingType.java rename to java/src/main/java/org/rocksdb/EncodingType.java diff --git a/java/org/rocksdb/Filter.java b/java/src/main/java/org/rocksdb/Filter.java similarity index 100% rename from java/org/rocksdb/Filter.java rename to java/src/main/java/org/rocksdb/Filter.java diff --git a/java/org/rocksdb/FlushOptions.java b/java/src/main/java/org/rocksdb/FlushOptions.java similarity index 100% rename from java/org/rocksdb/FlushOptions.java rename to java/src/main/java/org/rocksdb/FlushOptions.java diff --git a/java/org/rocksdb/GenericRateLimiterConfig.java b/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java similarity index 100% rename from java/org/rocksdb/GenericRateLimiterConfig.java rename to java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java similarity index 100% rename from java/org/rocksdb/HashLinkedListMemTableConfig.java rename to java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java similarity index 100% rename from java/org/rocksdb/HashSkipListMemTableConfig.java rename to java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java diff --git a/java/org/rocksdb/HistogramData.java b/java/src/main/java/org/rocksdb/HistogramData.java similarity index 100% rename from java/org/rocksdb/HistogramData.java rename to java/src/main/java/org/rocksdb/HistogramData.java diff --git a/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java similarity index 100% rename from java/org/rocksdb/HistogramType.java rename to 
java/src/main/java/org/rocksdb/HistogramType.java diff --git a/java/org/rocksdb/IndexType.java b/java/src/main/java/org/rocksdb/IndexType.java similarity index 100% rename from java/org/rocksdb/IndexType.java rename to java/src/main/java/org/rocksdb/IndexType.java diff --git a/java/org/rocksdb/InfoLogLevel.java b/java/src/main/java/org/rocksdb/InfoLogLevel.java similarity index 100% rename from java/org/rocksdb/InfoLogLevel.java rename to java/src/main/java/org/rocksdb/InfoLogLevel.java diff --git a/java/org/rocksdb/MemTableConfig.java b/java/src/main/java/org/rocksdb/MemTableConfig.java similarity index 100% rename from java/org/rocksdb/MemTableConfig.java rename to java/src/main/java/org/rocksdb/MemTableConfig.java diff --git a/java/org/rocksdb/MergeOperator.java b/java/src/main/java/org/rocksdb/MergeOperator.java similarity index 100% rename from java/org/rocksdb/MergeOperator.java rename to java/src/main/java/org/rocksdb/MergeOperator.java diff --git a/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java similarity index 100% rename from java/org/rocksdb/NativeLibraryLoader.java rename to java/src/main/java/org/rocksdb/NativeLibraryLoader.java diff --git a/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java similarity index 100% rename from java/org/rocksdb/Options.java rename to java/src/main/java/org/rocksdb/Options.java diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/src/main/java/org/rocksdb/PlainTableConfig.java similarity index 100% rename from java/org/rocksdb/PlainTableConfig.java rename to java/src/main/java/org/rocksdb/PlainTableConfig.java diff --git a/java/org/rocksdb/RateLimiterConfig.java b/java/src/main/java/org/rocksdb/RateLimiterConfig.java similarity index 100% rename from java/org/rocksdb/RateLimiterConfig.java rename to java/src/main/java/org/rocksdb/RateLimiterConfig.java diff --git a/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java similarity index 100% rename from java/org/rocksdb/ReadOptions.java rename to java/src/main/java/org/rocksdb/ReadOptions.java diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/src/main/java/org/rocksdb/RestoreBackupableDB.java similarity index 100% rename from java/org/rocksdb/RestoreBackupableDB.java rename to java/src/main/java/org/rocksdb/RestoreBackupableDB.java diff --git a/java/org/rocksdb/RestoreOptions.java b/java/src/main/java/org/rocksdb/RestoreOptions.java similarity index 100% rename from java/org/rocksdb/RestoreOptions.java rename to java/src/main/java/org/rocksdb/RestoreOptions.java diff --git a/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java similarity index 100% rename from java/org/rocksdb/RocksDB.java rename to java/src/main/java/org/rocksdb/RocksDB.java diff --git a/java/org/rocksdb/RocksDBException.java b/java/src/main/java/org/rocksdb/RocksDBException.java similarity index 100% rename from java/org/rocksdb/RocksDBException.java rename to java/src/main/java/org/rocksdb/RocksDBException.java diff --git a/java/org/rocksdb/RocksEnv.java b/java/src/main/java/org/rocksdb/RocksEnv.java similarity index 100% rename from java/org/rocksdb/RocksEnv.java rename to java/src/main/java/org/rocksdb/RocksEnv.java diff --git a/java/org/rocksdb/RocksIterator.java b/java/src/main/java/org/rocksdb/RocksIterator.java similarity index 100% rename from java/org/rocksdb/RocksIterator.java rename to java/src/main/java/org/rocksdb/RocksIterator.java diff --git 
a/java/org/rocksdb/RocksIteratorInterface.java b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java similarity index 100% rename from java/org/rocksdb/RocksIteratorInterface.java rename to java/src/main/java/org/rocksdb/RocksIteratorInterface.java diff --git a/java/org/rocksdb/RocksObject.java b/java/src/main/java/org/rocksdb/RocksObject.java similarity index 100% rename from java/org/rocksdb/RocksObject.java rename to java/src/main/java/org/rocksdb/RocksObject.java diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java similarity index 100% rename from java/org/rocksdb/SkipListMemTableConfig.java rename to java/src/main/java/org/rocksdb/SkipListMemTableConfig.java diff --git a/java/org/rocksdb/Slice.java b/java/src/main/java/org/rocksdb/Slice.java similarity index 100% rename from java/org/rocksdb/Slice.java rename to java/src/main/java/org/rocksdb/Slice.java diff --git a/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java similarity index 100% rename from java/org/rocksdb/Snapshot.java rename to java/src/main/java/org/rocksdb/Snapshot.java diff --git a/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java similarity index 100% rename from java/org/rocksdb/Statistics.java rename to java/src/main/java/org/rocksdb/Statistics.java diff --git a/java/org/rocksdb/StatisticsCollector.java b/java/src/main/java/org/rocksdb/StatisticsCollector.java similarity index 100% rename from java/org/rocksdb/StatisticsCollector.java rename to java/src/main/java/org/rocksdb/StatisticsCollector.java diff --git a/java/org/rocksdb/StatisticsCollectorCallback.java b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java similarity index 100% rename from java/org/rocksdb/StatisticsCollectorCallback.java rename to java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java diff --git a/java/org/rocksdb/StatsCollectorInput.java b/java/src/main/java/org/rocksdb/StatsCollectorInput.java similarity index 100% rename from java/org/rocksdb/StatsCollectorInput.java rename to java/src/main/java/org/rocksdb/StatsCollectorInput.java diff --git a/java/org/rocksdb/StringAppendOperator.java b/java/src/main/java/org/rocksdb/StringAppendOperator.java similarity index 100% rename from java/org/rocksdb/StringAppendOperator.java rename to java/src/main/java/org/rocksdb/StringAppendOperator.java diff --git a/java/org/rocksdb/TableFormatConfig.java b/java/src/main/java/org/rocksdb/TableFormatConfig.java similarity index 100% rename from java/org/rocksdb/TableFormatConfig.java rename to java/src/main/java/org/rocksdb/TableFormatConfig.java diff --git a/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java similarity index 100% rename from java/org/rocksdb/TickerType.java rename to java/src/main/java/org/rocksdb/TickerType.java diff --git a/java/org/rocksdb/TransactionLogIterator.java b/java/src/main/java/org/rocksdb/TransactionLogIterator.java similarity index 100% rename from java/org/rocksdb/TransactionLogIterator.java rename to java/src/main/java/org/rocksdb/TransactionLogIterator.java diff --git a/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java similarity index 100% rename from java/org/rocksdb/TtlDB.java rename to java/src/main/java/org/rocksdb/TtlDB.java diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/src/main/java/org/rocksdb/VectorMemTableConfig.java similarity index 100% rename from java/org/rocksdb/VectorMemTableConfig.java 
rename to java/src/main/java/org/rocksdb/VectorMemTableConfig.java diff --git a/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java similarity index 100% rename from java/org/rocksdb/WBWIRocksIterator.java rename to java/src/main/java/org/rocksdb/WBWIRocksIterator.java diff --git a/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java similarity index 100% rename from java/org/rocksdb/WriteBatch.java rename to java/src/main/java/org/rocksdb/WriteBatch.java diff --git a/java/org/rocksdb/WriteBatchInterface.java b/java/src/main/java/org/rocksdb/WriteBatchInterface.java similarity index 100% rename from java/org/rocksdb/WriteBatchInterface.java rename to java/src/main/java/org/rocksdb/WriteBatchInterface.java diff --git a/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java similarity index 100% rename from java/org/rocksdb/WriteBatchWithIndex.java rename to java/src/main/java/org/rocksdb/WriteBatchWithIndex.java diff --git a/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java similarity index 100% rename from java/org/rocksdb/WriteOptions.java rename to java/src/main/java/org/rocksdb/WriteOptions.java diff --git a/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java similarity index 100% rename from java/org/rocksdb/util/Environment.java rename to java/src/main/java/org/rocksdb/util/Environment.java diff --git a/java/org/rocksdb/util/SizeUnit.java b/java/src/main/java/org/rocksdb/util/SizeUnit.java similarity index 100% rename from java/org/rocksdb/util/SizeUnit.java rename to java/src/main/java/org/rocksdb/util/SizeUnit.java From dd8d5471ea139e6f59c3f4e589260ea74e707ffd Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 22:24:35 +0000 Subject: [PATCH 788/829] Adjustment to NativeLibraryLoader to allow native library to be loaded from either java.library.path or from extracting from the Jar. Means that the test in the build do not need to rely on the Jar, useful when creating similar builds (and executing tests) from Maven --- .../java/org/rocksdb/NativeLibraryLoader.java | 40 ++++++++++++++++--- java/src/main/java/org/rocksdb/RocksDB.java | 6 +-- .../java/org/rocksdb/util/Environment.java | 24 ++++++++--- .../org/rocksdb/util/EnvironmentTest.java | 36 ++++++++--------- 4 files changed, 74 insertions(+), 32 deletions(-) diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index fb09d3600..06ae773cb 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -15,7 +15,9 @@ public class NativeLibraryLoader { private static final NativeLibraryLoader instance = new NativeLibraryLoader(); private static boolean initialized = false; - private static final String sharedLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); + private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = "." 
+ Environment.getJniLibraryExtension(); @@ -28,6 +30,34 @@ public class NativeLibraryLoader { return instance; } + /** + * Firstly attempts to load the library from java.library.path, + * if that fails then it falls back to extracting + * the library from the classpath + * {@link org.rocksdb.NativeLibraryLoader#loadLibraryFromJar(java.lang.String)} + * + * @param tmpDir A temporary directory to use + * to copy the native library to when loading from the classpath. + * If null, or the empty string, we rely on Java's + * {@link java.io.File#createTempFile(String, String)} + * function to provide a temporary location. + * The temporary file will be registered for deletion + * on exit. + * + * @throws java.io.IOException if a filesystem operation fails. + */ + public synchronized void loadLibrary(final String tmpDir) throws IOException { + try { + System.loadLibrary(sharedLibraryName); + } catch(final UnsatisfiedLinkError ule1) { + try { + System.loadLibrary(jniLibraryName); + } catch(final UnsatisfiedLinkError ule2) { + loadLibraryFromJar(tmpDir); + } + } + } + /** * Attempts to extract the native RocksDB library * from the classpath and load it @@ -42,14 +72,14 @@ public class NativeLibraryLoader { * * @throws java.io.IOException if a filesystem operation fails. */ - public synchronized void loadLibraryFromJar(final String tmpDir) + private void loadLibraryFromJar(final String tmpDir) throws IOException { if (!initialized) { final File temp; if (tmpDir == null || tmpDir.equals("")) { temp = File.createTempFile(tempFilePrefix, tempFileSuffix); } else { - temp = new File(tmpDir, sharedLibraryName); + temp = new File(tmpDir, jniLibraryFileName); } if (!temp.exists()) { @@ -60,9 +90,9 @@ public class NativeLibraryLoader { // attempt to copy the library from the Jar file to the temp destination try (final InputStream is = getClass().getClassLoader(). 
- getResourceAsStream(sharedLibraryName)) { + getResourceAsStream(jniLibraryFileName)) { if (is == null) { - throw new RuntimeException(sharedLibraryName + " was not found inside JAR."); + throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); } else { Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); } diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index ea3824196..ed8b05b93 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -44,7 +44,7 @@ public class RocksDB extends RocksObject { } try { - NativeLibraryLoader.getInstance().loadLibraryFromJar(tmpDir); + NativeLibraryLoader.getInstance().loadLibrary(tmpDir); } catch (IOException e) { @@ -66,7 +66,7 @@ public class RocksDB extends RocksObject { } for (String path : paths) { try { - System.load(path + "/" + Environment.getSharedLibraryName( + System.load(path + "/" + Environment.getSharedLibraryFileName( compressionType.getLibraryName())); break; } catch (UnsatisfiedLinkError e) { @@ -78,7 +78,7 @@ public class RocksDB extends RocksObject { UnsatisfiedLinkError err = null; for (String path : paths) { try { - System.load(path + "/" + Environment.getJniLibraryName("rocksdbjni")); + System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; } catch (UnsatisfiedLinkError e) { diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index c121adb17..6b5a9f2c8 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -23,20 +23,32 @@ public class Environment { } public static String getSharedLibraryName(String name) { + return name + "jni"; + } + + public static String getSharedLibraryFileName(String name) { + return appendLibOsSuffix("lib" + getSharedLibraryName(name), true); + } + + public static String getJniLibraryName(final String name) { if (isUnix()) { - return String.format("lib%sjni.so", name); + final String arch = (is64Bit()) ? "64" : "32"; + return String.format("%sjni-linux%s", name, arch); } else if (isMac()) { - return String.format("lib%sjni.dylib", name); + return String.format("%sjni-osx", name); } throw new UnsupportedOperationException(); } - public static String getJniLibraryName(String name) { + public static String getJniLibraryFileName(final String name) { + return appendLibOsSuffix("lib" + getJniLibraryName(name), false); + } + + private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { if (isUnix()) { - String arch = (is64Bit()) ? "64" : "32"; - return String.format("lib%sjni-linux%s.so", name, arch); + return libraryFileName + ".so"; } else if (isMac()) { - return String.format("lib%sjni-osx.jnilib", name); + return libraryFileName + (shared ? ".dylib" : ".jnilib"); } throw new UnsupportedOperationException(); } diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index b8e22bce1..741effebb 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -23,9 +23,9 @@ public class EnvironmentTest { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); - assertThat(Environment.getJniLibraryName("rocksdb")). 
+ assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-osx.jnilib"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @@ -35,9 +35,9 @@ public class EnvironmentTest { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-osx.jnilib"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @@ -48,27 +48,27 @@ public class EnvironmentTest { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // UNIX setEnvironmentClassFields("Unix", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // AIX setEnvironmentClassFields("aix", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); } @@ -78,27 +78,27 @@ public class EnvironmentTest { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // UNIX setEnvironmentClassFields("Unix", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // AIX setEnvironmentClassFields("aix", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - assertThat(Environment.getJniLibraryName("rocksdb")). + assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); - assertThat(Environment.getSharedLibraryName("rocksdb")). + assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.so"); } @@ -111,13 +111,13 @@ public class EnvironmentTest { @Test(expected = UnsupportedOperationException.class) public void failWinJniLibraryName(){ setEnvironmentClassFields("win", "x64"); - Environment.getJniLibraryName("rocksdb"); + Environment.getJniLibraryFileName("rocksdb"); } @Test(expected = UnsupportedOperationException.class) public void failWinSharedLibrary(){ setEnvironmentClassFields("win", "x64"); - Environment.getSharedLibraryName("rocksdb"); + Environment.getSharedLibraryFileName("rocksdb"); } private void setEnvironmentClassFields(String osName, From 157768890c44724d16c9fb7da70580f85f30e354 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 22:42:13 +0000 Subject: [PATCH 789/829] Moved Java Benchmark main classes into benchmark/src/main/java --- java/Makefile | 8 +++++++- .../src/main/java}/org/rocksdb/benchmark/DbBenchmark.java | 0 java/jdb_bench.sh | 5 ++++- 3 files changed, 11 insertions(+), 2 deletions(-) rename java/{ => benchmark/src/main/java}/org/rocksdb/benchmark/DbBenchmark.java (100%) diff --git a/java/Makefile b/java/Makefile index b7d44fc25..821cc4acc 100644 --- a/java/Makefile +++ b/java/Makefile @@ -99,6 +99,10 @@ MAIN_CLASSES = $(OUTPUT)/classes TEST_CLASSES = $(OUTPUT)/test-classes JAVADOC = $(OUTPUT)/apidocs +BENCHMARK_MAIN_SRC = benchmark/src/main/java +BENCHMARK_OUTPUT = benchmark/target +BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes + JAVA_TEST_LIBDIR = ./test-libs/ JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)hamcrest-core-1.3.jar @@ -111,6 +115,7 @@ clean: rm -rf include/* rm -rf test-libs/ rm -rf $(OUTPUT) + rm -rf $(BENCHMARK_OUTPUT) javadocs: @@ -162,4 +167,5 @@ test: java resolve_test_deps java_test java -ea -Xcheck:jni -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java - javac org/rocksdb/benchmark/*.java + mkdir -p $(BENCHMARK_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java similarity index 100% rename from java/org/rocksdb/benchmark/DbBenchmark.java rename to java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh index 92ee6e3db..9665de785 100755 --- a/java/jdb_bench.sh +++ b/java/jdb_bench.sh @@ -3,5 +3,8 @@ if [ `getconf LONG_BIT` != "64" ] then PLATFORM=32 fi + +ROCKS_JAR=`find target -name rocksdbjni*.jar` + echo "Running benchmark in $PLATFORM-Bit mode." 
-java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@ +java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@ From d6187d07bb3355960ce95e09208a12cc439c60ac Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 22:57:35 +0000 Subject: [PATCH 790/829] Maven can now build a standard project layout --- java/rocksjni.pom | 70 ++++------------------------------------------- 1 file changed, 6 insertions(+), 64 deletions(-) diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 69e124c48..242674e3d 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -38,87 +38,29 @@ + 1.7 + 1.7 UTF-8 - - - ${project.basedir} - - - - ${project.basedir} - - **/* - - - - - ${project.basedir} - - - - ${project.basedir} - - *.so - *.jar - *.jnilib - - - org.apache.maven.plugins maven-compiler-plugin 2.0.2 - 1.7 - 1.7 - - - - *.java - - org/rocksdb/benchmark/*.java - - org/rocksdb/test/*.java - org/rocksdb/WriteBatchTest.java - + ${project.build.source} + ${project.build.target} + ${project.build.sourceEncoding} - - - default-testCompile - test-compile - - - - - %regex[org/rocksdb/[^WriteBatchTest].*java] - - *.java - - org/rocksdb/benchmark/*.java - - org/rocksdb/util/*.java - - - - org/rocksdb/test/*.java - - - - testCompile - - - org.apache.maven.plugins maven-surefire-plugin 2.17 - ${argLine} -Xcheck:jni + -ea -Xcheck:jni -Djava.library.path=${project.build.directory} From ad325517fc935de60317eb4de4e28a19a40fed14 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 23:21:19 +0000 Subject: [PATCH 791/829] Update test lib versions and maven plugin versions --- java/Makefile | 20 ++++++++++---------- java/rocksjni.pom | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/java/Makefile b/java/Makefile index 821cc4acc..a1434d494 100644 --- a/java/Makefile +++ b/java/Makefile @@ -103,13 +103,13 @@ BENCHMARK_MAIN_SRC = benchmark/src/main/java BENCHMARK_OUTPUT = benchmark/target BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes -JAVA_TEST_LIBDIR = ./test-libs/ -JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)junit-4.12-beta-2.jar -JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)hamcrest-core-1.3.jar -JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)mockito-all-1.9.5.jar -JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)cglib-2.2.2.jar -JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)assertj-core-1.7.0.jar -JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR):.:./* +JAVA_TEST_LIBDIR = test-libs +JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar +JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar +JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar +JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar +JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) clean: rm -rf include/* @@ -149,11 +149,11 @@ column_family_sample: java resolve_test_deps: mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || curl -k -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12-beta-2/junit-4.12-beta-2.jar + test -s "$(JAVA_JUNIT_JAR)" || curl -k -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar test -s "$(JAVA_HAMCR_JAR)" || curl -k -L 
-o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || curl -k -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.9.5/mockito-all-1.9.5.jar + test -s "$(JAVA_MOCKITO_JAR)" || curl -k -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar test -s "$(JAVA_CGLIB_JAR)" || curl -k -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.0/assertj-core-1.7.0.jar + test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar java_test: resolve_test_deps mkdir -p $(TEST_CLASSES) diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 242674e3d..74676fdf4 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -48,7 +48,7 @@ org.apache.maven.plugins maven-compiler-plugin - 2.0.2 + 3.2 ${project.build.source} ${project.build.target} @@ -58,7 +58,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.17 + 2.18.1 -ea -Xcheck:jni -Djava.library.path=${project.build.directory} @@ -121,19 +121,19 @@ junit junit - 4.12-beta-2 + 4.12 test org.assertj assertj-core - 1.7.0 + 1.7.1 test org.mockito mockito-all - 1.9.5 + 1.10.19 test From f33f3955eee379b29fda02dac1d30ba628669d67 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Sat, 31 Jan 2015 23:43:01 +0000 Subject: [PATCH 792/829] Moved Java Samples main classes into samples/src/main/java --- java/Makefile | 15 +++++++++++---- .../src/main/java}/RocksDBColumnFamilySample.java | 0 .../src/main/java}/RocksDBSample.java | 0 3 files changed, 11 insertions(+), 4 deletions(-) rename java/{ => samples/src/main/java}/RocksDBColumnFamilySample.java (100%) rename java/{ => samples/src/main/java}/RocksDBSample.java (100%) diff --git a/java/Makefile b/java/Makefile index a1434d494..a07afb20a 100644 --- a/java/Makefile +++ b/java/Makefile @@ -103,6 +103,10 @@ BENCHMARK_MAIN_SRC = benchmark/src/main/java BENCHMARK_OUTPUT = benchmark/target BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes +SAMPLES_MAIN_SRC = samples/src/main/java +SAMPLES_OUTPUT = samples/target +SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes + JAVA_TEST_LIBDIR = test-libs JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar @@ -116,6 +120,7 @@ clean: rm -rf test-libs/ rm -rf $(OUTPUT) rm -rf $(BENCHMARK_OUTPUT) + rm -rf $(SAMPLES_OUTPUT) javadocs: @@ -134,17 +139,19 @@ java: javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) sample: java - javac -cp $(ROCKSDB_JAR) RocksDBSample.java + mkdir -p $(SAMPLES_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java @rm -rf /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni_not_found - java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBSample /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni_not_found column_family_sample: java - javac -cp $(ROCKSDB_JAR) RocksDBColumnFamilySample.java + mkdir -p $(SAMPLES_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) 
$(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java @rm -rf /tmp/rocksdbjni - java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBColumnFamilySample /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni resolve_test_deps: diff --git a/java/RocksDBColumnFamilySample.java b/java/samples/src/main/java/RocksDBColumnFamilySample.java similarity index 100% rename from java/RocksDBColumnFamilySample.java rename to java/samples/src/main/java/RocksDBColumnFamilySample.java diff --git a/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java similarity index 100% rename from java/RocksDBSample.java rename to java/samples/src/main/java/RocksDBSample.java From e6eaf938c384d899374f4bab8852413b79dc0662 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 1 Feb 2015 20:34:24 +0000 Subject: [PATCH 793/829] remove old debugging message (#487) It doesn't seem this is needed. --- util/env_posix.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index 9e1e4da5b..4adf58bcc 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1635,10 +1635,6 @@ class PosixEnv : public Env { WakeUpAllThreads(); } PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - // TODO(sdong): temp logging. Need to help debugging. Remove it when - // the feature is proved to be stable. - fprintf(stdout, "Bg thread %zu terminates %llx\n", thread_id, - static_cast(gettid())); break; } void (*function)(void*) = queue_.front().function; From 0b8dec717297b69f1731d12c1b30c14da5032725 Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Mon, 2 Feb 2015 14:49:22 -0800 Subject: [PATCH 794/829] Cross functional test infrastructure for RocksDB. Summary: This Diff provides the implementation of the cross functional test infrastructure. This provides the ability to test a single feature with every existing regression test in order to identify issues with interoperability between features. Test Plan: Reference implementation of inplace update support cross functional test. Able to find interoperability issues with inplace support and ran all of db_test. Will add separate diff for those changes. Reviewers: igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32247 --- db/db_test.cc | 35 +++++++++++--- include/rocksdb/options.h | 3 +- util/xfunc.cc | 27 +++++++++++ util/xfunc.h | 99 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 8 deletions(-) create mode 100644 util/xfunc.cc create mode 100644 util/xfunc.h diff --git a/db/db_test.cc b/db/db_test.cc index 9014c9c86..735ed30f2 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -53,6 +53,7 @@ #include "util/mock_env.h" #include "util/string_util.h" #include "util/thread_status_util.h" +#include "util/xfunc.h" namespace rocksdb { @@ -115,6 +116,9 @@ class AtomicCounter { struct OptionsOverride { std::shared_ptr filter_policy = nullptr; + + // Used as a bit mask of individual enums in which to skip an XF test point + int skip_policy = 0; }; } // namespace anon @@ -564,6 +568,9 @@ class DBTest { const anon::OptionsOverride& options_override = anon::OptionsOverride()) { // this redudant copy is to minimize code change w/o having lint error. 
Options options = defaultOptions; + XFUNC_TEST("", "dbtest_options", inplace_options1, GetXFTestOptions, + reinterpret_cast(&options), + options_override.skip_policy); BlockBasedTableOptions table_options; bool set_block_based_table_factory = true; switch (option_config_) { @@ -1631,8 +1638,10 @@ TEST(DBTest, GetFromVersions) { } TEST(DBTest, GetSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); // Try with both a short key and a long key for (int i = 0; i < 2; i++) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); @@ -2242,7 +2251,9 @@ TEST(DBTest, IterMulti) { // Check that we can skip over a run of user keys // by using reseek rather than sequential scan TEST(DBTest, IterReseek) { - Options options = CurrentOptions(); + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); options.max_sequential_skip_in_iterations = 3; options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); @@ -5699,8 +5710,10 @@ TEST(DBTest, IteratorPinsRef) { } TEST(DBTest, Snapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); Put(0, "foo", "0v1"); Put(1, "foo", "1v1"); @@ -5760,8 +5773,10 @@ TEST(DBTest, Snapshot) { } TEST(DBTest, HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options = CurrentOptions(); + Options options = CurrentOptions(options_override); options.max_background_flushes = 0; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -5798,8 +5813,10 @@ TEST(DBTest, HiddenValuesAreRemoved) { } TEST(DBTest, CompactBetweenSnapshots) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options = CurrentOptions(); + Options options = CurrentOptions(options_override); options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -6908,8 +6925,10 @@ TEST(DBTest, SnapshotFiles) { } TEST(DBTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options = CurrentOptions(); + Options options = CurrentOptions(options_override); options.purge_redundant_kvs_while_flush = true; options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); @@ -7641,12 +7660,14 @@ static void MTThreadBody(void* arg) { } // namespace TEST(DBTest, MultiThreaded) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { std::vector cfs; for (int i = 1; i < kColumnFamilies; ++i) { cfs.push_back(ToString(i)); } - CreateAndReopenWithCF(cfs, CurrentOptions()); + CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); // Initialize state MTState mt; mt.test = this; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 75625abcc..0541a7b34 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -501,7 +501,8 @@ struct ColumnFamilyOptions { // Allows thread-safe inplace updates. 
If this is true, there is no way to // achieve point-in-time consistency using snapshot or iterator (assuming - // concurrent updates). + // concurrent updates). Hence iterator and multi-get will return results + // which are not consistent as of any point-in-time. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable diff --git a/util/xfunc.cc b/util/xfunc.cc new file mode 100644 index 000000000..9a2482272 --- /dev/null +++ b/util/xfunc.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include "rocksdb/options.h" +#include "util/xfunc.h" + +#ifdef XFUNC + +namespace rocksdb { + +std::string XFuncPoint::xfunc_test_; +bool XFuncPoint::initialized_ = false; +bool XFuncPoint::enabled_ = false; + +void GetXFTestOptions(Options* options, int skip_policy) { + if (XFuncPoint::Check("inplace_lock_test") && + (!(skip_policy & kSkipNoSnapshot))) { + options->inplace_update_support = true; + } +} + +} // namespace rocksdb + +#endif // XFUNC diff --git a/util/xfunc.h b/util/xfunc.h new file mode 100644 index 000000000..a48640a11 --- /dev/null +++ b/util/xfunc.h @@ -0,0 +1,99 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include + +namespace rocksdb { + +/* + * If ROCKSDB_XFTEST_FORCE has a value of 1, XFUNC is forced to be defined. + * If ROCKSDB_XFTEST_FORCE has a value other than 1, + * XFUNC is forced to be undefined. + * If ROCKSDB_XFTEST_FORCE is undefined, XFUNC is defined based on NDEBUG, + * with XFUNC only being set for debug builds. + */ +#if defined(ROCKSDB_XFTEST_FORCE) +#if (ROCKSDB_XFTEST_FORCE == 1) +#define XFUNC +#endif +#elif NDEBUG +#else +#define XFUNC +#endif + +#ifndef XFUNC +#define XFUNC_TEST(condition, location, lfname, fname, ...) +#else + +class Options; +void GetXFTestOptions(Options* options, int skip_policy); + +// This class provides the facility to run custom code to test a specific +// feature typically with all existing unit tests. +// A developer could specify cross functional test points in the codebase +// via XFUNC_TEST. +// Each xfunc test represents a position in the execution stream of a thread. +// Whenever that particular piece of code is called, the given cross-functional +// test point is executed. +// eg. on DBOpen, a particular option can be set. +// on Get, a particular option can be set, or a specific check can be invoked. 
+// XFUNC_TEST(TestName, location, lfname, FunctionName, Args) +// Turn on a specific cross functional test by setting the environment variable +// ROCKSDB_XFUNC_TEST + +class XFuncPoint { + public: + // call once at the beginning of a test to get the test name + static void Init() { + char* s = getenv("ROCKSDB_XFUNC_TEST"); + if (s == nullptr) { + xfunc_test_ = ""; + enabled_ = false; + } else { + xfunc_test_ = s; + enabled_ = true; + } + initialized_ = true; + } + + static bool Initialized() { return initialized_; } + + static bool Check(std::string test) { + return (enabled_ && + ((test.compare("") == 0) || (test.compare(xfunc_test_) == 0))); + } + + private: + static std::string xfunc_test_; + static bool initialized_; + static bool enabled_; +}; + +// Use XFUNC_TEST to specify cross functional test points inside the code base. +// By setting ROCKSDB_XFUNC_TEST, all XFUNC_TEST having that +// value in the condition field will be executed. +// The second argument specifies a string representing the calling location +// The third argument, lfname, is the name of the function which will be created +// and called. +// The fourth argument fname represents the function to be called +// The arguments following that are the arguments to fname +// See Options::Options in options.h for an example use case. +// XFUNC_TEST is no op in release build. +#define XFUNC_TEST(condition, location, lfname, fname, ...) \ + { \ + if (!XFuncPoint::Initialized()) { \ + XFuncPoint::Init(); \ + } \ + if (XFuncPoint::Check(condition)) { \ + std::function lfname = std::bind(fname, __VA_ARGS__); \ + lfname(); \ + } \ + } + +#endif // XFUNC + +enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; +} // namespace rocksdb From 2c2d5ab7e86c387c538f3a9eda8b9cc9408df54c Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Mon, 2 Feb 2015 15:20:19 -0800 Subject: [PATCH 795/829] Fix compile warning in util/xfunc.h Summary: ./util/xfunc.h:31:1: error: class 'Options' was previously declared as a struct [-Werror,-Wmismatched-tags] class Options; ^ Test Plan: make dbg -j32 --- util/xfunc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/xfunc.h b/util/xfunc.h index a48640a11..51122b7aa 100644 --- a/util/xfunc.h +++ b/util/xfunc.h @@ -28,7 +28,7 @@ namespace rocksdb { #define XFUNC_TEST(condition, location, lfname, fname, ...) #else -class Options; +struct Options; void GetXFTestOptions(Options* options, int skip_policy); // This class provides the facility to run custom code to test a specific From 8d3819369f9704b705053a1f2112dbf1d7036c20 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 2 Feb 2015 22:29:43 -0800 Subject: [PATCH 796/829] NewIteratorWithBase() for default column family Summary: I'm moving mongo to a single column family, so I need DeltaBase iterator with default column family. 
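A rough usage sketch of the new overload (not part of this patch; the database path and key names are illustrative, error handling is omitted, and the WriteBatchWithIndex must be constructed with overwrite_key == true, which this overload asserts):

    #include <memory>
    #include "rocksdb/comparator.h"
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/write_batch_with_index.h"

    using namespace rocksdb;

    int main() {
      DB* db;
      Options opts;
      opts.create_if_missing = true;
      DB::Open(opts, "/tmp/wbwi_example", &db);  // status check omitted in this sketch

      // overwrite_key = true is required before NewIteratorWithBase() may be used.
      WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
      batch.Put("key1", "value1");  // buffered in the batch, not yet written to the DB

      // The returned iterator merges the batch contents with the DB contents for the
      // default column family and takes ownership of the passed-in base iterator.
      std::unique_ptr<Iterator> iter(
          batch.NewIteratorWithBase(db->NewIterator(ReadOptions())));
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        // "key1" from the batch is visible here alongside keys already in the DB.
      }

      iter.reset();
      delete db;
      return 0;
    }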
Test Plan: Added unit test Reviewers: sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32589 --- .../utilities/write_batch_with_index.h | 2 ++ .../write_batch_with_index.cc | 16 +++++++-- .../write_batch_with_index_test.cc | 33 +++++++++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index ee5ec198e..566934b70 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -106,6 +106,8 @@ class WriteBatchWithIndex { // base_iterator as base Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, Iterator* base_iterator); + // default column family + Iterator* NewIteratorWithBase(Iterator* base_iterator); private: struct Rep; diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 761b955a1..160e7dac7 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -317,9 +317,9 @@ struct WriteBatchIndexEntry { class WriteBatchEntryComparator { public: - WriteBatchEntryComparator(const Comparator* default_comparator, + WriteBatchEntryComparator(const Comparator* _default_comparator, const ReadableWriteBatch* write_batch) - : default_comparator_(default_comparator), write_batch_(write_batch) {} + : default_comparator_(_default_comparator), write_batch_(write_batch) {} // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b int operator()(const WriteBatchIndexEntry* entry1, @@ -333,6 +333,8 @@ class WriteBatchEntryComparator { cf_comparator_map_[column_family_id] = comparator; } + const Comparator* default_comparator() { return default_comparator_; } + private: const Comparator* default_comparator_; std::unordered_map cf_comparator_map_; @@ -590,6 +592,16 @@ Iterator* WriteBatchWithIndex::NewIteratorWithBase( GetColumnFamilyUserComparator(column_family)); } +Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) { + if (rep->overwrite_key == false) { + assert(false); + return nullptr; + } + // default column family's comparator + return new BaseDeltaIterator(base_iterator, NewIterator(), + rep->comparator.default_comparator()); +} + void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { rep->SetLastEntryOffset(); diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index b573699db..f5d6a55a3 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -842,6 +842,39 @@ TEST(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) { iter->Seek("a"); AssertIter(iter.get(), "a", "aa"); } + + // default column family + batch.Put("a", "b"); + { + KVMap map; + map["b"] = ""; + std::unique_ptr iter(batch.NewIteratorWithBase(new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "b"); + iter->Next(); + AssertIter(iter.get(), "b", ""); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "b", ""); + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + iter->Prev(); + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "b", ""); + + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + + iter->Seek("0"); + AssertIter(iter.get(), "a", "b"); + } } } // namespace From b04408c47bb7ced38df59c7c66c8bd4942939323 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 3 Feb 2015 00:32:11 -0600 Subject: [PATCH 797/829] Fix unity build Summary: I broke it with https://github.com/facebook/rocksdb/commit/2fd8f750ab05bd100b627f1e043603d1069246ed Test Plan: make unity Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32577 --- .gitignore | 1 + util/mock_env.cc | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index dfd3f4924..34d1ee2e2 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ coverage/COVERAGE_REPORT .gdbhistory package/ .phutil_module_cache +unity tags java/out diff --git a/util/mock_env.cc b/util/mock_env.cc index 856d73d92..41aeafaaf 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -168,13 +168,13 @@ class MemFile { namespace { -class SequentialFileImpl : public SequentialFile { +class MockSequentialFile : public SequentialFile { public: - explicit SequentialFileImpl(MemFile* file) : file_(file), pos_(0) { + explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { file_->Ref(); } - ~SequentialFileImpl() { + ~MockSequentialFile() { file_->Unref(); } @@ -203,13 +203,13 @@ class SequentialFileImpl : public SequentialFile { size_t pos_; }; -class RandomAccessFileImpl : public RandomAccessFile { +class MockRandomAccessFile : public RandomAccessFile { public: - explicit RandomAccessFileImpl(MemFile* file) : file_(file) { + explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } - ~RandomAccessFileImpl() { + ~MockRandomAccessFile() { file_->Unref(); } @@ -222,15 +222,15 @@ class RandomAccessFileImpl : public RandomAccessFile { MemFile* file_; }; -class WritableFileImpl : public WritableFile { +class MockWritableFile : public WritableFile { public: - WritableFileImpl(MemFile* file, RateLimiter* rate_limiter) + MockWritableFile(MemFile* file, RateLimiter* rate_limiter) : file_(file), rate_limiter_(rate_limiter) { file_->Ref(); } - ~WritableFileImpl() { + ~MockWritableFile() { file_->Unref(); } @@ -424,7 +424,7 @@ Status MockEnv::NewSequentialFile(const std::string& fname, if (f->is_lock_file()) { return Status::InvalidArgument(fn, "Cannot open a lock file."); } - result->reset(new SequentialFileImpl(f)); + result->reset(new MockSequentialFile(f)); return Status::OK(); } @@ -441,7 +441,7 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname, if (f->is_lock_file()) { return Status::InvalidArgument(fn, "Cannot open a lock file."); } - result->reset(new RandomAccessFileImpl(f)); + result->reset(new MockRandomAccessFile(f)); return Status::OK(); } @@ -457,7 +457,7 @@ Status MockEnv::NewWritableFile(const std::string& fname, file->Ref(); file_map_[fn] = file; - result->reset(new WritableFileImpl(file, env_options.rate_limiter)); + result->reset(new MockWritableFile(file, env_options.rate_limiter)); return Status::OK(); } @@ -613,7 +613,7 @@ Status MockEnv::NewLogger(const std::string& fname, } else { file = iter->second; } - std::unique_ptr f(new WritableFileImpl(file, nullptr)); + std::unique_ptr f(new MockWritableFile(file, nullptr)); result->reset(new TestMemLogger(std::move(f), this)); return Status::OK(); } 
From 829363b449fc6f0f9c973f530222f5767c625704 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 2 Feb 2015 11:09:21 -0800 Subject: [PATCH 798/829] Options::PrepareForBulkLoad() to increase parallelism of flushes Summary: Increasing parallelism of flushes will help bulk load throughput. Test Plan: Compile it. Reviewers: MarkCallaghan, yhchiang, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32685 --- util/options.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/util/options.cc b/util/options.cc index 75307f13f..69aca5ab1 100644 --- a/util/options.cc +++ b/util/options.cc @@ -505,6 +505,15 @@ Options::PrepareForBulkLoad() // increasing the total time needed for compactions. num_levels = 2; + // Need to allow more write buffers to allow more parallism + // of flushes. + max_write_buffer_number = 6; + min_write_buffer_number_to_merge = 1; + + // When compaction is disabled, more parallel flush threads can + // help with write throughput. + max_background_flushes = 4; + // Prevent a memtable flush to automatically promote files // to L1. This is helpful so that all files that are // input to the manual compaction are all at L0. From 9898f639889a82490861e0bb3006ac7d2b0e1a59 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 3 Feb 2015 09:47:29 -0800 Subject: [PATCH 799/829] Divide test DBIteratorTest.DBIterator to smaller tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: When building on my host, I saw warning: In file included from db/db_iter_test.cc:17:0: db/db_iter_test.cc: In member function ‘void rocksdb::_Test_DBIterator::_Run()’: ./util/testharness.h:147:14: note: variable tracking size limit exceeded with -fvar-tracking-assignments, retrying without void TCONCAT(_Test_,name)::_Run() ^ ./util/testharness.h:134:23: note: in definition of macro ‘TCONCAT1’ #define TCONCAT1(a,b) a##b ^ ./util/testharness.h:147:6: note: in expansion of macro ‘TCONCAT’ void TCONCAT(_Test_,name)::_Run() ^ db/db_iter_test.cc:589:1: note: in expansion of macro ‘TEST’ TEST(DBIteratorTest, DBIterator) { ^ By dividing the test into small tests, it should fix the problem Test Plan: Run the test Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32679 --- db/db_iter_test.cc | 1373 ++++++++++++++++++++++---------------------- 1 file changed, 680 insertions(+), 693 deletions(-) diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index e06900010..f045d7798 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -586,832 +586,819 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { } } -TEST(DBIteratorTest, DBIterator) { +TEST(DBIteratorTest, DBIterator1) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + 
ASSERT_EQ(db_iter->key().ToString(), "b"); +} + +TEST(DBIteratorTest, DBIterator2) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST(DBIteratorTest, DBIterator3) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} +TEST(DBIteratorTest, DBIterator4) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0,1"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST(DBIteratorTest, DBIterator5) { Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), 
BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 1, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), 
internal_iter, 4, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 3, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0,1"); - db_iter->Next(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "2"); - db_iter->Next(); + ASSERT_EQ(db_iter->value().ToString(), "put_1"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), 
internal_iter, 3, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", 
"merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 3, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); 
- ASSERT_EQ(db_iter->value().ToString(), "merge_4"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } 
+} - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); +TEST(DBIteratorTest, DBIterator6) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 1, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); + 
ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 3, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "val,merge_2"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); 
+ internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } +} - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); +TEST(DBIteratorTest, DBIterator7) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", 
"merge_5"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "val,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 5, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4"); - db_iter->Prev(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", 
"merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 6, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", 
"merge_3"); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 7, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 9, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - 
ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 7, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 9, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), - "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - 
ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 13, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 14, - options.max_sequential_skip_in_iterations)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), - "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddDeletion("a"); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + internal_iter->AddDeletion("b"); + 
internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 14, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "0"); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); db_iter->Prev(); ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } } +TEST(DBIteratorTest, DBIterator8) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddDeletion("a"); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); +} } // namespace rocksdb From dad98dd4ae2a9dccec5297c104a9885c3912264e Mon Sep 17 00:00:00 2001 From: Venkatesh Radhakrishnan Date: Tue, 3 Feb 2015 12:19:56 -0800 Subject: [PATCH 800/829] Changes for supporting cross functional tests for inplace_update Summary: This diff containes the changes to the code and db_test for supporting cross functional tests for inplace_update Test Plan: Run XF with inplace_test and also without Reviewers: igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32367 --- db/db_test.cc | 10 +++++++--- db/memtable.h | 5 ++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 735ed30f2..5ae209ad6 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2432,8 +2432,10 @@ TEST(DBTest, IterPrevMaxSkip) { } TEST(DBTest, IterWithSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); ASSERT_OK(Put(1, "key1", "val1")); ASSERT_OK(Put(1, "key2", "val2")); ASSERT_OK(Put(1, "key3", "val3")); @@ -8098,9 +8100,11 @@ static bool CompareIterators(int step, } TEST(DBTest, Randomized) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; Random rnd(test::RandomSeed()); do { - ModelDB model(CurrentOptions()); + ModelDB model(CurrentOptions(options_override)); const int N = 10000; const Snapshot* model_snap 
= nullptr; const Snapshot* db_snap = nullptr; @@ -8170,7 +8174,7 @@ TEST(DBTest, Randomized) { if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - auto options = CurrentOptions(); + auto options = CurrentOptions(options_override); Reopen(options); ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); diff --git a/db/memtable.h b/db/memtable.h index 0c1f0de1a..f3befce7d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -196,7 +196,10 @@ class MemTable { } // return true if the current MemTableRep supports snapshots. - bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); } + // inplace update prevents snapshots, + bool IsSnapshotSupported() const { + return table_->IsSnapshotSupported() && !moptions_.inplace_update_support; + } // Get the lock associated for the key port::RWMutex* GetLock(const Slice& key); From 756e1f151e4768a3adfbe067849d25130f4708ca Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 3 Feb 2015 17:53:05 -0800 Subject: [PATCH 801/829] Remove unused util/thread_event_info.h Summary: Remove unused util/thread_event_info.h, which is replaced by util/thread_operation.h Test Plan: make dbg -j32 make release -j32 --- util/thread_event_info.h | 71 ---------------------------------------- 1 file changed, 71 deletions(-) delete mode 100644 util/thread_event_info.h diff --git a/util/thread_event_info.h b/util/thread_event_info.h deleted file mode 100644 index 28916deb4..000000000 --- a/util/thread_event_info.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// This file defines the structures for thread event and operation. -// Thread events are used to describe high level action of a -// thread such as doing compaction or flush, while thread operation -// are used to describe lower-level action such as reading / -// writing a file or waiting for a mutex. Events and operations -// are designed to be independent. Typically, a thread usually involves -// in one event and one operation at any specific point in time. - -#pragma once - -#include "include/rocksdb/thread_status.h" - -#include - -namespace rocksdb { - -#if ROCKSDB_USING_THREAD_STATUS - -// The structure that describes a major thread event. -struct EventInfo { - const ThreadStatus::EventType code; - const std::string name; -}; - -// The global event table. -// -// When updating a status of a thread, the pointer of the EventInfo -// of the current ThreadStatusData will be pointing to one of the -// rows in this global table. -// -// Note that it's not designed to be constant as in the future we -// might consider adding global count to the EventInfo. -static EventInfo global_event_table[] = { - {ThreadStatus::EVENT_UNKNOWN, ""}, - {ThreadStatus::EVENT_COMPACTION, "Compaction"}, - {ThreadStatus::EVENT_FLUSH, "Flush"} -}; - -// The structure that describes a operation. -struct OperationInfo { - const ThreadStatus::OperationType code; - const std::string name; -}; - -// The global operation table. -// -// When updating a status of a thread, the pointer of the OperationInfo -// of the current ThreadStatusData will be pointing to one of the -// rows in this global table. 
-static OperationInfo global_operation_table[] = { - {ThreadStatus::OPERATION_UNKNOWN, ""}, - {ThreadStatus::OPERATION_WRITE_FILE, "Writing SST file"}, - {ThreadStatus::OPERATION_READ_FILE, "Reaing SST file"}, - {ThreadStatus::OPERATION_WAIT_DB_MUTEX, "Waiting DB Mutex"} -}; - -#else - -struct EventInfo { -}; - -struct OperationInfo { -}; - -#endif // ROCKSDB_USING_THREAD_STATUS -} // namespace rocksdb From 4d98e29352f36c023e6b1297c87ceb2ece000c98 Mon Sep 17 00:00:00 2001 From: Igor Sugak Date: Tue, 3 Feb 2015 21:43:06 -0800 Subject: [PATCH 802/829] rocksdb: Enable scan-build static analysis Summary: Added new target ##make analyze## into Makefile. This command runs clang static analyzer and builds the sources as ##make all##. The result report is put into ##$(RocksDbSourceRoot)/can_build_report/## If the development environment is a Facebook devserver and ##ROCKSDB_NO_FBCODE## is not set, then scan-build is used from fbcode. If it is run not on a Facebook devserver, scan-build should be available in ##$PATH##. I'll add details to wiki how to install scan-build on a non Facebook devserver environment. Test Plan: Run the fallowing commands on a Facebook devserver and Mac OS, and ensure no build or test errors. ``` % make all check -j32 % make clean % USE_CLANG=1 make all -j32 % make analyze % USE_CLANG=1 make analyze ``` Reviewers: sdong, lgalanis, leveldb, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32799 --- Makefile | 8 +++++++- build_tools/build_detect_platform | 10 ++++++++++ build_tools/fbcode_config.sh | 7 +++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 1c0dea975..215ecb8a7 100644 --- a/Makefile +++ b/Makefile @@ -238,7 +238,7 @@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ - dbg rocksdbjavastatic rocksdbjava install uninstall + dbg rocksdbjavastatic rocksdbjava install uninstall analyze all: $(LIBRARY) $(PROGRAMS) $(TESTS) @@ -303,6 +303,10 @@ valgrind_check: all $(PROGRAMS) $(TESTS) echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \ done +analyze: + $(MAKE) clean + $(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) -o $(CURDIR)/scan_build_report $(MAKE) all -j32 + unity.cc: $(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/unity" "$(CURDIR)/unity.cc")) @@ -733,9 +737,11 @@ ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) ifneq ($(MAKECMDGOALS),package) +ifneq ($(MAKECMDGOALS),analyze) -include $(DEPFILES) endif endif endif endif endif +endif diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index b314b3acb..9c8bd7d2a 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -75,6 +75,14 @@ if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` fi +if test -z "$CLANG_SCAN_BUILD"; then + CLANG_SCAN_BUILD=scan-build +fi + +if test -z "$CLANG_ANALYZER"; then + CLANG_ANALYZER=$(which clang++) +fi + COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" CROSS_COMPILE= PLATFORM_CCFLAGS= @@ -348,3 +356,5 @@ echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT" echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" +echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" +echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" diff --git 
a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index afc3de40b..2b40e3412 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -77,6 +77,10 @@ DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_I GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4" STDLIBS="-L $GCC_BASE/lib64" +CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev" +CLANG_ANALYZER="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++" +CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build" + if [ -z "$USE_CLANG" ]; then # gcc CC="$GCC_BASE/bin/gcc" @@ -87,7 +91,6 @@ if [ -z "$USE_CLANG" ]; then CFLAGS+=" -isystem $LIBGCC_INCLUDE" else # clang - CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev" CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/" CC="$CLANG_BASE/centos6-native/af4b1a0/bin/clang" CXX="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++" @@ -119,4 +122,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/" -export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD From 678503ebcfc133442e69bace54e517296071c1ac Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 4 Feb 2015 01:47:32 -0800 Subject: [PATCH 803/829] Add utility functions for interpreting ThreadStatus Summary: Add ThreadStatus::GetOperationName() and ThreadStatus::GetStateName(), two utility functions that help interpreting ThreadStatus. Test Plan: ./thread_list_test Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32793 --- include/rocksdb/thread_status.h | 10 +++++++++ util/thread_list_test.cc | 6 ++++++ util/thread_status_impl.cc | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 util/thread_status_impl.cc diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index 57a87a21a..539321291 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -89,6 +89,16 @@ struct ThreadStatus { // The state (lower-level action) that the current thread is involved. const StateType state_type; + + // The followings are a set of utility functions for interpreting + // the information of ThreadStatus + + // Obtain the name of an operation given its type. + static const std::string& GetOperationName(OperationType op_type); + + // Obtain the name of a state given its type. + static const std::string& GetStateName(StateType state_type); }; + } // namespace rocksdb diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index 86ce1c4d9..737b78fe3 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -98,10 +98,16 @@ TEST(ThreadListTest, EventTables) { // verify the global tables for operations and states are properly indexed. 
for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { ASSERT_EQ(global_operation_table[type].type, type); + ASSERT_EQ(global_operation_table[type].name, + ThreadStatus::GetOperationName( + ThreadStatus::OperationType(type))); } for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { ASSERT_EQ(global_state_table[type].type, type); + ASSERT_EQ(global_state_table[type].name, + ThreadStatus::GetStateName( + ThreadStatus::StateType(type))); } } diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc new file mode 100644 index 000000000..faeadf302 --- /dev/null +++ b/util/thread_status_impl.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/thread_status.h" +#include "util/thread_operation.h" + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType op_type) { + return global_operation_table[op_type].name; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType state_type) { + return global_state_table[state_type].name; +} + +#else + +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType op_type) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType state_type) { + static std::string dummy_str = ""; + return dummy_str; +} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb From e63140d52baff154a502452aa94baef5bcec87b6 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 2 Feb 2015 17:42:57 -0800 Subject: [PATCH 804/829] Get() to use prefix bloom filter when filter is not block based Summary: Get() now doesn't make use of bloom filter if it is prefix based. Add the check. Didn't touch block based bloom filter. I can't fully reason whether it is correct to do that. But it's straight-forward to for full bloom filter. Test Plan: make all check Add a test case in DBTest Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: MarkCallaghan, leveldb, dhruba, yoshinorim Differential Revision: https://reviews.facebook.net/D31941 --- HISTORY.md | 1 + db/db_test.cc | 35 +++++++++++++++++++++++++++++++ table/block_based_table_reader.cc | 20 ++++++++++++++++-- table/block_based_table_reader.h | 3 +++ 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c688585e5..3502df3ea 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * GetThreadStatus() is now able to report compaction activity. * MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). * Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash. +* Block based table now makes use of prefix bloom filter if it is a full fulter. 
### Public API changes * Deprecated skip_log_error_on_recovery option diff --git a/db/db_test.cc b/db/db_test.cc index 5ae209ad6..4271e3281 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1976,6 +1976,41 @@ TEST(DBTest, FilterDeletes) { } while (ChangeCompactOptions()); } +TEST(DBTest, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + dbfull()->Flush(fo); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); +} TEST(DBTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 8747d83d7..f03ab2b4b 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -1104,6 +1104,23 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, NewIndexIterator(read_options), arena); } +bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter, + const Slice& internal_key) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + if (!filter->KeyMayMatch(user_key)) { + return false; + } + if (rep_->ioptions.prefix_extractor && + !filter->PrefixMayMatch( + rep_->ioptions.prefix_extractor->Transform(user_key))) { + return false; + } + return true; +} + Status BlockBasedTable::Get( const ReadOptions& read_options, const Slice& key, GetContext* get_context) { @@ -1113,8 +1130,7 @@ Status BlockBasedTable::Get( // First check the full filter // If full filter not useful, Then go into each block - if (filter != nullptr && !filter->IsBlockBased() - && !filter->KeyMayMatch(ExtractUserKey(key))) { + if (!FullFilterKeyMayMatch(filter, key)) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); } else { BlockIter iiter; diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 2902aa441..e3594cf7c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -180,6 +180,9 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(IndexReader** index_reader, Iterator* preloaded_meta_index_iter = nullptr); + bool FullFilterKeyMayMatch(FilterBlockReader* filter, + const Slice& user_key) const; + // Read the meta block from sst. 
static Status ReadMetaBlock( Rep* rep, From e39f4f6cf9d63b14c0f23bee222540a523698c36 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 4 Feb 2015 16:04:51 -0800 Subject: [PATCH 805/829] Fix data race #3 Summary: Added requirement that ComputeCompactionScore() be executed in mutex, since it's accessing being_compacted bool, which can be mutated by other threads. Also added more comments about thread safety of FileMetaData, since it was a bit confusing. However, it seems that FileMetaData doesn't have data races (except being_compacted) Test Plan: Ran 100 ConvertCompactionStyle tests with thread sanitizer. On master -- some failures. With this patch -- none. Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32283 --- db/compaction_picker.cc | 21 ++-------------- db/compaction_picker.h | 7 ------ db/compaction_picker_test.cc | 2 -- db/version_builder_test.cc | 2 -- db/version_edit.h | 4 ++- db/version_set.cc | 47 +++++++++++++++--------------------- db/version_set.h | 13 ++++------ 7 files changed, 30 insertions(+), 66 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 70be388c9..f74e63436 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -23,6 +23,7 @@ namespace rocksdb { +namespace { uint64_t TotalCompensatedFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { @@ -31,7 +32,6 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { return sum; } -namespace { // Determine compression type, based on user options, level of the output // file and whether compression is disabled. // If enable_compression is false, then compression is always disabled no @@ -71,19 +71,6 @@ CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, CompactionPicker::~CompactionPicker() {} -void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels() - 1; level++) { - uint64_t total = 0; - for (auto c : compactions_in_progress_[level]) { - assert(c->level() == level); - for (size_t i = 0; i < c->num_input_files(0); i++) { - total += c->input(0, i)->compensated_file_size; - } - } - sizes[level] = total; - } -} - // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { @@ -763,13 +750,9 @@ Compaction* LevelCompactionPicker::PickCompaction( // being compacted). 
Since we just changed compaction score, we recalculate it // here { // this piece of code recomputes compaction score - std::vector size_being_compacted(NumberLevels() - 1); - SizeBeingCompacted(size_being_compacted); - CompactionOptionsFIFO dummy_compaction_options_fifo; vstorage->ComputeCompactionScore(mutable_cf_options, - dummy_compaction_options_fifo, - size_being_compacted); + dummy_compaction_options_fifo); } return c; diff --git a/db/compaction_picker.h b/db/compaction_picker.h index ad72e609a..7cc58d66b 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -91,10 +91,6 @@ class CompactionPicker { // Free up the files that participated in a compaction void ReleaseCompactionFiles(Compaction* c, Status status); - // Return the total amount of data that is undergoing - // compactions per level - void SizeBeingCompacted(std::vector& sizes); - // Returns true if any one of the specified files are being compacted bool FilesInCompaction(const std::vector& files); @@ -314,7 +310,4 @@ class NullCompactionPicker : public CompactionPicker { }; #endif // !ROCKSDB_LITE -// Utility function -extern uint64_t TotalCompensatedFileSize(const std::vector& files); - } // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 5ffc74f0d..dd5c06a54 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -83,8 +83,6 @@ class CompactionPickerTest { } void UpdateVersionStorageInfo() { - vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_, - size_being_compacted_); vstorage_->UpdateFilesBySize(); vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 5da73cbc3..a48b4e3a2 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -74,8 +74,6 @@ class VersionBuilderTest { } void UpdateVersionStorageInfo() { - vstorage_.ComputeCompactionScore(mutable_cf_options_, fifo_options_, - size_being_compacted_); vstorage_.UpdateFilesBySize(); vstorage_.UpdateNumNonEmptyLevels(); vstorage_.GenerateFileIndexer(); diff --git a/db/version_edit.h b/db/version_edit.h index 35b894954..6f7a692f3 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -76,8 +76,10 @@ struct FileMetaData { // File size compensated by deletion entry. // This is updated in Version::UpdateAccumulatedStats() first time when the - // file is created or loaded. After it is updated, it is immutable. + // file is created or loaded. After it is updated (!= 0), it is immutable. uint64_t compensated_file_size; + // These values can mutate, but they can only be read or written from + // single-threaded LogAndApply thread uint64_t num_entries; // the number of entries. uint64_t num_deletions; // the number of deletion entries. uint64_t raw_key_size; // total uncompressed key size. 
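The thread-safety rule this change leans on is that being_compacted may only be read or written while the DB mutex is held; ComputeCompactionScore() (reworked in db/version_set.cc below) therefore drops the separately computed "size being compacted" vector and instead makes a single pass that skips files under compaction. A minimal sketch of that pass, using hypothetical stand-in types rather than the real FileMetaData:

#include <cstdint>
#include <vector>

struct FileInfo {
  uint64_t compensated_file_size = 0;
  bool being_compacted = false;  // only mutated while the DB mutex is held
};

// REQUIRES: DB mutex held, so being_compacted cannot flip underneath us.
uint64_t LevelBytesNotBeingCompacted(const std::vector<FileInfo*>& files) {
  uint64_t total = 0;
  for (const FileInfo* f : files) {
    if (f != nullptr && !f->being_compacted) {
      total += f->compensated_file_size;
    }
  }
  return total;
}

Because the flag is only ever mutated under that same mutex, the read needs no per-file locking or atomics.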
diff --git a/db/version_set.cc b/db/version_set.cc index a0af2decc..3b6d57f55 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -850,12 +850,8 @@ void VersionStorageInfo::GenerateLevelFilesBrief() { } } -void Version::PrepareApply(const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted) { +void Version::PrepareApply() { UpdateAccumulatedStats(); - storage_info_.ComputeCompactionScore( - mutable_cf_options, cfd_->ioptions()->compaction_options_fifo, - size_being_compacted); storage_info_.UpdateFilesBySize(); storage_info_.UpdateNumNonEmptyLevels(); storage_info_.GenerateFileIndexer(); @@ -947,7 +943,9 @@ void VersionStorageInfo::ComputeCompensatedSizes() { for (int level = 0; level < num_levels_; level++) { for (auto* file_meta : files_[level]) { // Here we only compute compensated_file_size for those file_meta - // which compensated_file_size is uninitialized (== 0). + // which compensated_file_size is uninitialized (== 0). This is true only + // for files that have been created right now and no other thread has + // access to them. That's why we can safely mutate compensated_file_size. if (file_meta->compensated_file_size == 0) { file_meta->compensated_file_size = file_meta->fd.GetFileSize() + file_meta->num_deletions * average_value_size * @@ -966,8 +964,7 @@ int VersionStorageInfo::MaxInputLevel() const { void VersionStorageInfo::ComputeCompactionScore( const MutableCFOptions& mutable_cf_options, - const CompactionOptionsFIFO& compaction_options_fifo, - std::vector& size_being_compacted) { + const CompactionOptionsFIFO& compaction_options_fifo) { double max_score = 0; int max_score_level = 0; @@ -1008,9 +1005,13 @@ void VersionStorageInfo::ComputeCompactionScore( } } else { // Compute the ratio of current size to size limit. - const uint64_t level_bytes = - TotalCompensatedFileSize(files_[level]) - size_being_compacted[level]; - score = static_cast(level_bytes) / + uint64_t level_bytes_no_compacting = 0; + for (auto f : files_[level]) { + if (f && f->being_compacted == false) { + level_bytes_no_compacting += f->compensated_file_size; + } + } + score = static_cast(level_bytes_no_compacting) / mutable_cf_options.MaxBytesForLevel(level); if (max_score < score) { max_score = score; @@ -1527,6 +1528,11 @@ VersionSet::~VersionSet() { void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { + // compute new compaction score + v->storage_info()->ComputeCompactionScore( + *column_family_data->GetLatestMutableCFOptions(), + column_family_data->ioptions()->compaction_options_fifo); + // Mark v finalized v->storage_info_.SetFinalized(); @@ -1637,13 +1643,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. { - std::vector size_being_compacted; - if (!edit->IsColumnFamilyManipulation()) { - size_being_compacted.resize(v->storage_info()->num_levels() - 1); - // calculate the amount of data being compacted at every level - column_family_data->compaction_picker()->SizeBeingCompacted( - size_being_compacted); - } mu->Unlock(); @@ -1674,7 +1673,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!edit->IsColumnFamilyManipulation()) { // This is cpu-heavy operations, which should be called outside mutex. 
- v->PrepareApply(mutable_cf_options, size_being_compacted); + v->PrepareApply(); } // Write new record to MANIFEST log @@ -2097,10 +2096,7 @@ Status VersionSet::Recover( builder->SaveTo(v->storage_info()); // Install recovered version - std::vector size_being_compacted( - v->storage_info()->num_levels() - 1); - cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); + v->PrepareApply(); AppendVersion(cfd, v); } @@ -2434,10 +2430,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, Version* v = new Version(cfd, this, current_version_number_++); builder->SaveTo(v->storage_info()); - std::vector size_being_compacted( - v->storage_info()->num_levels() - 1); - cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), size_being_compacted); + v->PrepareApply(); printf("--------------- Column family \"%s\" (ID %u) --------------\n", cfd->GetName().c_str(), (unsigned int)cfd->GetID()); diff --git a/db/version_set.h b/db/version_set.h index 6e645680b..83801e1da 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -113,13 +113,11 @@ class VersionStorageInfo { // Updates internal structures that keep track of compaction scores // We use compaction scores to figure out which compaction to do next - // REQUIRES: If Version is not yet saved to current_, it can be called without - // a lock. Once a version is saved to current_, call only with mutex held + // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. void ComputeCompactionScore( const MutableCFOptions& mutable_cf_options, - const CompactionOptionsFIFO& compaction_options_fifo, - std::vector& size_being_compacted); + const CompactionOptionsFIFO& compaction_options_fifo); // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); @@ -365,10 +363,9 @@ class Version { Status* status, MergeContext* merge_context, bool* value_found = nullptr); - // Update scores, pre-calculated variables. It needs to be called before - // applying the version to the version set. - void PrepareApply(const MutableCFOptions& mutable_cf_options, - std::vector& size_being_compacted); + // Loads some stats information from files. Call without mutex held. It needs + // to be called before applying the version to the version set. 
+ void PrepareApply(); // Reference count management (so Versions do not disappear out from // under live iterators) From 3e53760fc437b653298f23c4e5073edd53e8599a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 4 Feb 2015 16:20:25 -0800 Subject: [PATCH 806/829] Fix compaction_picker_test --- db/compaction_picker_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index dd5c06a54..e0ba7722e 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -31,7 +31,6 @@ class CompactionPickerTest { LogBuffer log_buffer_; uint32_t file_num_; CompactionOptionsFIFO fifo_options_; - std::vector size_being_compacted_; std::unique_ptr vstorage_; std::vector> files_; @@ -47,7 +46,6 @@ class CompactionPickerTest { vstorage_(nullptr) { fifo_options_.max_table_files_size = 1; mutable_cf_options_.RefreshDerivedOptions(ioptions_); - size_being_compacted_.resize(options_.num_levels); ioptions_.db_paths.emplace_back("dummy", std::numeric_limits::max()); } @@ -87,6 +85,7 @@ class CompactionPickerTest { vstorage_->UpdateNumNonEmptyLevels(); vstorage_->GenerateFileIndexer(); vstorage_->GenerateLevelFilesBrief(); + vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_); vstorage_->SetFinalized(); } }; From 108470e96338be109de4fc0fccf2a80e684712d6 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 4 Feb 2015 16:24:02 -0800 Subject: [PATCH 807/829] Fix stack trace on mac --- Makefile | 1 - port/stack_trace.cc | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 215ecb8a7..2ecf5f9f4 100644 --- a/Makefile +++ b/Makefile @@ -111,7 +111,6 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ - db_test \ db_iter_test \ block_hash_index_test \ autovector_test \ diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 224cac700..b2075b9a9 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -78,7 +78,7 @@ void PrintStackTraceLine(const char* symbol, void* frame) { // out source to atos, for the address translation const int kLineMax = 256; char cmd[kLineMax]; - snprintf(cmd, kLineMax, "xcrun atos -d %p -p %d 2>&1", frame, pid); + snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid); auto f = popen(cmd, "r"); if (f) { char line[kLineMax]; From b37f5ffc76388a647563f7146ba00e7150e391d2 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 4 Feb 2015 16:24:52 -0800 Subject: [PATCH 808/829] Put db_test back to TESTS in Makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 2ecf5f9f4..215ecb8a7 100644 --- a/Makefile +++ b/Makefile @@ -111,6 +111,7 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ + db_test \ db_iter_test \ block_hash_index_test \ autovector_test \ From fe9f6911947a655b0f6dd579b40658f1beca17e8 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 4 Feb 2015 15:32:06 -0800 Subject: [PATCH 809/829] Fix fault_injestion_test Summary: A bug in MockEnv causes fault_injestion_test to fail. I don't know why it doesn't fail every time but it doesn't seem to be right. Test Plan: Run fault_injestion_test Also run db_test with MEM_ENV=1 until the first failure. 
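The diff below shows how small the bug is: the new lock entry was constructed and registered in file_map_ under the raw fname, while the returned MockEnvFileLock (and, presumably, the existence check above it) uses the normalized fn, so different code paths could end up keyed by two different strings. A sketch of the invariant the fix restores, with hypothetical names rather than the actual MockEnv API:

#include <set>
#include <string>

// Illustrative normalization only, e.g. collapsing duplicate '/' characters.
std::string NormalizePath(const std::string& raw) {
  std::string out;
  for (char c : raw) {
    if (c != '/' || out.empty() || out.back() != '/') {
      out.push_back(c);
    }
  }
  return out;
}

class FileLockRegistry {
 public:
  // Returns false if the file is already locked.
  bool TryLock(const std::string& raw_name) {
    // Every insert and lookup must go through the same canonical key;
    // registering under raw_name but probing under the normalized key would
    // make the "already held" check unreliable.
    return locked_.insert(NormalizePath(raw_name)).second;
  }

  void Unlock(const std::string& raw_name) {
    locked_.erase(NormalizePath(raw_name));
  }

 private:
  std::set<std::string> locked_;
};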
Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32877 --- util/mock_env.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/mock_env.cc b/util/mock_env.cc index 41aeafaaf..bcfc611b5 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -630,10 +630,10 @@ Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { return Status::IOError(fn, "Lock is already held."); } } else { - auto* file = new MemFile(fname, true); + auto* file = new MemFile(fn, true); file->Ref(); file->Lock(); - file_map_[fname] = file; + file_map_[fn] = file; } } *flock = new MockEnvFileLock(fn); From 53ae09c398921a4eec63e5d92550bb8e0f49afd0 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 4 Feb 2015 18:03:36 -0800 Subject: [PATCH 810/829] db_test: fix a data race in SpecialEnv Summary: db_test's test class SpecialEnv has a thread unsafe variable rnd_ but it can be accessed by multiple threads. It is complained by TSAN. Protect it by a mutex. Test Plan: Run the test Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D32895 --- db/db_test.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 4271e3281..a0a583334 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -133,6 +133,7 @@ static std::string Key(int i) { class SpecialEnv : public EnvWrapper { public: Random rnd_; + port::Mutex rnd_mutex_; // Lock to pretect rnd_ // sstable Sync() calls are blocked while this pointer is non-nullptr. std::atomic delay_sstable_sync_; @@ -294,7 +295,11 @@ class SpecialEnv : public EnvWrapper { }; if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { - auto random_number = rnd_.Uniform(100); + uint32_t random_number; + { + MutexLock l(&rnd_mutex_); + random_number = rnd_.Uniform(100); + } if (random_number < non_writeable_rate_.load()) { return Status::IOError("simulated random write error"); } From f36d394aeddf420661e54a1a0a54fcc790c9cffb Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Wed, 4 Feb 2015 18:57:22 -0800 Subject: [PATCH 811/829] Fix wal_dir not getting cleaned --- db/db_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index a0a583334..4915d29c1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2596,7 +2596,8 @@ TEST(DBTest, IgnoreRecoveredLog) { options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(options); + Destroy(options); + Reopen(options); // fill up the DB std::string one, two; From 181191a1e444708718fa86126314560ec98d9e7a Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Wed, 4 Feb 2015 21:39:45 -0800 Subject: [PATCH 812/829] Add a counter for collecting the wait time on db mutex. Summary: Add a counter for collecting the wait time on db mutex. Also add MutexWrapper and CondVarWrapper for measuring wait time. 
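A minimal usage sketch for reading the new ticker from application code (the path and the lack of error handling are illustrative; assumes statistics are enabled on the DB):

  #include <cstdint>
  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/statistics.h"

  uint64_t ReadMutexWaitMicros() {
    rocksdb::Options options;
    options.create_if_missing = true;
    options.statistics = rocksdb::CreateDBStatistics();
    rocksdb::DB* db = nullptr;
    rocksdb::DB::Open(options, "/tmp/mutex_wait_example", &db);
    db->Put(rocksdb::WriteOptions(), "key", "value");
    uint64_t wait_micros =
        options.statistics->getTickerCount(rocksdb::DB_MUTEX_WAIT_MICROS);
    delete db;
    return wait_micros;  // total time threads spent waiting on the DB mutex
  }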
Test Plan: ./db_test export ROCKSDB_TESTS=MutexWaitStats ./db_test verify stats output using db_bench make clean make release ./db_bench --statistics=1 --benchmarks=fillseq,readwhilewriting --num=10000 --threads=10 Sample output: rocksdb.db.mutex.wait.micros COUNT : 7546866 Reviewers: MarkCallaghan, rven, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32787 --- db/column_family.cc | 10 ++-- db/column_family.h | 20 ++++--- db/compaction_job.cc | 4 +- db/compaction_job.h | 4 +- db/compaction_job_test.cc | 2 +- db/db_filesnapshot.cc | 4 +- db/db_impl.cc | 56 ++++++++++--------- db/db_impl.h | 5 +- db/db_impl_debug.cc | 8 +-- db/db_test.cc | 15 +++++ db/flush_job.cc | 3 +- db/flush_job.h | 5 +- db/flush_job_test.cc | 2 +- db/log_and_apply_bench.cc | 4 +- db/memtable_list.cc | 2 +- db/memtable_list.h | 5 +- db/version_set.cc | 12 ++-- db/version_set.h | 5 +- db/write_thread.h | 5 +- include/rocksdb/statistics.h | 3 + include/rocksdb/thread_status.h | 1 + util/instrumented_mutex.cc | 72 ++++++++++++++++++++++++ util/instrumented_mutex.h | 98 +++++++++++++++++++++++++++++++++ util/thread_status_util.h | 1 + 24 files changed, 274 insertions(+), 72 deletions(-) create mode 100644 util/instrumented_mutex.cc create mode 100644 util/instrumented_mutex.h diff --git a/db/column_family.cc b/db/column_family.cc index d3ff9b3f5..ea3e617e2 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -66,7 +66,7 @@ uint64_t SlowdownAmount(int n, double bottom, double top) { } // namespace ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( - ColumnFamilyData* column_family_data, DBImpl* db, port::Mutex* mutex) + ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex) : cfd_(column_family_data), db_(db), mutex_(mutex) { if (cfd_ != nullptr) { cfd_->Ref(); @@ -482,7 +482,7 @@ Compaction* ColumnFamilyData::CompactRange( } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( - port::Mutex* db_mutex) { + InstrumentedMutex* db_mutex) { SuperVersion* sv = nullptr; sv = GetThreadLocalSuperVersion(db_mutex); sv->Ref(); @@ -493,7 +493,7 @@ SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( } SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( - port::Mutex* db_mutex) { + InstrumentedMutex* db_mutex) { SuperVersion* sv = nullptr; // The SuperVersion is cached in thread local storage to avoid acquiring // mutex when SuperVersion does not change since the last use. 
When a new @@ -599,13 +599,13 @@ void ColumnFamilyData::NotifyOnFlushCompleted( } SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, port::Mutex* db_mutex) { + SuperVersion* new_superversion, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); } SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, port::Mutex* db_mutex, + SuperVersion* new_superversion, InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) { new_superversion->db_mutex = db_mutex; new_superversion->mutable_cf_options = mutable_cf_options; diff --git a/db/column_family.h b/db/column_family.h index 8101e7032..84b01dc71 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -21,9 +21,10 @@ #include "db/write_batch_internal.h" #include "db/write_controller.h" #include "db/table_cache.h" -#include "util/thread_local.h" #include "db/flush_scheduler.h" +#include "util/instrumented_mutex.h" #include "util/mutable_cf_options.h" +#include "util/thread_local.h" namespace rocksdb { @@ -38,6 +39,8 @@ class InternalStats; class ColumnFamilyData; class DBImpl; class LogBuffer; +class InstrumentedMutex; +class InstrumentedMutexLock; // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client @@ -45,7 +48,8 @@ class LogBuffer; class ColumnFamilyHandleImpl : public ColumnFamilyHandle { public: // create while holding the mutex - ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex); + ColumnFamilyHandleImpl( + ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } @@ -57,7 +61,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { private: ColumnFamilyData* cfd_; DBImpl* db_; - port::Mutex* mutex_; + InstrumentedMutex* mutex_; }; // Does not ref-count ColumnFamilyData @@ -91,7 +95,7 @@ struct SuperVersion { autovector to_delete; // Version number of the current SuperVersion uint64_t version_number; - port::Mutex* db_mutex; + InstrumentedMutex* db_mutex; // should be called outside the mutex SuperVersion() = default; @@ -235,11 +239,11 @@ class ColumnFamilyData { SuperVersion* GetSuperVersion() { return super_version_; } // thread-safe // Return a already referenced SuperVersion to be used safely. - SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex); + SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex); // thread-safe // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. - SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex); + SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex); // Try to return SuperVersion back to thread local storage. Retrun true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. @@ -254,10 +258,10 @@ class ColumnFamilyData { // the clients to allocate SuperVersion outside of mutex. 
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion() SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - port::Mutex* db_mutex, + InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options); SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - port::Mutex* db_mutex); + InstrumentedMutex* db_mutex); void ResetThreadLocalSuperVersions(); diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 09b21a237..775dcebec 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -470,7 +470,7 @@ Status CompactionJob::Run() { return status; } -void CompactionJob::Install(Status* status, port::Mutex* db_mutex) { +void CompactionJob::Install(Status* status, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( @@ -955,7 +955,7 @@ Status CompactionJob::FinishCompactionOutputFile(Iterator* input) { return s; } -Status CompactionJob::InstallCompactionResults(port::Mutex* db_mutex) { +Status CompactionJob::InstallCompactionResults(InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); // paranoia: verify that the files that we started with diff --git a/db/compaction_job.h b/db/compaction_job.h index 705ba7c64..cc31ece87 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -75,7 +75,7 @@ class CompactionJob { Status Run(); // REQUIRED: mutex held // status is the return of Run() - void Install(Status* status, port::Mutex* db_mutex); + void Install(Status* status, InstrumentedMutex* db_mutex); private: void AllocateCompactionOutputFileNumbers(); @@ -86,7 +86,7 @@ class CompactionJob { // Call compaction_filter_v2->Filter() on kv-pairs in compact void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2); Status FinishCompactionOutputFile(Iterator* input); - Status InstallCompactionResults(port::Mutex* db_mutex); + Status InstallCompactionResults(InstrumentedMutex* db_mutex); SequenceNumber findEarliestVisibleSnapshot( SequenceNumber in, const std::vector& snapshots, SequenceNumber* prev_snapshot); diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 54217cc37..2a089dc57 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -130,7 +130,7 @@ class CompactionJobTest { ColumnFamilyOptions cf_options_; WriteBuffer write_buffer_; std::unique_ptr versions_; - port::Mutex mutex_; + InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; }; diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ce009a976..4011b4652 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -31,7 +31,7 @@ namespace rocksdb { Status DBImpl::DisableFileDeletions() { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, @@ -48,7 +48,7 @@ Status DBImpl::EnableFileDeletions(bool force) { JobContext job_context; bool should_purge_files = false; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (force) { // if force, we need to enable file deletions right away disable_delete_obsolete_files_ = 0; diff --git a/db/db_impl.cc b/db/db_impl.cc index 8e8f3b733..dd627313b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -197,7 +197,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) db_options_(SanitizeOptions(dbname, options)), stats_(db_options_.statistics.get()), db_lock_(nullptr), - 
mutex_(options.use_adaptive_mutex), + mutex_(stats_, env_, + DB_MUTEX_WAIT_MICROS, + options.use_adaptive_mutex), shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), @@ -411,7 +413,7 @@ void DBImpl::MaybeDumpStats() { GetPropertyType("rocksdb.dbstats", &tmp1, &tmp2); std::string stats; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { cfd->internal_stats()->GetStringProperty(cf_property_type, "rocksdb.cfstats", &stats); @@ -1225,7 +1227,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, int max_level_with_files = 0; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); Version* base = cfd->current(); for (int level = 1; level < cfd->NumberLevels(); level++) { if (base->storage_info()->OverlapInLevel(level, begin, end)) { @@ -1258,7 +1260,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, LogFlush(db_options_.info_log); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); // an automatic compaction that has been scheduled might have been // preempted by the manual compactions. Need to schedule it back. MaybeScheduleFlushOrCompaction(); @@ -1276,7 +1278,7 @@ Status DBImpl::CompactFiles( // not supported in lite version return Status::NotSupported("Not supported in ROCKSDB LITE"); #else - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (column_family == nullptr) { return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); } @@ -1471,7 +1473,7 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, MutableCFOptions new_options; Status s; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); s = cfd->SetOptions(options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); @@ -1607,14 +1609,14 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfh->cfd()->GetSuperVersion()-> mutable_cf_options.max_mem_compaction_level; } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfh->cfd()->GetSuperVersion()-> mutable_cf_options.level0_stop_writes_trigger; } @@ -1662,7 +1664,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, manual.end = &end_storage; } - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction @@ -1717,7 +1719,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, Status s; { WriteContext context; - MutexLock guard_lock(&mutex_); + InstrumentedMutexLock guard_lock(&mutex_); if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) { // Nothing to flush @@ -1750,7 +1752,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { Status s; // Wait until the compaction completes - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); while (cfd->imm()->size() > 0 && bg_error_.ok()) { bg_cv_.Wait(); } @@ -1917,7 +1919,7 @@ void DBImpl::BackgroundCallFlush() { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); auto pending_outputs_inserted_elem = 
CaptureCurrentFileNumberInPendingOutputs(); @@ -1985,7 +1987,7 @@ void DBImpl::BackgroundCallCompaction() { MaybeDumpStats(); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); @@ -2352,11 +2354,11 @@ uint64_t DBImpl::CallFlushDuringCompaction( namespace { struct IterState { - IterState(DBImpl* _db, port::Mutex* _mu, SuperVersion* _super_version) + IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version) : db(_db), mu(_mu), super_version(_super_version) {} DBImpl* db; - port::Mutex* mu; + InstrumentedMutex* mu; SuperVersion* super_version; }; @@ -2643,7 +2645,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, Status s; *handle = nullptr; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != nullptr) { @@ -2691,7 +2693,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, "Creating column family [%s] FAILED -- %s", column_family_name.c_str(), s.ToString().c_str()); } - } // MutexLock l(&mutex_) + } // InstrumentedMutexLock l(&mutex_) // this is outside the mutex if (s.ok()) { @@ -2716,7 +2718,7 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { Status s; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (cfd->IsDropped()) { s = Status::InvalidArgument("Column family already dropped!\n"); } @@ -2919,14 +2921,14 @@ const Snapshot* DBImpl::GetSnapshot() { int64_t unix_time = 0; env_->GetCurrentTime(&unix_time); // Ignore error - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); // returns null if the underlying memtable does not support snapshot. 
if (!is_snapshot_supported_) return nullptr; return snapshots_.New(versions_->LastSequence(), unix_time); } void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); snapshots_.Delete(reinterpret_cast(s)); } @@ -3377,7 +3379,7 @@ bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, } else { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfd->internal_stats()->GetStringProperty(property_type, property, value); } @@ -3403,7 +3405,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, auto cfd = cfh->cfd(); if (!need_out_of_mutex) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfd->internal_stats()->GetIntProperty(property_type, value, this); } else { SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -3430,7 +3432,7 @@ void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, // Release SuperVersion if (sv->Unref()) { { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); sv->Cleanup(); } delete sv; @@ -3447,7 +3449,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); v = cfd->current(); v->Ref(); } @@ -3462,7 +3464,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); v->Unref(); } } @@ -3530,7 +3532,7 @@ Status DBImpl::DeleteFile(std::string name) { VersionEdit edit; JobContext job_context(true); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, @@ -3589,7 +3591,7 @@ Status DBImpl::DeleteFile(std::string name) { } void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); versions_->GetLiveFilesMetaData(metadata); } diff --git a/db/db_impl.h b/db/db_impl.h index 3b3376665..86402e817 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -36,6 +36,7 @@ #include "util/thread_local.h" #include "util/scoped_arena_iterator.h" #include "util/hash.h" +#include "util/instrumented_mutex.h" #include "db/internal_stats.h" #include "db/write_controller.h" #include "db/flush_scheduler.h" @@ -412,7 +413,7 @@ class DBImpl : public DB { FileLock* db_lock_; // State below is protected by mutex_ - port::Mutex mutex_; + InstrumentedMutex mutex_; std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 @@ -422,7 +423,7 @@ class DBImpl : public DB { // * whenever bg_flush_scheduled_ value decreases (i.e. 
whenever a flush is // done, even if it didn't make any progress) // * whenever there is an error in background flush or compaction - port::CondVar bg_cv_; + InstrumentedCondVar bg_cv_; uint64_t logfile_number_; unique_ptr log_; bool log_dir_synced_; diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index db4c91ae5..efa209a2b 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -15,7 +15,7 @@ namespace rocksdb { uint64_t DBImpl::TEST_GetLevel0TotalSize() { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } @@ -45,7 +45,7 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( auto cfh = reinterpret_cast(column_family); cfd = cfh->cfd(); } - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes(); } @@ -54,7 +54,7 @@ void DBImpl::TEST_GetFilesMetaData( std::vector>* metadata) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); metadata->resize(NumberLevels()); for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = @@ -113,7 +113,7 @@ Status DBImpl::TEST_WaitForCompact() { // wait for compact. It actually waits for scheduled compaction // OR flush to finish. - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) { bg_cv_.Wait(); } diff --git a/db/db_test.cc b/db/db_test.cc index 4915d29c1..66e47c680 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -10298,6 +10298,21 @@ TEST(DBTest, EncodeDecompressedBlockSizeTest) { } } +TEST(DBTest, MutexWaitStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + const int64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay( + ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_GE(TestGetTickerCount( + options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); + ThreadStatusUtil::TEST_SetStateDelay( + ThreadStatus::STATE_MUTEX_WAIT, 0); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/flush_job.cc b/db/flush_job.cc index 8cf4daa49..ca1d113db 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -55,7 +55,8 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, - port::Mutex* db_mutex, std::atomic* shutting_down, + InstrumentedMutex* db_mutex, + std::atomic* shutting_down, SequenceNumber newest_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, Directory* output_file_directory, diff --git a/db/flush_job.h b/db/flush_job.h index 0b8491484..40cdc5045 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -28,6 +28,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" #include "util/autovector.h" +#include "util/instrumented_mutex.h" #include "util/stop_watch.h" #include "util/thread_local.h" #include "util/scoped_arena_iterator.h" @@ -54,7 +55,7 @@ class FlushJob { const DBOptions& db_options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, VersionSet* versions, - port::Mutex* db_mutex, std::atomic* shutting_down, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, SequenceNumber 
newest_snapshot, JobContext* job_context, LogBuffer* log_buffer, Directory* db_directory, Directory* output_file_directory, CompressionType output_compression, @@ -72,7 +73,7 @@ class FlushJob { const MutableCFOptions& mutable_cf_options_; const EnvOptions& env_options_; VersionSet* versions_; - port::Mutex* db_mutex_; + InstrumentedMutex* db_mutex_; std::atomic* shutting_down_; SequenceNumber newest_snapshot_; JobContext* job_context_; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 2f4f08b2e..d3e824087 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -75,7 +75,7 @@ class FlushJobTest { WriteBuffer write_buffer_; ColumnFamilyOptions cf_options_; std::unique_ptr versions_; - port::Mutex mutex_; + InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; }; diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index b55ec0539..e5e271a1f 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -31,8 +31,8 @@ void BM_LogAndApply(int iters, int num_base_files) { WriteController wc; ColumnFamilyData* default_cfd; uint64_t fnum = 1; - port::Mutex mu; - MutexLock l(&mu); + InstrumentedMutex mu; + InstrumentedMutexLock l(&mu); BENCHMARK_SUSPEND { std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark"; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 8d568e895..44c069dd5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -164,7 +164,7 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& mems, VersionSet* vset, port::Mutex* mu, + const autovector& mems, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer) { mu->AssertHeld(); diff --git a/db/memtable_list.h b/db/memtable_list.h index 6cf1737c1..30382eac6 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -22,13 +22,14 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "util/autovector.h" +#include "util/instrumented_mutex.h" #include "util/log_buffer.h" namespace rocksdb { class ColumnFamilyData; class InternalKeyComparator; -class Mutex; +class InstrumentedMutex; class MergeIteratorBuilder; // keeps a list of immutable memtables in a vector. 
the list is immutable @@ -113,7 +114,7 @@ class MemTableList { // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - const autovector& m, VersionSet* vset, port::Mutex* mu, + const autovector& m, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); diff --git a/db/version_set.cc b/db/version_set.cc index 3b6d57f55..211ee3fda 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1487,11 +1487,11 @@ std::string Version::DebugString(bool hex) const { struct VersionSet::ManifestWriter { Status status; bool done; - port::CondVar cv; + InstrumentedCondVar cv; ColumnFamilyData* cfd; VersionEdit* edit; - explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* _cfd, + explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, VersionEdit* e) : done(false), cv(mu), cfd(_cfd), edit(e) {} }; @@ -1556,7 +1556,7 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, - VersionEdit* edit, port::Mutex* mu, + VersionEdit* edit, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { mu->AssertHeld(); @@ -1824,7 +1824,7 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* builder, Version* v, - VersionEdit* edit, port::Mutex* mu) { + VersionEdit* edit, InstrumentedMutex* mu) { mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); @@ -2275,8 +2275,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options)); VersionEdit ve; - port::Mutex dummy_mutex; - MutexLock l(&dummy_mutex); + InstrumentedMutex dummy_mutex; + InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply( versions.GetColumnFamilySet()->GetDefault(), mutable_cf_options, &ve, &dummy_mutex, nullptr, true); diff --git a/db/version_set.h b/db/version_set.h index 83801e1da..ca79aff4e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -36,6 +36,7 @@ #include "db/log_reader.h" #include "db/file_indexer.h" #include "db/write_controller.h" +#include "util/instrumented_mutex.h" namespace rocksdb { @@ -485,7 +486,7 @@ class VersionSet { Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - port::Mutex* mu, Directory* db_directory = nullptr, + InstrumentedMutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr); @@ -656,7 +657,7 @@ class VersionSet { void LogAndApplyCFHelper(VersionEdit* edit); void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, - VersionEdit* edit, port::Mutex* mu); + VersionEdit* edit, InstrumentedMutex* mu); }; } // namespace rocksdb diff --git a/db/write_thread.h b/db/write_thread.h index 8c5baa664..db3520244 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -12,6 +12,7 @@ #include "db/write_batch_internal.h" #include "util/autovector.h" #include "port/port.h" +#include "util/instrumented_mutex.h" namespace rocksdb { @@ -27,9 +28,9 @@ class WriteThread { bool in_batch_group; bool done; uint64_t timeout_hint_us; - port::CondVar cv; + InstrumentedCondVar cv; - explicit 
Writer(port::Mutex* mu) + explicit Writer(InstrumentedMutex* mu) : batch(nullptr), sync(false), disableWAL(false), diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 7d0dad5d6..4b28fd0d9 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -81,6 +81,8 @@ enum Tickers : uint32_t { STALL_L0_NUM_FILES_MICROS, // Writer has to wait for compaction or flush to finish. STALL_MICROS, + // The wait time for db mutex. + DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, NO_ITERATORS, // number of iterators currently open @@ -163,6 +165,7 @@ const std::vector> TickersNameMap = { {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, {STALL_MICROS, "rocksdb.stall.micros"}, + {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, {NO_ITERATORS, "rocksdb.num.iterators"}, {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index 539321291..36efd6f75 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -53,6 +53,7 @@ struct ThreadStatus { // such as reading / writing a file or waiting for a mutex. enum StateType : int { STATE_UNKNOWN = 0, + STATE_MUTEX_WAIT = 1, NUM_STATE_TYPES }; diff --git a/util/instrumented_mutex.cc b/util/instrumented_mutex.cc new file mode 100644 index 000000000..05d19b2ae --- /dev/null +++ b/util/instrumented_mutex.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "util/instrumented_mutex.h" +#include "util/thread_status_util.h" + +namespace rocksdb { +void InstrumentedMutex::Lock() { + uint64_t wait_time_micros = 0; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + LockInternal(); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + LockInternal(); + } +} + +void InstrumentedMutex::LockInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + mutex_.Lock(); +} + +void InstrumentedCondVar::Wait() { + uint64_t wait_time_micros = 0; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + WaitInternal(); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + WaitInternal(); + } +} + +void InstrumentedCondVar::WaitInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + cond_.Wait(); +} + +bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { + uint64_t wait_time_micros = 0; + bool result = false; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + result = TimedWaitInternal(abs_time_us); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + result = TimedWaitInternal(abs_time_us); + } + return result; +} + +bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + return cond_.TimedWait(abs_time_us); +} + +} // namespace rocksdb diff --git a/util/instrumented_mutex.h b/util/instrumented_mutex.h new file mode 100644 index 000000000..3f233494a --- /dev/null +++ b/util/instrumented_mutex.h @@ -0,0 +1,98 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/thread_status.h" +#include "util/statistics.h" +#include "util/stop_watch.h" + +namespace rocksdb { +class InstrumentedCondVar; + +// A wrapper class for port::Mutex that provides additional layer +// for collecting stats and instrumentation. +class InstrumentedMutex { + public: + explicit InstrumentedMutex(bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), env_(nullptr), + stats_code_(0) {} + + InstrumentedMutex( + Statistics* stats, Env* env, + int stats_code, bool adaptive = false) + : mutex_(adaptive), stats_(stats), env_(env), + stats_code_(stats_code) {} + + void Lock(); + + void Unlock() { + mutex_.Unlock(); + } + + void AssertHeld() { + mutex_.AssertHeld(); + } + + private: + void LockInternal(); + friend class InstrumentedCondVar; + port::Mutex mutex_; + Statistics* stats_; + Env* env_; + int stats_code_; +}; + +// A wrapper class for port::Mutex that provides additional layer +// for collecting stats and instrumentation. 
+class InstrumentedMutexLock { + public: + explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Lock(); + } + + ~InstrumentedMutexLock() { + mutex_->Unlock(); + } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexLock(const InstrumentedMutexLock&) = delete; + void operator=(const InstrumentedMutexLock&) = delete; +}; + +class InstrumentedCondVar { + public: + explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) + : cond_(&(instrumented_mutex->mutex_)), + stats_(instrumented_mutex->stats_), + env_(instrumented_mutex->env_), + stats_code_(instrumented_mutex->stats_code_) {} + + void Wait(); + + bool TimedWait(uint64_t abs_time_us); + + void Signal() { + cond_.Signal(); + } + + void SignalAll() { + cond_.SignalAll(); + } + + private: + void WaitInternal(); + bool TimedWaitInternal(uint64_t abs_time_us); + port::CondVar cond_; + Statistics* stats_; + Env* env_; + int stats_code_; +}; + +} // namespace rocksdb diff --git a/util/thread_status_util.h b/util/thread_status_util.h index a8549e8ae..8428d492c 100644 --- a/util/thread_status_util.h +++ b/util/thread_status_util.h @@ -11,6 +11,7 @@ #include "util/thread_status_updater.h" namespace rocksdb { +class ColumnFamilyData; // The static utility class for updating thread-local status. From 9a52e06a02cb1e06cc27a68d652a0afe6c760d85 Mon Sep 17 00:00:00 2001 From: Jonah Cohen Date: Thu, 5 Feb 2015 08:26:33 -0800 Subject: [PATCH 813/829] Add GetID to ColumnFamilyHandle Summary: Expose GetID to ColumnFamilyHandle interface so that we can save column family data by id instead of name. Test Plan: Testing in MySQL on Rocks. Reviewers: sdong, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32943 --- include/rocksdb/db.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index a4141f38b..7cba31488 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -44,6 +44,7 @@ class ColumnFamilyHandle { public: virtual ~ColumnFamilyHandle() {} virtual const std::string& GetName() const = 0; + virtual uint32_t GetID() const = 0; }; extern const std::string kDefaultColumnFamilyName; From 7de4e99a8eedcff113e3d6e74d50a051f6a96c89 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 5 Feb 2015 11:44:17 -0800 Subject: [PATCH 814/829] Revert "Fix wal_dir not getting cleaned" This reverts commit f36d394aeddf420661e54a1a0a54fcc790c9cffb. --- db/db_test.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 66e47c680..720978021 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2596,8 +2596,7 @@ TEST(DBTest, IgnoreRecoveredLog) { options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); options.wal_dir = dbname_ + "/logs"; - Destroy(options); - Reopen(options); + DestroyAndReopen(options); // fill up the DB std::string one, two; From 6f101303542f2259075a9abe88d836c792ead411 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 5 Feb 2015 20:09:42 -0800 Subject: [PATCH 815/829] Fix DestroyDB Summary: When DestroyDB() finds a wal file in the DB directory, it assumes it is actually in WAL directory. This can lead to confusion, since it reports IO error when it tries to delete wal file from DB directory. For example: https://ci-builds.fb.com/job/rocksdb_clang_build/296/console This change will fix our unit tests. 
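A sketch of the kind of setup that hits this, with illustrative paths; the failure mode described in the comment follows the summary above:

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  void TearDownExample() {
    rocksdb::Options options;
    options.wal_dir = "/tmp/example_db/wal";  // WAL directory separate from the DB dir
    // ... open, use, and close the DB elsewhere, then:
    rocksdb::Status s = rocksdb::DestroyDB("/tmp/example_db", options);
    // Before this fix, a *.log file found while scanning /tmp/example_db was
    // deleted as if it lived in options.wal_dir, so the delete failed and
    // DestroyDB reported a spurious IO error.
    (void)s;
  }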
Test Plan: unit tests work Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D32907 --- db/db_impl.cc | 33 ++++++++++++++++++--------------- db/db_test.cc | 7 ++++--- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index dd627313b..3365e18a4 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3885,23 +3885,10 @@ Status DestroyDB(const std::string& dbname, const Options& options) { const Options& soptions(SanitizeOptions(dbname, &comparator, options)); Env* env = soptions.env; std::vector filenames; - std::vector archiveFiles; - std::string archivedir = ArchivalDirectory(dbname); // Ignore error in case directory does not exist env->GetChildren(dbname, &filenames); - if (dbname != soptions.wal_dir) { - std::vector logfilenames; - env->GetChildren(soptions.wal_dir, &logfilenames); - filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end()); - archivedir = ArchivalDirectory(soptions.wal_dir); - } - - if (filenames.empty()) { - return Status::OK(); - } - FileLock* lock; const std::string lockname = LockFileName(dbname); Status result = env->LockFile(lockname, &lock); @@ -3915,8 +3902,6 @@ Status DestroyDB(const std::string& dbname, const Options& options) { Status del; if (type == kMetaDatabase) { del = DestroyDB(dbname + "/" + filenames[i], options); - } else if (type == kLogFile) { - del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]); } else { del = env->DeleteFile(dbname + "/" + filenames[i]); } @@ -3939,6 +3924,24 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } } + std::vector walDirFiles; + std::string archivedir = ArchivalDirectory(dbname); + if (dbname != soptions.wal_dir) { + env->GetChildren(soptions.wal_dir, &walDirFiles); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + // Delete log files in the WAL dir + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = env->DeleteFile(soptions.wal_dir + "/" + file); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + std::vector archiveFiles; env->GetChildren(archivedir, &archiveFiles); // Delete archival files. for (size_t i = 0; i < archiveFiles.size(); ++i) { diff --git a/db/db_test.cc b/db/db_test.cc index 720978021..b19d2550b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -624,7 +624,7 @@ class DBTest { options.db_log_dir = test::TmpDir(env_); break; case kWalDirAndMmapReads: - options.wal_dir = test::TmpDir(env_) + "/wal"; + options.wal_dir = dbname_ + "/wal"; // mmap reads should be orthogonal to WalDir setting, so we piggyback to // this option config to test mmap reads as well options.allow_mmap_reads = true; @@ -2595,8 +2595,9 @@ TEST(DBTest, IgnoreRecoveredLog) { Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); - options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(options); + options.wal_dir = dbname_ + "/wal"; + Destroy(options); + Reopen(options); // fill up the DB std::string one, two; From 8e83a9d3153e36ab0dadf433aa795d9cff124886 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Fri, 6 Feb 2015 02:38:14 -0800 Subject: [PATCH 816/829] Add a missing field for STATE_MUTEX_WAIT to global_state_table Summary: Add a missing field for STATE_MUTEX_WAIT to global_state_table. This will fix the failure of thread_list_test. 
Test Plan: thread_list_test --- util/thread_operation.h | 1 + 1 file changed, 1 insertion(+) diff --git a/util/thread_operation.h b/util/thread_operation.h index b4326f5bd..45521e227 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -54,6 +54,7 @@ struct StateInfo { // rows in this global table. static StateInfo global_state_table[] = { {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, }; #else From 2a979822b6f72a4c563e2c23f32b5c8523584c91 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 6 Feb 2015 08:44:30 -0800 Subject: [PATCH 817/829] Fix deleting obsolete files Summary: This diff basically reverts D30249 and also adds a unit test that was failing before this patch. I have no idea how I didn't catch this terrible bug when writing a diff, sorry about that :( I think we should redesign our system of keeping track of and deleting files. This is already a second bug in this critical piece of code. I'll think of few ideas. BTW this diff is also a regression when running lots of column families. I plan to revisit this separately. Test Plan: added a unit test Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D33045 --- db/db_impl.cc | 17 +++------- db/db_test.cc | 74 +++++++++++++++++++++++++++++++++++++++++++ db/job_context.h | 3 +- db/version_builder.cc | 1 - db/version_edit.h | 7 +--- db/version_set.cc | 8 +---- util/options.cc | 4 +++ 7 files changed, 86 insertions(+), 28 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 3365e18a4..be1f7037f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -430,9 +430,10 @@ void DBImpl::MaybeDumpStats() { } } +// * Returns the list of live files in 'sst_live' // If it's doing full scan: -// * Returns the list of live files in 'full_scan_sst_live' and the list -// of all files in the filesystem in 'full_scan_candidate_files'. +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. // Otherwise, gets obsolete files from VersionSet. // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every @@ -440,7 +441,6 @@ void DBImpl::MaybeDumpStats() { // force = true -- force the full scan void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan) { - // TODO(icanadi) clean up FindObsoleteFiles, no need to do full scans anymore mutex_.AssertHeld(); // if deletion is disabled, do nothing @@ -482,13 +482,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->min_pending_output = std::numeric_limits::max(); } + versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { - // Here we find all files in the DB directory and all the live files. In the - // DeleteObsoleteFiles(), we will calculate a set difference (all_files - - // live_files) and delete all files in that difference. If we're not doing - // the full scan we don't need to get live files, because all files returned - // by GetObsoleteFiles() will be dead (and need to be deleted) - versions_->AddLiveFiles(&job_context->full_scan_sst_live); for (uint32_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { // set of all files in the directory. We'll exclude files that are still @@ -554,7 +549,7 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // Now, convert live list to an unordered map, WITHOUT mutex held; // set is slow. 
std::unordered_map sst_live_map; - for (const FileDescriptor& fd : state.full_scan_sst_live) { + for (const FileDescriptor& fd : state.sst_live) { sst_live_map[fd.GetNumber()] = &fd; } @@ -1566,7 +1561,6 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { - f->moved = true; edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, @@ -2223,7 +2217,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); - f->moved = true; c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, diff --git a/db/db_test.cc b/db/db_test.cc index b19d2550b..715d63970 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -10256,6 +10256,80 @@ TEST(DBTest, DontDeleteMovedFile) { Reopen(options); } +TEST(DBTest, DeleteMovedFileAfterCompaction) { + // iter 1 -- delete_obsolete_files_period_micros == 0 + for (int iter = 0; iter < 2; ++iter) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + if (iter == 1) { + options.delete_obsolete_files_period_micros = 0; + } + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute L0->L1 + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // block compactions + SleepingBackgroundTask sleeping_task; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + Reopen(options); + std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_EQ("0,1", FilesPerLevel(0)); + // let compactions go + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + + // this should execute L1->L2 (move) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto moved_file_name = metadata[0].name; + + // Create two more 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->L2 (merge with previous file) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // iterator is holding the file + ASSERT_TRUE(env_->FileExists(dbname_ + "/" + moved_file_name)); + + iterator.reset(); + + // this file should have been compacted away + ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + moved_file_name)); + } +} + TEST(DBTest, EncodeDecompressedBlockSizeTest) { // iter 0 -- zlib // iter 1 -- bzip2 diff --git a/db/job_context.h b/db/job_context.h index 01c868c03..d3aa9b215 
100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -43,8 +43,7 @@ struct JobContext { std::vector full_scan_candidate_files; // the list of all live sst files that cannot be deleted - // (filled only if we're doing full scan) - std::vector full_scan_sst_live; + std::vector sst_live; // a list of sst files that we need to delete std::vector sst_delete_files; diff --git a/db/version_builder.cc b/db/version_builder.cc index 3a4143b9e..c010ee429 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -215,7 +215,6 @@ class VersionBuilder::Rep { const int level = new_file.first; FileMetaData* f = new FileMetaData(new_file.second); f->refs = 1; - f->moved = false; assert(levels_[level].added_files.find(f->fd.GetNumber()) == levels_[level].added_files.end()); diff --git a/db/version_edit.h b/db/version_edit.h index 6f7a692f3..004855ff9 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -87,10 +87,6 @@ struct FileMetaData { bool init_stats_from_file; // true if the data-entry stats of this file // has initialized from file. - // Always false for new files. Set to true if the file was part of move - // compaction. Can only be mutated from the compaction process, under DB mutex - bool moved; - FileMetaData() : refs(0), being_compacted(false), @@ -100,8 +96,7 @@ struct FileMetaData { num_deletions(0), raw_key_size(0), raw_value_size(0), - init_stats_from_file(false), - moved(false) {} + init_stats_from_file(false) {} }; // A compressed copy of file meta data that just contain diff --git a/db/version_set.cc b/db/version_set.cc index 211ee3fda..6ec6f1d9e 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -309,13 +309,7 @@ Version::~Version() { cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } - if (!f->moved) { - vset_->obsolete_files_.push_back(f); - } else { - // moved! - // TODO(icanadi) delete this outside of mutex - delete f; - } + vset_->obsolete_files_.push_back(f); } } } diff --git a/util/options.cc b/util/options.cc index 69aca5ab1..fbfa74ccc 100644 --- a/util/options.cc +++ b/util/options.cc @@ -204,7 +204,11 @@ DBOptions::DBOptions() env(Env::Default()), rate_limiter(nullptr), info_log(nullptr), +#ifdef NDEBUG info_log_level(INFO_LEVEL), +#else + info_log_level(DEBUG_LEVEL), +#endif // NDEBUG max_open_files(5000), max_total_wal_size(0), statistics(nullptr), From da9cbce73136517e8ed5eb63c70f18a1fdf32506 Mon Sep 17 00:00:00 2001 From: Karthikeyan Radhakrishnan Date: Mon, 2 Feb 2015 09:47:24 -0800 Subject: [PATCH 818/829] Add Header to logging to capture application level information Summary: This change adds LogHeader provision to the logger. For the rolling logger implementation, the headers are copied over to the new log file every time there is a log roll over. Test Plan: Added a unit test to test the rolling log case. 
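A minimal usage sketch of the new Header() entry point; the strings logged are illustrative:

  #include "rocksdb/env.h"

  void LogStartupHeaders(rocksdb::Logger* info_log) {
    // Header lines are retained and re-emitted at the top of every rolled-over
    // LOG file, so application-level context survives log rotation.
    rocksdb::Header(info_log, "application=%s", "my_service");
    rocksdb::Header(info_log, "build=%s", "example-build-id");
    rocksdb::Info(info_log, "startup complete");  // ordinary logging afterwards
  }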
Reviewers: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32817 --- include/rocksdb/env.h | 11 +++++++++ util/auto_roll_logger.cc | 46 +++++++++++++++++++++++++++++++++++ util/auto_roll_logger.h | 15 +++++++++++- util/auto_roll_logger_test.cc | 41 +++++++++++++++++++++++++++++++ util/env.cc | 18 ++++++++++++++ 5 files changed, 130 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 8a96ef1e1..433fa4174 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -617,6 +617,15 @@ class Logger { : log_level_(log_level) {} virtual ~Logger(); + // Write a header to the log file with the specified format + // It is recommended that you log all header information at the start of the + // application. But it is not enforced. + virtual void LogHeader(const char* format, va_list ap) { + // Default implementation does a simple INFO level log write. + // Please override as per the logger class requirement. + Logv(format, ap); + } + // Write an entry to the log file with the specified format. virtual void Logv(const char* format, va_list ap) = 0; @@ -678,6 +687,7 @@ extern void Log(const InfoLogLevel log_level, const shared_ptr& info_log, const char* format, ...); // a set of log functions with different log levels. +extern void Header(const shared_ptr& info_log, const char* format, ...); extern void Debug(const shared_ptr& info_log, const char* format, ...); extern void Info(const shared_ptr& info_log, const char* format, ...); extern void Warn(const shared_ptr& info_log, const char* format, ...); @@ -705,6 +715,7 @@ extern void Log(Logger* info_log, const char* format, ...) ; // a set of log functions with different log levels. +extern void Header(Logger* info_log, const char* format, ...); extern void Debug(Logger* info_log, const char* format, ...); extern void Info(Logger* info_log, const char* format, ...); extern void Warn(Logger* info_log, const char* format, ...); diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index 3385986f9..684abfc30 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -37,6 +37,27 @@ void AutoRollLogger::RollLogFile() { env_->RenameFile(log_fname_, old_fname); } +string AutoRollLogger::ValistToString(const char* format, va_list args) const { + // Any log messages longer than 1024 will get truncated. + // The user is responsible for chopping longer messages into multi line log + static const int MAXBUFFERSIZE = 1024; + char buffer[MAXBUFFERSIZE]; + + int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args); + (void) count; + assert(count >= 0); + + return buffer; +} + +void AutoRollLogger::LogInternal(const char* format, ...) { + mutex_.AssertHeld(); + va_list args; + va_start(args, format); + logger_->Logv(format, args); + va_end(args); +} + void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); @@ -51,6 +72,8 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { // can't really log the error if creating a new LOG file failed return; } + + WriteHeaderInfo(); } // pin down the current logger_ instance before releasing the mutex. @@ -66,6 +89,29 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { logger->Logv(format, ap); } +void AutoRollLogger::WriteHeaderInfo() { + mutex_.AssertHeld(); + for (auto header : headers_) { + LogInternal("%s", header.c_str()); + } +} + +void AutoRollLogger::LogHeader(const char* format, va_list args) { + // header message are to be retained in memory. 
Since we cannot make any + // assumptions about the data contained in va_list, we will retain them as + // strings + va_list tmp; + va_copy(tmp, args); + string data = ValistToString(format, tmp); + va_end(tmp); + + MutexLock l(&mutex_); + headers_.push_back(data); + + // Log the original message to the current log + logger_->Logv(format, args); +} + bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { cached_now = static_cast(env_->NowMicros() * 1e-6); diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index c592d79ce..4aab6a119 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -7,6 +7,8 @@ // where enough posix functionality is available. #pragma once +#include + #include "db/filename.h" #include "port/port.h" #include "util/posix_logger.h" @@ -40,6 +42,10 @@ class AutoRollLogger : public Logger { void Logv(const char* format, va_list ap); + // Write a header entry to the log. All header information will be written + // again every time the log rolls over. + virtual void LogHeader(const char* format, va_list ap) override; + // check if the logger has encountered any problem. Status GetStatus() { return status_; @@ -57,10 +63,15 @@ class AutoRollLogger : public Logger { } private: - bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Log message to logger without rolling + void LogInternal(const char* format, ...); + // Serialize the va_list to a string + std::string ValistToString(const char* format, va_list args) const; + // Write the logs marked as headers to the new log file + void WriteHeaderInfo(); std::string log_fname_; // Current active info log's file name. std::string dbname_; @@ -72,6 +83,8 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + // header information + std::list headers_; // to avoid frequent env->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 7a2bb6aa7..7f75edf99 100755 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -285,6 +285,47 @@ TEST(AutoRollLoggerTest, InfoLogLevel) { inFile.close(); } +// Test the logger Header function for roll over logs +// We expect the new logs creates as roll over to carry the headers specified +TEST(AutoRollLoggerTest, LogHeaderTest) { + static const size_t MAX_HEADERS = 10; + static const size_t LOG_MAX_SIZE = 1024 * 5; + static const std::string HEADER_STR = "Log header line"; + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + + // log some headers + for (size_t i = 0; i < MAX_HEADERS; i++) { + Header(&logger, "%s %d", HEADER_STR.c_str(), i); + } + + // log enough data to cause a roll over + size_t i = 0; + while (logger.GetLogFileSize() < LOG_MAX_SIZE) { + Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i); + ++i; + } + + // verify that the new log contains all the header logs + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(HEADER_STR) != std::string::npos) { + count++; + } + } + + ASSERT_EQ(count, MAX_HEADERS); +} + TEST(AutoRollLoggerTest, LogFileExistence) { rocksdb::DB* db; rocksdb::Options options; diff --git a/util/env.cc b/util/env.cc index 
6fd020489..a95205273 100644 --- a/util/env.cc +++ b/util/env.cc @@ -59,6 +59,15 @@ void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, } } +void Header(Logger* info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->LogHeader(format, ap); + va_end(ap); + } +} + void Debug(Logger* info_log, const char* format, ...) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { va_list ap; @@ -118,6 +127,15 @@ void Log(const InfoLogLevel log_level, const shared_ptr& info_log, } } +void Header(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->LogHeader(format, ap); + va_end(ap); + } +} + void Debug(const shared_ptr& info_log, const char* format, ...) { if (info_log) { va_list ap; From 218c3ecea3de9b0ced9e9ab7b68810e5e2e1b337 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 6 Feb 2015 13:01:59 -0800 Subject: [PATCH 819/829] Fix std::cout data race Summary: std::cout is not thread safe. tsan complains. Eliminate it. Test Plan: env_test with TSAN Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D33087 --- util/env_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/env_test.cc b/util/env_test.cc index 3511d985b..351f6358a 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -141,8 +141,6 @@ TEST(EnvPosixTest, TwoPools) { { MutexLock l(&mu_); num_running_++; - std::cout << "Pool " << pool_name_ << ": " - << num_running_ << " running threads.\n"; // make sure we don't have more than pool_size_ jobs running. ASSERT_LE(num_running_, pool_size_.load()); } From 7e50ed8c24a9ee8cb4ec5384b31865144679e13c Mon Sep 17 00:00:00 2001 From: Marko Kevac Date: Sat, 7 Feb 2015 14:25:10 +0300 Subject: [PATCH 820/829] Added some more wrappers and wrote a test for backup in C --- db/c.cc | 70 ++++++++++++++++++++++++++++++++++++++++++++- db/c_test.c | 42 +++++++++++++++++++++++++++ include/rocksdb/c.h | 44 ++++++++++++++++++++++++++-- 3 files changed, 153 insertions(+), 3 deletions(-) diff --git a/db/c.cc b/db/c.cc index c952b2e8b..f3d0fed12 100644 --- a/db/c.cc +++ b/db/c.cc @@ -72,6 +72,8 @@ using rocksdb::WriteOptions; using rocksdb::LiveFileMetaData; using rocksdb::BackupEngine; using rocksdb::BackupableDBOptions; +using rocksdb::BackupInfo; +using rocksdb::RestoreOptions; using std::shared_ptr; @@ -79,6 +81,8 @@ extern "C" { struct rocksdb_t { DB* rep; }; struct rocksdb_backup_engine_t { BackupEngine* rep; }; +struct rocksdb_backup_engine_info_t { std::vector rep; }; +struct rocksdb_restore_options_t { RestoreOptions rep; }; struct rocksdb_iterator_t { Iterator* rep; }; struct rocksdb_writebatch_t { WriteBatch rep; }; struct rocksdb_snapshot_t { const Snapshot* rep; }; @@ -532,10 +536,11 @@ rocksdb_t* rocksdb_open_for_read_only( } rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; - if (SaveError(errptr, BackupEngine::Open(Env::Default(), BackupableDBOptions(path), &be))) { + if (SaveError(errptr, BackupEngine::Open(options->rep.env, BackupableDBOptions(path), &be))) { return nullptr; } rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; @@ -550,6 +555,69 @@ void rocksdb_backup_engine_create_new_backup( SaveError(errptr, be->rep->CreateNewBackup(db->rep)); } +rocksdb_restore_options_t* rocksdb_restore_options_create() { + return new 
rocksdb_restore_options_t; +} + +void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) { + delete opt; +} + +void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v) { + opt->rep.keep_log_files = v; +} + +void rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t *be, + const char* db_dir, + const char* wal_dir, + const rocksdb_restore_options_t *restore_options, + char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir), std::string(wal_dir), restore_options->rep)); +} + +const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( + rocksdb_backup_engine_t* be) { + rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t; + be->rep->GetBackupInfo(&result->rep); + return result; +} + +int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info) { + return static_cast(info->rep.size()); +} + +const int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, + int index) { + return info->rep[index].timestamp; +} + +const uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, + int index) { + return info->rep[index].backup_id; +} + +const uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, + int index) { + return info->rep[index].size; +} + +const uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, + int index) { + return info->rep[index].number_files; +} + +void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info) { + delete info; +} + void rocksdb_backup_engine_close( rocksdb_backup_engine_t *be) { delete be->rep; diff --git a/db/c_test.c b/db/c_test.c index ba239a43d..4f9bd4ebf 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -10,9 +10,11 @@ #include #include #include +#include const char* phase = ""; static char dbname[200]; +static char dbbackupname[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -346,6 +348,11 @@ int main(int argc, char** argv) { GetTempDir(), ((int) geteuid())); + snprintf(dbbackupname, sizeof(dbbackupname), + "%s/rocksdb_c_test-%d-backup", + GetTempDir(), + ((int) geteuid())); + StartPhase("create_objects"); cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); env = rocksdb_create_default_env(); @@ -396,6 +403,41 @@ int main(int argc, char** argv) { CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); + StartPhase("backup"); + { + rocksdb_destroy_db(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + + rocksdb_close(db); + + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_restore_options_set_keep_log_files(restore_options, 0); + rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err); + CheckNoError(err); + rocksdb_restore_options_destroy(restore_options); + + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + + CheckGet(db, roptions, "foo", "hello"); + + 
rocksdb_backup_engine_close(be); + } + StartPhase("compactall"); rocksdb_compact_range(db, NULL, 0, NULL, 0); CheckGet(db, roptions, "foo", "hello"); diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 9d36f8277..ac5f612a0 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -56,6 +56,8 @@ extern "C" { typedef struct rocksdb_t rocksdb_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; +typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t @@ -106,16 +108,54 @@ extern rocksdb_t* rocksdb_open_for_read_only( char** errptr); extern rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr); extern void rocksdb_backup_engine_create_new_backup( - rocksdb_backup_engine_t *be, + rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); +extern rocksdb_restore_options_t* rocksdb_restore_options_create(); +extern void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt); +extern void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern void rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t *be, + const char* db_dir, + const char* wal_dir, + const rocksdb_restore_options_t *restore_options, + char** errptr); + +extern const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( + rocksdb_backup_engine_t* be); + +extern int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern const int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, + int index); + +extern void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t *info); + extern void rocksdb_backup_engine_close( - rocksdb_backup_engine_t *be); + rocksdb_backup_engine_t* be); extern rocksdb_t* rocksdb_open_column_families( const rocksdb_options_t* options, From 9651308307234b6eb08b3cdf62324b10df1e94e9 Mon Sep 17 00:00:00 2001 From: Marko Kevac Date: Mon, 9 Feb 2015 12:11:42 +0300 Subject: [PATCH 821/829] renamed backup to backup_and_restore in c_test for clarity --- db/c_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/c_test.c b/db/c_test.c index 4f9bd4ebf..b8f0ea186 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -403,7 +403,7 @@ int main(int argc, char** argv) { CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); - StartPhase("backup"); + StartPhase("backup_and_restore"); { rocksdb_destroy_db(options, dbbackupname, &err); CheckNoError(err); From d090330c8eb5b7ac61370e16305510e1e86cd913 Mon Sep 17 00:00:00 2001 From: Marko Kevac Date: Mon, 9 Feb 2015 12:16:04 +0300 Subject: [PATCH 822/829] fixed c_simple_example and added some comments --- examples/c_simple_example.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index 8340026db..fd79a6968 100644 --- 
a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -27,7 +27,8 @@ int main(int argc, char **argv) { db = rocksdb_open(options, DBPath, &err); assert(!err); - be = rocksdb_backup_engine_open(DBBackupPath, &err); + // open Backup Engine that we will use for backing up or database + be = rocksdb_backup_engine_open(options, DBBackupPath, &err); assert(!err); // Put key-value @@ -46,6 +47,7 @@ int main(int argc, char **argv) { assert(strcmp(returned_value, "value") == 0); free(returned_value); + // create new backup in a directory specified by DBBackupPath rocksdb_backup_engine_create_new_backup(be, db, &err); assert(!err); From 82faa377a8276faedfaeeebfbec6da6e2bc68540 Mon Sep 17 00:00:00 2001 From: Marko Kevac Date: Mon, 9 Feb 2015 19:34:50 +0300 Subject: [PATCH 823/829] added simple example for db restore from backup --- examples/c_simple_example.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index fd79a6968..c152b7da1 100644 --- a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -51,6 +51,17 @@ int main(int argc, char **argv) { rocksdb_backup_engine_create_new_backup(be, db, &err); assert(!err); + rocksdb_close(db); + + // If something is wrong, you might want to restore data from last backup + rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath, restore_options, &err); + assert(!err); + rocksdb_restore_options_destroy(restore_options); + + db = rocksdb_open(options, DBPath, &err); + assert(!err); + // cleanup rocksdb_writeoptions_destroy(writeoptions); rocksdb_readoptions_destroy(readoptions); From aaceef363877c6b7d842c18680278c187e2115c0 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 9 Feb 2015 09:53:30 -0800 Subject: [PATCH 824/829] Fix formatting --- db/c.cc | 47 ++++++++++++++----------------------- examples/c_simple_example.c | 3 ++- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/db/c.cc b/db/c.cc index f3d0fed12..55afad94e 100644 --- a/db/c.cc +++ b/db/c.cc @@ -536,11 +536,10 @@ rocksdb_t* rocksdb_open_for_read_only( } rocksdb_backup_engine_t* rocksdb_backup_engine_open( - const rocksdb_options_t* options, - const char* path, - char** errptr) { + const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; - if (SaveError(errptr, BackupEngine::Open(options->rep.env, BackupableDBOptions(path), &be))) { + if (SaveError(errptr, BackupEngine::Open(options->rep.env, + BackupableDBOptions(path), &be))) { return nullptr; } rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; @@ -548,10 +547,8 @@ rocksdb_backup_engine_t* rocksdb_backup_engine_open( return result; } -void rocksdb_backup_engine_create_new_backup( - rocksdb_backup_engine_t *be, - rocksdb_t *db, - char** errptr) { +void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, + rocksdb_t* db, char** errptr) { SaveError(errptr, be->rep->CreateNewBackup(db->rep)); } @@ -563,18 +560,17 @@ void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) { delete opt; } -void rocksdb_restore_options_set_keep_log_files( - rocksdb_restore_options_t* opt, int v) { +void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt, + int v) { opt->rep.keep_log_files = v; } void rocksdb_backup_engine_restore_db_from_latest_backup( - rocksdb_backup_engine_t *be, - const char* db_dir, - const char* wal_dir, - const rocksdb_restore_options_t 
*restore_options, - char** errptr) { - SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir), std::string(wal_dir), restore_options->rep)); + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); } const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( @@ -584,32 +580,27 @@ const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( return result; } -int rocksdb_backup_engine_info_count( - const rocksdb_backup_engine_info_t* info) { +int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) { return static_cast(info->rep.size()); } const int64_t rocksdb_backup_engine_info_timestamp( - const rocksdb_backup_engine_info_t* info, - int index) { + const rocksdb_backup_engine_info_t* info, int index) { return info->rep[index].timestamp; } const uint32_t rocksdb_backup_engine_info_backup_id( - const rocksdb_backup_engine_info_t* info, - int index) { + const rocksdb_backup_engine_info_t* info, int index) { return info->rep[index].backup_id; } const uint64_t rocksdb_backup_engine_info_size( - const rocksdb_backup_engine_info_t* info, - int index) { + const rocksdb_backup_engine_info_t* info, int index) { return info->rep[index].size; } const uint32_t rocksdb_backup_engine_info_number_files( - const rocksdb_backup_engine_info_t* info, - int index) { + const rocksdb_backup_engine_info_t* info, int index) { return info->rep[index].number_files; } @@ -618,13 +609,11 @@ void rocksdb_backup_engine_info_destroy( delete info; } -void rocksdb_backup_engine_close( - rocksdb_backup_engine_t *be) { +void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) { delete be->rep; delete be; } - void rocksdb_close(rocksdb_t* db) { delete db->rep; delete db; diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c index c152b7da1..7a6382765 100644 --- a/examples/c_simple_example.c +++ b/examples/c_simple_example.c @@ -55,7 +55,8 @@ int main(int argc, char **argv) { // If something is wrong, you might want to restore data from last backup rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); - rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath, restore_options, &err); + rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath, + restore_options, &err); assert(!err); rocksdb_restore_options_destroy(restore_options); From cfe8837e43cbe05040a6a59ef8760cfb53f83758 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sun, 1 Feb 2015 20:08:19 +0100 Subject: [PATCH 825/829] Switch logv with loglevel to virtual --- HISTORY.md | 1 + db/compaction_picker_test.cc | 1 + db/table_properties_collector_test.cc | 1 + include/rocksdb/env.h | 2 +- util/auto_roll_logger.h | 1 + util/env_test.cc | 2 ++ util/mock_env.cc | 2 ++ util/options_test.cc | 1 + util/posix_logger.h | 2 ++ 9 files changed, 12 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 3502df3ea..bef3e1ff1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ ### Public API changes * Deprecated skip_log_error_on_recovery option +* Logger method logv with log level parameter is now virtual ### 3.9.0 (12/8/2014) diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index e0ba7722e..ca7ba014f 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -14,6 +14,7 @@ 
namespace rocksdb { class CountingLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { log_count++; } size_t log_count; }; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 74abf8670..364b23b44 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -79,6 +79,7 @@ class FakeRandomeAccessFile : public RandomAccessFile { class DumbLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) { } virtual size_t GetLogFileSize() const { return 0; } }; diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 433fa4174..dfc598ff6 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -633,7 +633,7 @@ class Logger { // and format. Any log with level under the internal log level // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be // printed. - void Logv(const InfoLogLevel log_level, const char* format, va_list ap) { + virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap) { static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"}; if (log_level < log_level_) { diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index 4aab6a119..486a1eae3 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -40,6 +40,7 @@ class AutoRollLogger : public Logger { ResetLogger(); } + using Logger::Logv; void Logv(const char* format, va_list ap); // Write a header entry to the log. All header information will be written diff --git a/util/env_test.cc b/util/env_test.cc index 351f6358a..9e484c77f 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -735,6 +735,7 @@ TEST(EnvPosixTest, PosixRandomRWFileTest) { class TestLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { log_count++; @@ -808,6 +809,7 @@ TEST(EnvPosixTest, LogBufferTest) { class TestLogger2 : public Logger { public: explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { char new_format[2000]; std::fill_n(new_format, sizeof(new_format), '2'); diff --git a/util/mock_env.cc b/util/mock_env.cc index bcfc611b5..2b357cefe 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -322,6 +322,8 @@ class TestMemLogger : public Logger { } last_flush_micros_ = env_->NowMicros(); } + + using Logger::Logv; virtual void Logv(const char* format, va_list ap) { // We try twice: the first time with a fixed-size stack allocated buffer, // and the second time with a much larger dynamically allocated buffer. 
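The `using Logger::Logv;` declarations added throughout this patch are needed because, once the level-taking `Logv(const InfoLogLevel, const char*, va_list)` overload becomes virtual, a subclass that overrides only `Logv(const char*, va_list)` would hide the inherited overload under C++ name lookup. A minimal sketch of the pattern follows (not part of the patch; the class name and the stderr target are illustrative only, modeled on the test loggers in this diff):

  #include <cstdarg>
  #include <cstdio>
  #include "rocksdb/env.h"

  class ExampleStderrLogger : public rocksdb::Logger {
   public:
    // Re-expose the inherited Logv(InfoLogLevel, const char*, va_list) so the
    // override below does not hide it from callers using the base interface.
    using rocksdb::Logger::Logv;

    virtual void Logv(const char* format, va_list ap) override {
      std::vfprintf(stderr, format, ap);
      std::fprintf(stderr, "\n");
    }
  };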
diff --git a/util/options_test.cc b/util/options_test.cc index 5ddfac27c..0c1e3ce21 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -36,6 +36,7 @@ class OptionsTest {}; class StderrLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { vprintf(format, ap); printf("\n"); diff --git a/util/posix_logger.h b/util/posix_logger.h index e4a2c8456..6faa844ba 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -58,6 +58,8 @@ class PosixLogger : public Logger { } last_flush_micros_ = env_->NowMicros(); } + + using Logger::Logv; virtual void Logv(const char* format, va_list ap) { const uint64_t thread_id = (*gettid_)(); From 91ac3b2067804920b3444aab7d2535cdb825f50a Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 9 Feb 2015 12:03:45 -0800 Subject: [PATCH 826/829] Print DB pointer when opening a DB Summary: Having a pointer for DB will be helpful to debug when GDB or working on a dump. If the client process doesn't have any thread actively working on RocksDB, it can be hard to find out. Test Plan: make all check Reviewers: rven, yhchiang, igor Reviewed By: igor Subscribers: yoshinorim, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D33159 --- db/db_impl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index be1f7037f..d3f445926 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3853,6 +3853,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { impl->opened_successfully_ = true; + Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p", + impl); *dbptr = impl; } else { for (auto* h : *handles) { From 1851f977c2b5a7579892de8ec3edbae7236305d0 Mon Sep 17 00:00:00 2001 From: Grace Law Date: Mon, 9 Feb 2015 14:53:58 -0800 Subject: [PATCH 827/829] Added RocksDB stats GET_HIT_L0 and GET_HIT_L1 Summary: - In statistics.h , added tickers. - In version_set.cc, -- Added a getter method for hit_file_level_ in the class FilePicker -- Added a line in the Get() method in case of a found, increment the corresponding counters based on the level of the file respectively. 
Corresponding task: https://our.intern.facebook.com/intern/tasks/?s=506100481&t=5952818 Personal fork: https://github.com/sycamlaw43/rocksdb/commit/0c3f2e3600a1e0faad63249c45f3951fd0430b30 Test Plan: In terminal, ``` make -j32 db_test ROCKSDB_TESTS=L0L1L2AndUpHitCounter ./db_test ``` Or to use debugger, ``` make -j32 db_test export ROCKSDB_TESTS=L0L1L2AndUpHitCounter gdb db_test ``` Reviewers: rven, sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D32205 --- db/db_test.cc | 38 ++++++++++++++++++++++++++++++++++++ db/version_set.cc | 14 +++++++++++++ include/rocksdb/statistics.h | 13 +++++++++++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/db/db_test.cc b/db/db_test.cc index 715d63970..47227396b 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -10330,6 +10330,44 @@ TEST(DBTest, DeleteMovedFileAfterCompaction) { } } +TEST(DBTest, L0L1L2AndUpHitCounter) { + Options options = CurrentOptions(); + options.write_buffer_size = 32 * 1024; + options.target_file_size_base = 32 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 64 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 20000; + for (int i = 0; i < numkeys; i++) { + ASSERT_OK(Put(1, Key(i), "val")); + } + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < numkeys; i++) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + + ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); +} + TEST(DBTest, EncodeDecompressedBlockSizeTest) { // iter 0 -- zlib // iter 1 -- bzip2 diff --git a/db/version_set.cc b/db/version_set.cc index 6ec6f1d9e..e3ec67a8f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -91,6 +91,7 @@ class FilePicker { const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), curr_level_(-1), + hit_file_level_(-1), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), #ifndef NDEBUG @@ -120,6 +121,7 @@ class FilePicker { while (curr_index_in_curr_level_ < curr_file_level_->num_files) { // Loops over all files in current level. 
FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; + hit_file_level_ = curr_level_; int cmp_largest = -1; // Do key range filtering of files or/and fractional cascading if: @@ -199,9 +201,14 @@ class FilePicker { return nullptr; } + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + private: unsigned int num_levels_; unsigned int curr_level_; + unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; #ifndef NDEBUG @@ -800,6 +807,13 @@ void Version::Get(const ReadOptions& read_options, // Keep searching in other files break; case GetContext::kFound: + if (fp.GetHitFileLevel() == 0) { + RecordTick(db_statistics_, GET_HIT_L0); + } else if (fp.GetHitFileLevel() == 1) { + RecordTick(db_statistics_, GET_HIT_L1); + } else if (fp.GetHitFileLevel() >= 2) { + RecordTick(db_statistics_, GET_HIT_L2_AND_UP); + } return; case GetContext::kDeleted: // Use empty error message for speed diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 4b28fd0d9..c5b364a0c 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -53,6 +53,13 @@ enum Tickers : uint32_t { // # of memtable misses. MEMTABLE_MISS, + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + /** * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction * There are 3 reasons currently. @@ -150,6 +157,9 @@ const std::vector> TickersNameMap = { {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, {MEMTABLE_HIT, "rocksdb.memtable.hit"}, {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {GET_HIT_L0, "rocksdb.l0.hit"}, + {GET_HIT_L1, "rocksdb.l1.hit"}, + {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, @@ -194,7 +204,8 @@ const std::vector> TickersNameMap = { {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, }; + {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, +}; /** * Keep adding histogram's here. From 863009b5a594fd4ecd7ed38ba1540f8cbc15011e Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 9 Feb 2015 17:38:32 -0800 Subject: [PATCH 828/829] Fix deleting obsolete files #2 Summary: For description of the bug, see comment in db_test. The fix is pretty straight forward. Test Plan: added unit test. eventually we need better testing of FOF/POF process. 
Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D33081 --- db/db_impl.cc | 20 ++++++----- db/db_test.cc | 90 +++++++++++++++++++++++++++++++++++++++++++++++ db/version_set.cc | 14 ++++++-- db/version_set.h | 3 +- 4 files changed, 114 insertions(+), 13 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index d3f445926..570928b1e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -464,8 +464,18 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } } + // don't delete files that might be currently written to from compaction + // threads + if (!pending_outputs_.empty()) { + job_context->min_pending_output = *pending_outputs_.begin(); + } else { + // delete all of them + job_context->min_pending_output = std::numeric_limits::max(); + } + // get obsolete files - versions_->GetObsoleteFiles(&job_context->sst_delete_files); + versions_->GetObsoleteFiles(&job_context->sst_delete_files, + job_context->min_pending_output); // store the current filenum, lognum, etc job_context->manifest_file_number = versions_->manifest_file_number(); @@ -474,14 +484,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->log_number = versions_->MinLogNumber(); job_context->prev_log_number = versions_->prev_log_number(); - // don't delete live files - if (pending_outputs_.size()) { - job_context->min_pending_output = *pending_outputs_.begin(); - } else { - // delete all of them - job_context->min_pending_output = std::numeric_limits::max(); - } - versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { for (uint32_t path_id = 0; path_id < db_options_.db_paths.size(); diff --git a/db/db_test.cc b/db/db_test.cc index 47227396b..b4e0a46d0 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -22,6 +22,7 @@ #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -10425,6 +10426,95 @@ TEST(DBTest, MutexWaitStats) { ThreadStatus::STATE_MUTEX_WAIT, 0); } +// This reproduces a bug where we don't delete a file because when it was +// supposed to be deleted, it was blocked by pending_outputs +// Consider: +// 1. current file_number is 13 +// 2. compaction (1) starts, blocks deletion of all files starting with 13 +// (pending outputs) +// 3. file 13 is created by compaction (2) +// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file +// 13 has no references, it is put into VersionSet::obsolete_files_ +// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13 +// is deleted from obsolete_files_ set. +// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by +// pending outputs since compaction (1) is still running. It is not deleted and +// it is not present in obsolete_files_ anymore. Therefore, we never delete it. 
+TEST(DBTest, DeleteObsoleteFilesPendingOutputs) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2 MB + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + options.max_background_flushes = 2; + options.max_background_compactions = 2; + Reopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + SleepingBackgroundTask blocking_thread; + port::Mutex mutex_; + bool already_blocked(false); + + // block the flush + std::function block_first_time = [&]() { + bool blocking = false; + { + MutexLock l(&mutex_); + if (!already_blocked) { + blocking = true; + already_blocked = true; + } + } + if (blocking) { + blocking_thread.DoSleep(); + } + }; + env_->table_write_callback_ = &block_first_time; + // Create 1MB sst file + for (int j = 0; j < 256; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + } + // this should trigger a flush, which is blocked with block_first_time + // pending_file is protecting all the files created after + + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto file_on_L2 = metadata[0].name; + + ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + + // finish the flush! 
+ blocking_thread.WakeUp(); + blocking_thread.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 2U); + + // This file should have been deleted + ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + file_on_L2)); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index e3ec67a8f..09f45b7dc 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2772,9 +2772,17 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { } } -void VersionSet::GetObsoleteFiles(std::vector* files) { - files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end()); - obsolete_files_.clear(); +void VersionSet::GetObsoleteFiles(std::vector* files, + uint64_t min_pending_output) { + std::vector pending_files; + for (auto f : obsolete_files_) { + if (f->fd.GetNumber() < min_pending_output) { + files->push_back(f); + } else { + pending_files.push_back(f); + } + } + obsolete_files_.swap(pending_files); } ColumnFamilyData* VersionSet::CreateColumnFamily( diff --git a/db/version_set.h b/db/version_set.h index ca79aff4e..b00c9ce2b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -590,7 +590,8 @@ class VersionSet { void GetLiveFilesMetaData(std::vector *metadata); - void GetObsoleteFiles(std::vector* files); + void GetObsoleteFiles(std::vector* files, + uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } const EnvOptions& env_options() { return env_options_; } From 6d6305dd7dec072ceab025b14c4a60a1d58e61c6 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 9 Feb 2015 16:12:31 -0800 Subject: [PATCH 829/829] Perf Context to report DB mutex waiting time Summary: Add counters in perf context to allow users to figure out how time spent on waiting for DB mutex Test Plan: Add a test and run it. 
Reviewers: yhchiang, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D33177 --- db/perf_context_test.cc | 24 ++++++++++++++++++++++-- include/rocksdb/perf_context.h | 3 +++ util/instrumented_mutex.cc | 4 ++++ util/perf_context.cc | 6 +++++- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 81e3eb156..be35fd6d9 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -14,6 +14,7 @@ #include "util/histogram.h" #include "util/stop_watch.h" #include "util/testharness.h" +#include "util/thread_status_util.h" #include "util/string_util.h" @@ -210,6 +211,8 @@ void ProfileQueries(bool enabled_time = false) { HistogramImpl hist_write_wal_time; HistogramImpl hist_write_memtable_time; + uint64_t total_db_mutex_nanos = 0; + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; std::vector keys; @@ -225,13 +228,17 @@ void ProfileQueries(bool enabled_time = false) { if (FLAGS_random_key) { std::random_shuffle(keys.begin(), keys.end()); } - +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U); +#endif + int num_mutex_waited = 0; for (const int i : keys) { if (i == kFlushFlag) { FlushOptions fo; db->Flush(fo); continue; } + std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); @@ -239,11 +246,20 @@ void ProfileQueries(bool enabled_time = false) { perf_context.Reset(); db->Put(write_options, key, value); + if (++num_mutex_waited > 3) { +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); +#endif + } hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); hist_write_wal_time.Add(perf_context.write_wal_time); hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); + total_db_mutex_nanos += perf_context.db_mutex_lock_nanos; } +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); +#endif for (const int i : keys) { std::string key = "k" + ToString(i); @@ -279,7 +295,8 @@ void ProfileQueries(bool enabled_time = false) { << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" << " Writing Mem Table time: \n" - << hist_write_memtable_time.ToString() << "\n"; + << hist_write_memtable_time.ToString() << "\n" + << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n"; std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() << " Time to get value from memtables: \n" @@ -316,6 +333,9 @@ void ProfileQueries(bool enabled_time = false) { ASSERT_GT(hist_mget_files.Average(), 0); ASSERT_GT(hist_mget_post_process.Average(), 0); ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); +#ifndef NDEBUG + ASSERT_GT(total_db_mutex_nanos, 2000U); +#endif } db.reset(); diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index e96d09d2a..18c186a95 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -65,6 +65,9 @@ struct PerfContext { uint64_t write_wal_time; // total time spent on writing to WAL // total time spent on writing to mem tables uint64_t write_memtable_time; + uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex. + // Time spent on waiting with a condition variable created with DB mutex. 
+ uint64_t db_condition_wait_nanos; }; #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) diff --git a/util/instrumented_mutex.cc b/util/instrumented_mutex.cc index 05d19b2ae..2e240cc82 100644 --- a/util/instrumented_mutex.cc +++ b/util/instrumented_mutex.cc @@ -3,11 +3,13 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include "util/perf_context_imp.h" #include "util/instrumented_mutex.h" #include "util/thread_status_util.h" namespace rocksdb { void InstrumentedMutex::Lock() { + PERF_TIMER_GUARD(db_mutex_lock_nanos); uint64_t wait_time_micros = 0; if (env_ != nullptr && stats_ != nullptr) { { @@ -28,6 +30,7 @@ void InstrumentedMutex::LockInternal() { } void InstrumentedCondVar::Wait() { + PERF_TIMER_GUARD(db_condition_wait_nanos); uint64_t wait_time_micros = 0; if (env_ != nullptr && stats_ != nullptr) { { @@ -48,6 +51,7 @@ void InstrumentedCondVar::WaitInternal() { } bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { + PERF_TIMER_GUARD(db_condition_wait_nanos); uint64_t wait_time_micros = 0; bool result = false; if (env_ != nullptr && stats_ != nullptr) { diff --git a/util/perf_context.cc b/util/perf_context.cc index 5443471d5..e89856513 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -51,6 +51,8 @@ void PerfContext::Reset() { find_next_user_entry_time = 0; write_pre_and_post_process_time = 0; write_memtable_time = 0; + db_mutex_lock_nanos = 0; + db_condition_wait_nanos = 0; #endif } @@ -82,7 +84,9 @@ std::string PerfContext::ToString() const { << OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time) << OUTPUT(write_pre_and_post_process_time) - << OUTPUT(write_memtable_time); + << OUTPUT(write_memtable_time) + << OUTPUT(db_mutex_lock_nanos) + << OUTPUT(db_condition_wait_nanos); return ss.str(); #endif }
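Taken together with the instrumented mutex changes above, an application can observe DB mutex wait time through the thread-local perf context, the same way perf_context_test.cc does. A minimal usage sketch (illustrative only; it assumes an already-open rocksdb::DB* and that the perf level has been raised, e.g. via SetPerfLevel(kEnableTime), so that the timers are actually populated):

  #include <iostream>
  #include "rocksdb/db.h"
  #include "rocksdb/perf_context.h"

  void ReportMutexWait(rocksdb::DB* db) {
    rocksdb::perf_context.Reset();  // clear the thread-local counters
    db->Put(rocksdb::WriteOptions(), "key", "value");
    std::cout << "db_mutex_lock_nanos: "
              << rocksdb::perf_context.db_mutex_lock_nanos << "\n"
              << "db_condition_wait_nanos: "
              << rocksdb::perf_context.db_condition_wait_nanos << "\n";
    // Or dump every collected counter at once:
    std::cout << rocksdb::perf_context.ToString() << std::endl;
  }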